diff --git a/dev_space/Skill_Extraction_ST_FAISS_CAG.ipynb b/dev_space/Skill_Extraction_ST_FAISS_CAG.ipynb new file mode 100644 index 0000000..2ef01f3 --- /dev/null +++ b/dev_space/Skill_Extraction_ST_FAISS_CAG.ipynb @@ -0,0 +1,412 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "uA8gi75Zz2RD" + }, + "source": [ + "# Skill Extraction with CAG\n", + "**Objective:** Extract relevant skills from text by first retrieving a candidate set using semantic search (MPNet embeddings + FAISS) and then using an LLM (Gemma via VLLM) to refine selections from these candidates.\n", + "- **Date:** 9-June-25\n", + "- **Author:** Anket Patil\n", + "- **Colab** https://colab.research.google.com/drive/1Xu8hAheSLEotEvjliysLgDE7VEhz4qYs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TKA1yFi20thu" + }, + "source": [ + "## 1. Install Dependencies\n", + "Installs required libraries:\n", + "\n", + "* `faiss-cpu`: For similarity search.\n", + "* `sentence-transformers`: Create embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "L--xh6vhzVZd", + "outputId": "30c31442-3059-49e9-c5ad-d88b0c56a5ce" + }, + "outputs": [], + "source": [ + "!pip install -q sentence-transformers faiss-cpu" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wU_EncDX07hj" + }, + "source": [ + "## 2. Import Libraries\n", + "Imports essential Python libraries for data handling, numerical operations, model interaction, and FAISS." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2S7TEBNxuJ3y" + }, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sentence_transformers import SentenceTransformer, util\n", + "import faiss\n", + "import time\n", + "import random\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zAtxJ2431Had" + }, + "source": [ + "## 3. Create FAISS Index for Semantic Search\n", + "Loads ESCO skills, turns them into vectors using a all-mpnet-base-v2, and builds a FAISS index so we can quickly find the most relevant skills for any course or job description." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 546, + "referenced_widgets": [ + "c596dd765f8442c9a0ff3c4dfdc34ea6", + "f8e52cb74f504fd98756a59242a68d01", + "3bcb2a02884c4657bdf64c224f43067d", + "5cc580c5bb4c4c71821625ccaf54a672", + "323c99bb03e4466096b8335ac80b2082", + "8b413a6eb0874a86849ba8a8dc8b567b", + "a90ff854e88846efa6978368aae70449", + "0ec53f8d5628442b8447604c2f133d97", + "f9a0462c00284820ab6bd68889fb1727", + "0fd6846212a149fcb984922da185e237", + "9313c30bbb12483796d50f9fa4ab72ca", + "0c4d56e18b9d40f99c0fa75a59a6d5f0", + "76e75a9bee2846caa176b6f2b4bf9dbe", + "44982d23d5b3445081ee4e5a7a7cf2e4", + "32718a0fb5e741ae9d1871172cfc5b26", + "5f62199e798b4eb69f04aa6b44a68804", + "8dc56c33205f405cbce40f9bda616fd8", + "048ee1900e8846c9ab3681f3d7c963c5", + "04a06c4e517c4df19246d4663f0f0497", + "7bfca87ce1a341d0ba35000e037f5584", + "43d02cb906284b8181564e30eea9e7b9", + "dd93e77e8c2a45f0b1419acc699b1b9c", + "02ddc4d9868b4ca9bf342fc1622000e2", + "653b5dd6d2d94e4cb920755d30118c42", + "fd2fec2bd2374a94987247f8464853c6", + "41de1d39a5c1419a8fe6aa27952c1ac2", + "4166d5b9a3dc41adba4d72dbc8b0e8f0", + "0ea37819d54347c8ad550125f8bc3e92", + "e4c172dabf3048a88c5db70e23741c75", + "8f2d0acba3714614b4b7595f3ab9a581", + "4b39b94fa1544e66b12394a39f69dc9f", + "806ad4c4020b4dc9b8c278df600c11b3", + "2f94be65159b411b909beefe4800db9a", + "91c31660bfa14b8c8e0722e61a7d4e3f", + "386ae6b7d3dc466f9c843b4706c69c7d", + "2e546d64b6674388aec07170c1a39e35", + "32959ea1660e4d51b48e0d818830ad75", + "c2b9bd6427824f11aa169cc640979c23", + "fd1f339662ea44a799b01ae09fdce3da", + "6119fdd7436a4a9d8ab593c730b24a88", + "837f6c48f3d54d3fbd2a0fa9071b39b5", + "d07c5819d55848bb89cc27152a705c27", + "52c03907d91140d1a30d292a0e8b4d2a", + "24dc16c6ad88439b9d23c8fdde5c3e3d", + "fed4fe863475491e81f7dbc50da6bfa9", + "7d5eff4a9c6b4fc88e278cdaea33b1f9", + "6d3bb3e846f44d9789b965f2c762f7fb", + "0bea2a0a235f4e90b532e35a8d2b4da4", + "5127d401bdf44e9c95d5fbc1a80d197c", + "2f2026ec941b4b30aedd15b073673356", + "2cacf3c29a9749469b9d843a398e8fc8", + "61c42b7f46e74a148a243ee5e7f78f4b", + "cd6a570e6a034f45bb4a707d2a09d0b2", + "b73bbe5803384c3381fe514163236d0a", + "5c33ce12f61a41e289255a4fc8873055", + "eeaf332884cd47f3b3fa42a99e5c0e3b", + "e3e9e8477508411baaf3f981780b0a44", + "e9938f130fe844a9be3c5e897604d537", + "f3ee631242784ee4aa0ebac67ef66566", + "7e2752d6a21c45519785b667fe9ea1b3", + "7d8cf97b6e7241f69657d6c670b7f23a", + "24f8e2091b724f0fb849f59da11a7500", + "1582721f67994b5b9b6ae99ad2075aa8", + "c0e057e1e4fb46c690f2094de6dd9a8e", + "c5a84485402d425f94278b500acc95b8", + "3e3b1183e352411f970c5cf21ecb9e75", + "f87ffeec194a4ab7ad81b2166e70f975", + "5e67912a186547039804e0cae999c09e", + "306e579dc7804545b0e4b3680dc47cfd", + "0a3a322a09324f46bb513c358298640c", + "8ece429324554563b21f6d1a6b45c227", + "b5a1f8879d30462a9f10cdebdfe217af", + "8768e251835941e687cc2c2ab7d9369f", + "51463c3d219c44e393e11bd28dc795be", + "a51b1ebc07074f7197ef42fbade5eb6a", + "4c8285cf2abf4d9ba39435fb5481a45a", + "5bb111ffea8341639008feecf3229cde", + "1bd9809ae7c94367921dbee8af74995f", + "51c7a0f94ee54e92b3e9d640c016ef6a", + "685816768a8e41cca30a248e9a5bde4b", + "6c4068a802c04380b855e023c8de530c", + "93fe25bd73524af0af13bf5803ca4795", + "34da87ae4c3f491991f852650fb3a14c", + "7680f2acfe304a948893c79b7007d40a", + "d3bd2b9fbd5542f5824d7368affcea0e", + "df45e1a0ba4347c9b8ca1f8a9de03c60", + "6dbf52e757444326968f446d587aa666", + "c1b9bcc973da40229a93f1eef4f0276c", + "160a2a23b35843c2942a2c2dd576cac8", + "fc4886776eec46fea6a42c84ed7a0e88", + "33bb1952873a43818fdcaffa0890334a", + "33cf6dc7bbc841bfa82a35dcc2107947", + "c2a878feab8549db964a289ea3decf41", + "20470302ceea4760b75b63e84171512c", + "2861963f83a04b8fad1790a61cacfca6", + "2df4905367314eadb75d761f5d22e3ff", + "cab6f1cddeae4ee3bc976f04857a40ba", + "7afd84afb384450ab195ed6431d6703c", + "852f59af196a4a10b53b5f216c6ac3ff", + "7119b315c5de48bb81bda73038c77502", + "e1541194643c4933961d01e719f91531", + "144f3125a79442d7816119591c90618b", + "3125ba1ab36b42b99802657608c2fac0", + "b9d18420fc1a43a9acb6710de33a7925", + "733ff40879084f1cb7441f07413f3543", + "acb094d7981341319f9a0c6de4893f29", + "3f72f7b2178e42a6b970cee07ba03575", + "49e56ece8a7d48bd96a393554d7baf2f", + "feb1648bd09c4e3eb5eaf3dcc8cece69", + "e6d982002f5248d696f8783734f0805f", + "0aac073ee78a4b2e8de44123f8078d05", + "2cba2909cc2f4346a841a28c28435e11", + "33381a7a977942dfa519917c50f9cdc7", + "2eb92535eebe4f8287097aa5f363c83b", + "48b2f29ad107491d90f3f17ddd6a3912", + "ed90a96adcd94450844e9cbadf6ace79", + "b714f615e9034f8696924876bef88864", + "b9292826865540a9920ce0faab025d7e", + "cd62a9b872e74e16acdfc33c039f5537", + "91acfd596e644f6dbcda3c2ac9f98d4d", + "ab3dc42087b94d06bc26b73447227bd9", + "25319d01824648ecb453164d009fc55b", + "49eb00f5c7c94bd5a5d4318e65424aab", + "be1dbb86cccd4762b6cc313468004ef0", + "955b7d1b57ae4155bb50cefff8001272", + "683b8503d30a412c90a58a3f54ca6b04", + "1ef653fa4b4a424284f6581d7d781e7a", + "2ff7108dff384236ad8711b9bce616e5", + "1157b041ed7f4316ab06a4290ffa38f6", + "bd81c5e0ed4c49aa81dcfd8a6b8386b7", + "dd7915df60064835b79ad63947604242", + "e33e46b0e52d47dd88f8a5908faa8f39" + ] + }, + "id": "KAieUOU_uPcL", + "outputId": "f8b0ef75-8116-4b93-92c9-0631c5b05c7a" + }, + "outputs": [], + "source": [ + "# Load ESCO skill data\n", + "esco_df = pd.read_csv(\"https://raw.githubusercontent.com/LAiSER-Software/datasets/refs/heads/master/taxonomies/ESCO_skills_Taxonomy.csv\") # replace with your file if needed\n", + "skill_names = esco_df[\"preferredLabel\"].tolist()\n", + "\n", + "# Embed ESCO skills using SentenceTransformer\n", + "model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')\n", + "print(\"Embedding ESCO skills...\")\n", + "esco_embeddings = model.encode(skill_names, convert_to_numpy=True, show_progress_bar=True)\n", + "\n", + "# ⚡ Normalize & Index using FAISS (cosine sim = L2 norm + dot product)\n", + "dimension = esco_embeddings.shape[1]\n", + "index = faiss.IndexFlatIP(dimension)\n", + "faiss.normalize_L2(esco_embeddings)\n", + "index.add(esco_embeddings)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJ5e4mBH1qPc" + }, + "source": [ + "## 4. Find Most Relevant ESCO Skills\n", + "Takes a course description, turns it into a vector using the embedding model, and searches the FAISS index to return the top 50 most relevant ESCO skills based on similarity." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UPlZdOyVuWoE" + }, + "outputs": [], + "source": [ + "def get_top_esco_skills(course_desc, top_k=50):\n", + " emb = model.encode(course_desc, convert_to_numpy=True)\n", + " faiss.normalize_L2(emb.reshape(1, -1))\n", + " scores, indices = index.search(emb.reshape(1, -1), top_k)\n", + " return [skill_names[i] for i in indices[0]]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fzhymtfx2A3j" + }, + "source": [ + "## 5. Load Syllabi Dataset\n", + "Loads a preprocessed dataset of 50 course syllabi from OpenSyllabus" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "QXPLnv0zvxqa", + "outputId": "10608262-6006-4f69-89ef-4c3de3d721e4" + }, + "outputs": [], + "source": [ + "syllabi_data = pd.read_csv(\"https://raw.githubusercontent.com/LAiSER-Software/datasets/refs/heads/master/syllabi-data/preprocessed_50_opensyllabus_syllabi_data.csv\")\n", + "syllabi_data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "meJ6B7r34L5o" + }, + "source": [ + "## Test Skill Extraction on a Single Course Description\n", + "This cell randomly selects one course from the dataset, extracts its top 50 relevant ESCO skills using the semantic search pipeline, and displays the time taken for the operation in milliseconds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4ytlK-UZ4Xik", + "outputId": "5414d6e6-56dd-4b51-f4fe-f3b6291bbee8" + }, + "outputs": [], + "source": [ + "\n", + "# Randomly pick an index between 0 and 45\n", + "rand_idx = random.randint(0, 49)\n", + "row = syllabi_data.loc[rand_idx]\n", + "\n", + "course_title = row['title']\n", + "course_desc = row['description']\n", + "\n", + "print(f\"Course Title: {course_title}\")\n", + "print(f\"Course Description:\\n{course_desc}\\n\")\n", + "\n", + "start_time = time.time()\n", + "single_skills = get_top_esco_skills(course_desc)\n", + "end_time = time.time()\n", + "\n", + "time_ms = (end_time - start_time) * 1000\n", + "print(f\"Time taken: {round(time_ms, 2)} ms\")\n", + "print(f\"\\nTop 50 ESCO Skills:\\n{single_skills}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tX1SfOIU2pio" + }, + "source": [ + "## Extract Top 50 ESCO Skills for Each Course\n", + "Loops through each course description in the syllabi dataset, uses the get_top_esco_skills function to find the top 50 relevant ESCO skills\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ig-cBK68yiLU" + }, + "outputs": [], + "source": [ + "def get_top_k_skills_bulk(df, text_col='description', top_k=50):\n", + " top_skills_list = []\n", + " for i, row in df.iterrows():\n", + " course_desc = row[text_col]\n", + " if not isinstance(course_desc, str) or not course_desc.strip():\n", + " top_skills_list.append([])\n", + " continue\n", + "\n", + " skills = get_top_esco_skills(course_desc, top_k=top_k)\n", + " top_skills_list.append(skills)\n", + "\n", + " return top_skills_list\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "VsaFeGjz4Dnb", + "outputId": "207453bb-20a7-4ca7-a741-9e7abe8c10f1" + }, + "outputs": [], + "source": [ + "# ⏱️ Measure total time in milliseconds\n", + "start_time = time.time()\n", + "\n", + "# Bulk processing\n", + "syllabi_data['top_50_esco_skills'] = get_top_k_skills_bulk(syllabi_data)\n", + "\n", + "end_time = time.time()\n", + "total_time_ms = (end_time - start_time) * 1000\n", + "print(f\"Total time taken for all rows: {round(total_time_ms, 2)} ms\")\n", + "syllabi_data[['title','description','top_50_esco_skills']]" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}