Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 21 additions & 7 deletions laiser/data_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,9 @@ def __init__(self, data_access: DataAccessLayer):

# Issue: Do we even need this? Can't this be done in init
# Issue [GFI_OddEven]: Split these into two separate modules: load and build index
def initialize_index(self, force_rebuild: bool = False, debug: bool = False) -> faiss.IndexFlatIP:
def initialize_index(
self, force_rebuild: bool = False, debug: bool = False
) -> faiss.IndexFlatIP:
"""Initialize FAISS index (load or build).

Behavior (minimal & strict):
Expand All @@ -260,7 +262,9 @@ def initialize_index(self, force_rebuild: bool = False, debug: bool = False) ->
## Issue: Embedding (npy) is not accessed. Cosine Calculations might be faster if npy is accessed.
try:
self.index = self.data_access.load_faiss_index(str(local_index_path))
self.metadata = self.data_access.load_skill_metadata(str(local_json_path))
self.metadata = self.data_access.load_skill_metadata(
str(local_json_path)
)
except Exception as e:
if debug:
logger.warning(f"[initialize_index] load attempt failed: {e}")
Expand Down Expand Up @@ -318,7 +322,9 @@ def initialize_index(self, force_rebuild: bool = False, debug: bool = False) ->
"taxonomy",
]:
# find actual column name (case-insensitive)
found = next((col for col in single_df.columns if col.lower() == c), None)
found = next(
(col for col in single_df.columns if col.lower() == c), None
)
if found:
keep_cols.append(found)
single_df = single_df[keep_cols].copy()
Expand Down Expand Up @@ -511,7 +517,9 @@ def _find(cols_substr):

# Persist metadata JSON and FAISS index (best-effort with debug warnings)
try:
self.data_access.save_skill_metadata_json(self.metadata, str(local_json_path))
self.data_access.save_skill_metadata_json(
self.metadata, str(local_json_path)
)
except Exception as e:
if debug:
logger.warning(f"[initialize_index] Failed to write metadata JSON: {e}")
Expand Down Expand Up @@ -554,10 +562,14 @@ def search_similar_skills(
ordered by similarity. If you want a safety limit, pass `max_results` (int).
"""
if self.index is None:
raise FAISSIndexError("FAISS index not initialized. Call initialize_index() first.")
raise FAISSIndexError(
"FAISS index not initialized. Call initialize_index() first."
)

if self.metadata is None:
raise FAISSIndexError("Metadata not initialized. Call initialize_index() first.")
raise FAISSIndexError(
"Metadata not initialized. Call initialize_index() first."
)

# Ensure skill_names come from metadata
if self.skill_names is None:
Expand Down Expand Up @@ -598,7 +610,9 @@ def search_similar_skills(
top_k = max(1, min(int(top_k), ntotal))
scores, indices = self.index.search(q, top_k)
results = []
for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1):
for rank, (score, idx) in enumerate(
zip(scores[0], indices[0]), start=1
):
if idx == -1:
continue
results.append(
Expand Down
17 changes: 13 additions & 4 deletions laiser/llm_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,10 @@

# Provide a fallback function
def llm_router(*args, **kwargs):
raise ImportError("llm_router is not available. Please check your installation.")
raise ImportError(
"llm_router is not available. Please check your installation."
)



torch.cuda.empty_cache()
Expand Down Expand Up @@ -517,7 +520,9 @@ def get_completion_vllm(input_text, text_columns, id_column, input_type, llm, ba
"""

try:
result = vllm_generate(llm, input_text, input_type=input_type, batch_size=batch_size)
result = vllm_generate(
llm, input_text, input_type=input_type, batch_size=batch_size
)
except Exception as e:
print(f"Error in vLLM generation: {e}")
return []
Expand All @@ -537,7 +542,9 @@ def get_completion_vllm(input_text, text_columns, id_column, input_type, llm, ba
parsed_output.extend(parsed)
except Exception as e:
print(f"Error parsing output for index {i}: {e}")
print(f"DataFrame shape: {input_text.shape}, trying to access index {i}")
print(
f"DataFrame shape: {input_text.shape}, trying to access index {i}"
)
print(f"Available indices: {list(input_text.index)}")
continue

Expand Down Expand Up @@ -601,7 +608,9 @@ def get_ksa_details(
raw_text = llm_router(prompt, model_id, use_gpu, llm, tokenizer, model, api_key)
json_match = re.search(r"\{.*\}", raw_text, re.DOTALL)
if not json_match:
print(f"[get_ksa_details] No JSON match found in response for skill '{skill}'")
print(
f"[get_ksa_details] No JSON match found in response for skill '{skill}'"
)
return [], []

parsed = json.loads(json_match.group())
Expand Down
4 changes: 3 additions & 1 deletion laiser/llm_models/hugging_face_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ def llm_generate(prompt: str, tokenizer, model, model_id: str, use_gpu: bool):

def llm_generate_vllm(prompt, llm):
if not VLLM_AVAILABLE:
raise ImportError("vLLM is not installed. Please install it to use this function.")
raise ImportError(
"vLLM is not installed. Please install it to use this function."
)

sampling_params = SamplingParams(max_tokens=200, seed=42)
result = llm.generate([prompt], sampling_params=sampling_params)
Expand Down
8 changes: 6 additions & 2 deletions laiser/llm_models/llama_cpp_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def __init__(

model_path = model_path or os.getenv("LAISER_LLAMA_CPP_MODEL_PATH")
if not model_path:
raise ValueError("Set LAISER_LLAMA_CPP_MODEL_PATH or pass model_path to LlamaCppBackend.")
raise ValueError(
"Set LAISER_LLAMA_CPP_MODEL_PATH or pass model_path to LlamaCppBackend."
)

model_path = str(Path(model_path))
# model_path = str(Path(model_path).expanduser().resolve())
Expand Down Expand Up @@ -107,7 +109,9 @@ def llama_cpp_chat(
) -> str:

if llama is None:
raise ValueError("llama is None; expected an initialized llama_cpp.Llama instance.")
raise ValueError(
"llama is None; expected an initialized llama_cpp.Llama instance."
)

messages = [
{"role": "system", "content": system},
Expand Down
9 changes: 7 additions & 2 deletions laiser/llm_models/llm_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,17 @@ def gemini_generate(*args, **kwargs):
print(f"Warning: HuggingFace LLM support not available: {e}")

def llm_generate_vllm(*args, **kwargs):
raise ImportError("HuggingFace LLM support is not available. Please install required packages.")
raise ImportError(
"HuggingFace LLM support is not available. Please install required packages."
)



class LLMRouter:

def __init__(self, model_id: str, use_gpu: bool, hf_token=None, api_key=None, backend=None):
def __init__(
self, model_id: str, use_gpu: bool, hf_token=None, api_key=None, backend=None
):
self.model_id = model_id
self.use_gpu = use_gpu
self.hf_token = hf_token
Expand Down
8 changes: 6 additions & 2 deletions laiser/llm_models/model_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,9 @@ def load_model_from_transformer(model_id: str = None, token: str = ""):
def load_model_from_vllm(model_id: str = None, token: str = None, dtype: str = None, quantization: str = None):

if not VLLM_AVAILABLE:
raise ImportError("vLLM is not installed. Cannot load model using vLLM backend.")
raise ImportError(
"vLLM is not installed. Cannot load model using vLLM backend."
)

model_id = model_id or DEFAULT_VLLM_MODEL_ID
dtype = dtype or "float16"
Expand All @@ -120,7 +122,9 @@ def load_model_from_vllm(model_id: str = None, token: str = None, dtype: str = N
llm = LLM(**llm_args)

quant_info = f" with {quantization} quantization" if quantization else ""
print(f"[INFO] Successfully loaded vLLM model: {model_id} with dtype: {dtype}{quant_info}")
print(
f"[INFO] Successfully loaded vLLM model: {model_id} with dtype: {dtype}{quant_info}"
)
except Exception as e:
print(f"[WARN] Failed to load model '{model_id}' with dtype '{dtype}': {e}")
print(f"[INFO] Falling back to default model: {DEFAULT_VLLM_MODEL_ID}")
Expand Down
42 changes: 32 additions & 10 deletions laiser/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ class PromptBuilder:
def build_skill_extraction_prompt(input_text: str, input_type: str) -> str:
"""Build prompt for basic skill extraction"""
if input_type == "job_desc":
extraction_prompt = SKILL_EXTRACTION_PROMPT_JOB.format(description=input_text)
extraction_prompt = SKILL_EXTRACTION_PROMPT_JOB.format(
description=input_text
)
return extraction_prompt
elif input_type == "syllabus":
return SKILL_EXTRACTION_PROMPT_SYLLABUS.format(
Expand Down Expand Up @@ -83,7 +85,9 @@ def build_ksa_extraction_prompt(
)

@staticmethod
def build_ksa_details_prompt(skill: str, description: str, num_key_kr: int = 3, num_key_tas: int = 3) -> str:
def build_ksa_details_prompt(
skill: str, description: str, num_key_kr: int = 3, num_key_tas: int = 3
) -> str:
"""Build prompt for getting detailed KSA information for a specific skill"""
return KSA_DETAILS_PROMPT.format(
skill=skill,
Expand Down Expand Up @@ -226,7 +230,9 @@ def parse_ksa_extraction_response(response: str) -> List[Dict[str, Any]]:
skill_data["Knowledge Required"] = [k.strip() for k in knowledge_raw.split(",") if k.strip()]

# Extract task abilities (multi-line support)
task_match = re.search(r"Task Abilities:\s*(.*?)(?=\s*$)", item, re.DOTALL)
task_match = re.search(
r"Task Abilities:\s*(.*?)(?=\s*$)", item, re.DOTALL
)
if task_match:
task_raw = task_match.group(1).strip()
skill_data["Task Abilities"] = [t.strip() for t in task_raw.split(",") if t.strip()]
Expand Down Expand Up @@ -293,19 +299,24 @@ def align_skills_to_taxonomy(
raw_skills_matched = []
taxonomy_descriptions = []
taxonomy_sources = []
source_urls = []
correlations = []

def log_debug(msg: str):
if debug:
logger.debug(msg)

log_debug(f"[align] raw_skills={len(raw_skills)} threshold={similarity_threshold} top_k={top_k}")
log_debug(
f"[align] raw_skills={len(raw_skills)} threshold={similarity_threshold} top_k={top_k}"
)

model = self.data_access.get_embedding_model()

# metadata loaded once
metadata = self.faiss_manager.get_metadata()
log_debug(f"[align] metadata type={type(metadata).__name__} len={len(metadata)}")
log_debug(
f"[align] metadata type={type(metadata).__name__} len={len(metadata)}"
)
if isinstance(metadata, pd.DataFrame) and not metadata.empty:
log_debug(f"[align] metadata columns={list(metadata.columns)}")

Expand All @@ -330,7 +341,9 @@ def log_debug(msg: str):
meta_idx = best.get("Index")
canonical_skill = str(best.get("Skill", "")).strip()

log_debug(f"[skill {i}] best='{canonical_skill}' sim={similarity:.4f} meta_idx={meta_idx}")
log_debug(
f"[skill {i}] best='{canonical_skill}' sim={similarity:.4f} meta_idx={meta_idx}"
)

if similarity < similarity_threshold:
log_debug(f"[skill {i}] below threshold -> skip")
Expand All @@ -340,9 +353,13 @@ def log_debug(msg: str):
log_debug(f"[skill {i}] empty canonical_skill -> skip")
continue
if meta_idx is None:
log_debug(f"[skill {i}] meta_idx is None (search_similar_skills may not return Index)")
log_debug(
f"[skill {i}] meta_idx is None (search_similar_skills may not return Index)"
)
elif int(meta_idx) >= len(metadata) or int(meta_idx) < 0:
log_debug(f"[skill {i}] meta_idx out of range: {meta_idx} (metadata len={len(metadata)})")
log_debug(
f"[skill {i}] meta_idx out of range: {meta_idx} (metadata len={len(metadata)})"
)
else:
# ✅ DataFrame row by position
meta = metadata.iloc[int(meta_idx)].to_dict()
Expand All @@ -358,6 +375,7 @@ def log_debug(msg: str):
raw_skills_matched.append(skill)
taxonomy_descriptions.append(taxonomy_description)
taxonomy_sources.append(taxonomy_source)
source_urls.append(meta.get("source_url", ""))
correlations.append(similarity)

log_debug(f"[align] matched={len(mapped_skills)} of {len(raw_skills)}")
Expand Down Expand Up @@ -426,7 +444,9 @@ def __init__(
self.nlp = None
self.data_access = DataAccessLayer()
self.faiss_manager = FAISSIndexManager(self.data_access)
self.alignment_service = SkillAlignmentService(self.data_access, self.faiss_manager)
self.alignment_service = SkillAlignmentService(
self.data_access, self.faiss_manager
)
self.prompt_builder = PromptBuilder()
self.llm_parser = ResponseParser()
self.response_parser = ResponseParser()
Expand Down Expand Up @@ -599,7 +619,9 @@ def align_extracted_skills(
)

if not isinstance(raw_skills, list):
print(f"Warning: raw_skills is not a list, converting from {type(raw_skills)}")
print(
f"Warning: raw_skills is not a list, converting from {type(raw_skills)}"
)
raw_skills = [str(raw_skills)] if raw_skills else []

return self.alignment_service.align_skills_to_taxonomy(
Expand Down
14 changes: 11 additions & 3 deletions laiser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ def build_faiss_index_esco():
# Embed ESCO skills using SentenceTransformer
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
print("Embedding ESCO skills...")
esco_embeddings = model.encode(skill_names, convert_to_numpy=True, show_progress_bar=True)
esco_embeddings = model.encode(
skill_names, convert_to_numpy=True, show_progress_bar=True
)

# ⚡ Normalize & Index using FAISS (cosine sim = L2 norm + dot product)
dimension = esco_embeddings.shape[1]
Expand Down Expand Up @@ -121,7 +123,9 @@ def load_faiss_index_esco():

index_path = os.path.join(os.path.dirname(__file__), "input/esco_faiss_index.index")
if not os.path.exists(index_path):
raise FileNotFoundError(f"FAISS index file not found at {index_path}. Please ensure the file exists.")
raise FileNotFoundError(
f"FAISS index file not found at {index_path}. Please ensure the file exists."
)
index = faiss.read_index(index_path)
print("FAISS index for ESCO skills loaded successfully.")
return index
Expand All @@ -144,7 +148,11 @@ def get_top_esco_skills(input_text, top_k=10):
emb = model.encode(input_text, convert_to_numpy=True)
faiss.normalize_L2(emb.reshape(1, -1))
scores, indices = index.search(emb.reshape(1, -1), top_k)
return [{"Skill": skill_names[i], "index": int(i), "score": float(scores[0][j])} for j, i in enumerate(indices[0])]
return [
{"Skill": skill_names[i], "index": int(i), "score": float(scores[0][j])}
for j, i in enumerate(indices[0])
]



def get_embedding(nlp, input_text):
Expand Down
4 changes: 3 additions & 1 deletion print_system_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
def _safe_run(cmd, timeout: float = 5.0):
"""Run a command safely with a timeout; return (code, stdout, stderr)."""
try:
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
p = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
)
try:
out, err = p.communicate(timeout=timeout)
except subprocess.TimeoutExpired:
Expand Down
4 changes: 3 additions & 1 deletion tests/test_alignment_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,9 @@ def test_align_skills_to_taxonomy_real_flow():

# Minimal, meaningful assertions
assert isinstance(df, pd.DataFrame)
assert set(["Raw Skill", "Taxonomy Skill", "Correlation Coefficient"]).issubset(df.columns)
assert set(["Raw Skill", "Taxonomy Skill", "Correlation Coefficient"]).issubset(
df.columns
)

# For these common skills, we usually expect at least one match if index is healthy.
# (We don't assert exact mapping because taxonomies may change.)
Expand Down
2 changes: 2 additions & 0 deletions tests/test_anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import pytest

import pytest

# ✅ import from wherever you saved that function
# Example:
# from laiser.llm_models.openai_helper import openai_generate
Expand Down
Loading
Loading