diff --git a/laiser/data_access.py b/laiser/data_access.py index 4348328..9cd1853 100644 --- a/laiser/data_access.py +++ b/laiser/data_access.py @@ -238,7 +238,9 @@ def __init__(self, data_access: DataAccessLayer): # Issue: Do we even need this? Can't this be done in init # Issue [GFI_OddEven]: Split these into two seperate modules load and build index - def initialize_index(self, force_rebuild: bool = False, debug: bool = False) -> faiss.IndexFlatIP: + def initialize_index( + self, force_rebuild: bool = False, debug: bool = False + ) -> faiss.IndexFlatIP: """Initialize FAISS index (load or build). Behavior (minimal & strict): @@ -260,7 +262,9 @@ def initialize_index(self, force_rebuild: bool = False, debug: bool = False) -> ## Issue: Embedding (npy) is not accessed. Cosine Calculations might be faster if npy is accessed. try: self.index = self.data_access.load_faiss_index(str(local_index_path)) - self.metadata = self.data_access.load_skill_metadata(str(local_json_path)) + self.metadata = self.data_access.load_skill_metadata( + str(local_json_path) + ) except Exception as e: if debug: logger.warning(f"[initialize_index] load attempt failed: {e}") @@ -318,7 +322,9 @@ def initialize_index(self, force_rebuild: bool = False, debug: bool = False) -> "taxonomy", ]: # find actual column name (case-insensitive) - found = next((col for col in single_df.columns if col.lower() == c), None) + found = next( + (col for col in single_df.columns if col.lower() == c), None + ) if found: keep_cols.append(found) single_df = single_df[keep_cols].copy() @@ -511,7 +517,9 @@ def _find(cols_substr): # Persist metadata JSON and FAISS index (best-effort with debug warnings) try: - self.data_access.save_skill_metadata_json(self.metadata, str(local_json_path)) + self.data_access.save_skill_metadata_json( + self.metadata, str(local_json_path) + ) except Exception as e: if debug: logger.warning(f"[initialize_index] Failed to write metadata JSON: {e}") @@ -554,10 +562,14 @@ def search_similar_skills( ordered by similarity. If you want a safety limit, pass `max_results` (int). """ if self.index is None: - raise FAISSIndexError("FAISS index not initialized. Call initialize_index() first.") + raise FAISSIndexError( + "FAISS index not initialized. Call initialize_index() first." + ) if self.metadata is None: - raise FAISSIndexError("Metadata not initialized. Call initialize_index() first.") + raise FAISSIndexError( + "Metadata not initialized. Call initialize_index() first." + ) # Ensure skill_names come from metadata if self.skill_names is None: @@ -598,7 +610,9 @@ def search_similar_skills( top_k = max(1, min(int(top_k), ntotal)) scores, indices = self.index.search(q, top_k) results = [] - for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1): + for rank, (score, idx) in enumerate( + zip(scores[0], indices[0]), start=1 + ): if idx == -1: continue results.append( diff --git a/laiser/llm_methods.py b/laiser/llm_methods.py index 8835b8a..a2be0a6 100644 --- a/laiser/llm_methods.py +++ b/laiser/llm_methods.py @@ -85,7 +85,10 @@ # Provide a fallback function def llm_router(*args, **kwargs): - raise ImportError("llm_router is not available. Please check your installation.") + raise ImportError( + "llm_router is not available. Please check your installation." + ) + torch.cuda.empty_cache() @@ -517,7 +520,9 @@ def get_completion_vllm(input_text, text_columns, id_column, input_type, llm, ba """ try: - result = vllm_generate(llm, input_text, input_type=input_type, batch_size=batch_size) + result = vllm_generate( + llm, input_text, input_type=input_type, batch_size=batch_size + ) except Exception as e: print(f"Error in vLLM generation: {e}") return [] @@ -537,7 +542,9 @@ def get_completion_vllm(input_text, text_columns, id_column, input_type, llm, ba parsed_output.extend(parsed) except Exception as e: print(f"Error parsing output for index {i}: {e}") - print(f"DataFrame shape: {input_text.shape}, trying to access index {i}") + print( + f"DataFrame shape: {input_text.shape}, trying to access index {i}" + ) print(f"Available indices: {list(input_text.index)}") continue @@ -601,7 +608,9 @@ def get_ksa_details( raw_text = llm_router(prompt, model_id, use_gpu, llm, tokenizer, model, api_key) json_match = re.search(r"\{.*\}", raw_text, re.DOTALL) if not json_match: - print(f"[get_ksa_details] No JSON match found in response for skill '{skill}'") + print( + f"[get_ksa_details] No JSON match found in response for skill '{skill}'" + ) return [], [] parsed = json.loads(json_match.group()) diff --git a/laiser/llm_models/hugging_face_llm.py b/laiser/llm_models/hugging_face_llm.py index b3b03f1..3408460 100644 --- a/laiser/llm_models/hugging_face_llm.py +++ b/laiser/llm_models/hugging_face_llm.py @@ -83,7 +83,9 @@ def llm_generate(prompt: str, tokenizer, model, model_id: str, use_gpu: bool): def llm_generate_vllm(prompt, llm): if not VLLM_AVAILABLE: - raise ImportError("vLLM is not installed. Please install it to use this function.") + raise ImportError( + "vLLM is not installed. Please install it to use this function." + ) sampling_params = SamplingParams(max_tokens=200, seed=42) result = llm.generate([prompt], sampling_params=sampling_params) diff --git a/laiser/llm_models/llama_cpp_handler.py b/laiser/llm_models/llama_cpp_handler.py index 1c32db0..e6172a9 100644 --- a/laiser/llm_models/llama_cpp_handler.py +++ b/laiser/llm_models/llama_cpp_handler.py @@ -35,7 +35,9 @@ def __init__( model_path = model_path or os.getenv("LAISER_LLAMA_CPP_MODEL_PATH") if not model_path: - raise ValueError("Set LAISER_LLAMA_CPP_MODEL_PATH or pass model_path to LlamaCppBackend.") + raise ValueError( + "Set LAISER_LLAMA_CPP_MODEL_PATH or pass model_path to LlamaCppBackend." + ) model_path = str(Path(model_path)) # model_path = str(Path(model_path).expanduser().resolve()) @@ -107,7 +109,9 @@ def llama_cpp_chat( ) -> str: if llama is None: - raise ValueError("llama is None; expected an initialized llama_cpp.Llama instance.") + raise ValueError( + "llama is None; expected an initialized llama_cpp.Llama instance." + ) messages = [ {"role": "system", "content": system}, diff --git a/laiser/llm_models/llm_router.py b/laiser/llm_models/llm_router.py index 0f51bb9..2eb8a21 100644 --- a/laiser/llm_models/llm_router.py +++ b/laiser/llm_models/llm_router.py @@ -83,12 +83,17 @@ def gemini_generate(*args, **kwargs): print(f"Warning: HuggingFace LLM support not available: {e}") def llm_generate_vllm(*args, **kwargs): - raise ImportError("HuggingFace LLM support is not available. Please install required packages.") + raise ImportError( + "HuggingFace LLM support is not available. Please install required packages." + ) + class LLMRouter: - def __init__(self, model_id: str, use_gpu: bool, hf_token=None, api_key=None, backend=None): + def __init__( + self, model_id: str, use_gpu: bool, hf_token=None, api_key=None, backend=None + ): self.model_id = model_id self.use_gpu = use_gpu self.hf_token = hf_token diff --git a/laiser/llm_models/model_loader.py b/laiser/llm_models/model_loader.py index 3c9b251..2ad79ef 100644 --- a/laiser/llm_models/model_loader.py +++ b/laiser/llm_models/model_loader.py @@ -100,7 +100,9 @@ def load_model_from_transformer(model_id: str = None, token: str = ""): def load_model_from_vllm(model_id: str = None, token: str = None, dtype: str = None, quantization: str = None): if not VLLM_AVAILABLE: - raise ImportError("vLLM is not installed. Cannot load model using vLLM backend.") + raise ImportError( + "vLLM is not installed. Cannot load model using vLLM backend." + ) model_id = model_id or DEFAULT_VLLM_MODEL_ID dtype = dtype or "float16" @@ -120,7 +122,9 @@ def load_model_from_vllm(model_id: str = None, token: str = None, dtype: str = N llm = LLM(**llm_args) quant_info = f" with {quantization} quantization" if quantization else "" - print(f"[INFO] Successfully loaded vLLM model: {model_id} with dtype: {dtype}{quant_info}") + print( + f"[INFO] Successfully loaded vLLM model: {model_id} with dtype: {dtype}{quant_info}" + ) except Exception as e: print(f"[WARN] Failed to load model '{model_id}' with dtype '{dtype}': {e}") print(f"[INFO] Falling back to default model: {DEFAULT_VLLM_MODEL_ID}") diff --git a/laiser/services.py b/laiser/services.py index 6e492a8..f3d3a0f 100644 --- a/laiser/services.py +++ b/laiser/services.py @@ -36,7 +36,9 @@ class PromptBuilder: def build_skill_extraction_prompt(input_text: str, input_type: str) -> str: """Build prompt for basic skill extraction""" if input_type == "job_desc": - extraction_prompt = SKILL_EXTRACTION_PROMPT_JOB.format(description=input_text) + extraction_prompt = SKILL_EXTRACTION_PROMPT_JOB.format( + description=input_text + ) return extraction_prompt elif input_type == "syllabus": return SKILL_EXTRACTION_PROMPT_SYLLABUS.format( @@ -83,7 +85,9 @@ def build_ksa_extraction_prompt( ) @staticmethod - def build_ksa_details_prompt(skill: str, description: str, num_key_kr: int = 3, num_key_tas: int = 3) -> str: + def build_ksa_details_prompt( + skill: str, description: str, num_key_kr: int = 3, num_key_tas: int = 3 + ) -> str: """Build prompt for getting detailed KSA information for a specific skill""" return KSA_DETAILS_PROMPT.format( skill=skill, @@ -226,7 +230,9 @@ def parse_ksa_extraction_response(response: str) -> List[Dict[str, Any]]: skill_data["Knowledge Required"] = [k.strip() for k in knowledge_raw.split(",") if k.strip()] # Extract task abilities (multi-line support) - task_match = re.search(r"Task Abilities:\s*(.*?)(?=\s*$)", item, re.DOTALL) + task_match = re.search( + r"Task Abilities:\s*(.*?)(?=\s*$)", item, re.DOTALL + ) if task_match: task_raw = task_match.group(1).strip() skill_data["Task Abilities"] = [t.strip() for t in task_raw.split(",") if t.strip()] @@ -293,19 +299,24 @@ def align_skills_to_taxonomy( raw_skills_matched = [] taxonomy_descriptions = [] taxonomy_sources = [] + source_urls = [] correlations = [] def log_debug(msg: str): if debug: logger.debug(msg) - log_debug(f"[align] raw_skills={len(raw_skills)} threshold={similarity_threshold} top_k={top_k}") + log_debug( + f"[align] raw_skills={len(raw_skills)} threshold={similarity_threshold} top_k={top_k}" + ) model = self.data_access.get_embedding_model() # metadata loaded once metadata = self.faiss_manager.get_metadata() - log_debug(f"[align] metadata type={type(metadata).__name__} len={len(metadata)}") + log_debug( + f"[align] metadata type={type(metadata).__name__} len={len(metadata)}" + ) if isinstance(metadata, pd.DataFrame) and not metadata.empty: log_debug(f"[align] metadata columns={list(metadata.columns)}") @@ -330,7 +341,9 @@ def log_debug(msg: str): meta_idx = best.get("Index") canonical_skill = str(best.get("Skill", "")).strip() - log_debug(f"[skill {i}] best='{canonical_skill}' sim={similarity:.4f} meta_idx={meta_idx}") + log_debug( + f"[skill {i}] best='{canonical_skill}' sim={similarity:.4f} meta_idx={meta_idx}" + ) if similarity < similarity_threshold: log_debug(f"[skill {i}] below threshold -> skip") @@ -340,9 +353,13 @@ def log_debug(msg: str): log_debug(f"[skill {i}] empty canonical_skill -> skip") continue if meta_idx is None: - log_debug(f"[skill {i}] meta_idx is None (search_similar_skills may not return Index)") + log_debug( + f"[skill {i}] meta_idx is None (search_similar_skills may not return Index)" + ) elif int(meta_idx) >= len(metadata) or int(meta_idx) < 0: - log_debug(f"[skill {i}] meta_idx out of range: {meta_idx} (metadata len={len(metadata)})") + log_debug( + f"[skill {i}] meta_idx out of range: {meta_idx} (metadata len={len(metadata)})" + ) else: # ✅ DataFrame row by position meta = metadata.iloc[int(meta_idx)].to_dict() @@ -358,6 +375,7 @@ def log_debug(msg: str): raw_skills_matched.append(skill) taxonomy_descriptions.append(taxonomy_description) taxonomy_sources.append(taxonomy_source) + source_urls.append(meta.get("source_url", "")) correlations.append(similarity) log_debug(f"[align] matched={len(mapped_skills)} of {len(raw_skills)}") @@ -426,7 +444,9 @@ def __init__( self.nlp = None self.data_access = DataAccessLayer() self.faiss_manager = FAISSIndexManager(self.data_access) - self.alignment_service = SkillAlignmentService(self.data_access, self.faiss_manager) + self.alignment_service = SkillAlignmentService( + self.data_access, self.faiss_manager + ) self.prompt_builder = PromptBuilder() self.llm_parser = ResponseParser() self.response_parser = ResponseParser() @@ -599,7 +619,9 @@ def align_extracted_skills( ) if not isinstance(raw_skills, list): - print(f"Warning: raw_skills is not a list, converting from {type(raw_skills)}") + print( + f"Warning: raw_skills is not a list, converting from {type(raw_skills)}" + ) raw_skills = [str(raw_skills)] if raw_skills else [] return self.alignment_service.align_skills_to_taxonomy( diff --git a/laiser/utils.py b/laiser/utils.py index 136d002..9ae9812 100644 --- a/laiser/utils.py +++ b/laiser/utils.py @@ -90,7 +90,9 @@ def build_faiss_index_esco(): # Embed ESCO skills using SentenceTransformer model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2") print("Embedding ESCO skills...") - esco_embeddings = model.encode(skill_names, convert_to_numpy=True, show_progress_bar=True) + esco_embeddings = model.encode( + skill_names, convert_to_numpy=True, show_progress_bar=True + ) # ⚡ Normalize & Index using FAISS (cosine sim = L2 norm + dot product) dimension = esco_embeddings.shape[1] @@ -121,7 +123,9 @@ def load_faiss_index_esco(): index_path = os.path.join(os.path.dirname(__file__), "input/esco_faiss_index.index") if not os.path.exists(index_path): - raise FileNotFoundError(f"FAISS index file not found at {index_path}. Please ensure the file exists.") + raise FileNotFoundError( + f"FAISS index file not found at {index_path}. Please ensure the file exists." + ) index = faiss.read_index(index_path) print("FAISS index for ESCO skills loaded successfully.") return index @@ -144,7 +148,11 @@ def get_top_esco_skills(input_text, top_k=10): emb = model.encode(input_text, convert_to_numpy=True) faiss.normalize_L2(emb.reshape(1, -1)) scores, indices = index.search(emb.reshape(1, -1), top_k) - return [{"Skill": skill_names[i], "index": int(i), "score": float(scores[0][j])} for j, i in enumerate(indices[0])] + return [ + {"Skill": skill_names[i], "index": int(i), "score": float(scores[0][j])} + for j, i in enumerate(indices[0]) + ] + def get_embedding(nlp, input_text): diff --git a/print_system_specs.py b/print_system_specs.py index 46c9ca2..76e4192 100644 --- a/print_system_specs.py +++ b/print_system_specs.py @@ -19,7 +19,9 @@ def _safe_run(cmd, timeout: float = 5.0): """Run a command safely with a timeout; return (code, stdout, stderr).""" try: - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + p = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True + ) try: out, err = p.communicate(timeout=timeout) except subprocess.TimeoutExpired: diff --git a/tests/test_alignment_core.py b/tests/test_alignment_core.py index ac5b49a..6d77b30 100644 --- a/tests/test_alignment_core.py +++ b/tests/test_alignment_core.py @@ -88,7 +88,9 @@ def test_align_skills_to_taxonomy_real_flow(): # Minimal, meaningful assertions assert isinstance(df, pd.DataFrame) - assert set(["Raw Skill", "Taxonomy Skill", "Correlation Coefficient"]).issubset(df.columns) + assert set(["Raw Skill", "Taxonomy Skill", "Correlation Coefficient"]).issubset( + df.columns + ) # For these common skills, we usually expect at least one match if index is healthy. # (We don't assert exact mapping because taxonomies may change.) diff --git a/tests/test_anthropic.py b/tests/test_anthropic.py index 04c4453..3b87ba9 100644 --- a/tests/test_anthropic.py +++ b/tests/test_anthropic.py @@ -2,6 +2,8 @@ import pytest +import pytest + # ✅ import from wherever you saved that function # Example: # from laiser.llm_models.openai_helper import openai_generate diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 4b8dfbc..f0d3537 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -66,7 +66,9 @@ def _pick(cols): # Make sure correlation is numeric if "Correlation Coefficient" in df.columns: - df["Correlation Coefficient"] = pd.to_numeric(df["Correlation Coefficient"], errors="coerce") + df["Correlation Coefficient"] = pd.to_numeric( + df["Correlation Coefficient"], errors="coerce" + ) # --- core view (small table) --- core_cols = ["Research ID", "Raw Skill"] @@ -76,16 +78,26 @@ def _pick(cols): core_cols.append("taxonomy") if "Taxonomy Description" in df.columns: core_cols.append("Taxonomy Description") + if "Source URL" in df.columns: + core_cols.append("Source URL") if "Correlation Coefficient" in df.columns: core_cols.append("Correlation Coefficient") core_view = df.loc[:, [c for c in core_cols if c in df.columns]] # --- taxonomy summary --- - taxonomy_counts = df["taxonomy"].value_counts() if "taxonomy" in df.columns else pd.Series(dtype=int) + taxonomy_counts = ( + df["taxonomy"].value_counts() + if "taxonomy" in df.columns + else pd.Series(dtype=int) + ) # --- correlation stats --- - corr_series = df["Correlation Coefficient"] if "Correlation Coefficient" in df.columns else pd.Series(dtype=float) + corr_series = ( + df["Correlation Coefficient"] + if "Correlation Coefficient" in df.columns + else pd.Series(dtype=float) + ) corr_stats = {} if not corr_series.dropna().empty: corr_stats = { @@ -116,8 +128,12 @@ def _pick(cols): if "Correlation Coefficient" in df.columns: bins = [0.0, 0.4, 0.5, 0.6, 0.7, 1.0] labels = ["<0.4", "0.4–0.5", "0.5–0.6", "0.6–0.7", "0.7+"] - df["Corr Bucket"] = pd.cut(df["Correlation Coefficient"], bins=bins, labels=labels, include_lowest=True) - bucket_counts = df["Corr Bucket"].value_counts().reindex(labels).fillna(0).astype(int) + df["Corr Bucket"] = pd.cut( + df["Correlation Coefficient"], bins=bins, labels=labels, include_lowest=True + ) + bucket_counts = ( + df["Corr Bucket"].value_counts().reindex(labels).fillna(0).astype(int) + ) # --- duplicate taxonomy mapping (taxonomy skill duplicated) --- dup_counts = None diff --git a/tests/test_index_process.py b/tests/test_index_process.py index c4e7ed4..1ddfbb1 100644 --- a/tests/test_index_process.py +++ b/tests/test_index_process.py @@ -32,7 +32,9 @@ def test_initialize_index_full_flow_subtests(): # ---- Step 1: init using existing files (or build if missing) ---- index1, metadata = manager.initialize_index(force_rebuild=False) check.is_not_none(index1, "step1: index1 should not be None") - check.is_true(isinstance(index1, faiss.Index), "step1: index1 should be a FAISS Index") + check.is_true( + isinstance(index1, faiss.Index), "step1: index1 should be a FAISS Index" + ) for p in artifacts: check.is_true(p.exists(), f"step1: artifact should exist: {p.name}") @@ -46,7 +48,9 @@ def test_initialize_index_full_flow_subtests(): check.is_not_none(index3, "step3: index3 should not be None") check.is_true(index3.ntotal > 0, "step3: index3.ntotal should be > 0") for p in artifacts: - check.is_true(p.exists(), f"step3: artifact should exist after rebuild: {p.name}") + check.is_true( + p.exists(), f"step3: artifact should exist after rebuild: {p.name}" + ) # ---- Step 4: delete artifacts ---- for p in artifacts: diff --git a/tests/test_openai.py b/tests/test_openai.py index 5e05b99..6dbd9d4 100644 --- a/tests/test_openai.py +++ b/tests/test_openai.py @@ -2,6 +2,8 @@ import pytest +import pytest + # ✅ import from wherever you saved that function # Example: # from laiser.llm_models.openai_helper import openai_generate