Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 21 additions & 7 deletions laiser/data_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,9 @@ def __init__(self, data_access: DataAccessLayer):

# Issue: Do we even need this? Can't this be done in init
# Issue [GFI_OddEven]: Split these into two separate modules: load and build index
def initialize_index(self, force_rebuild: bool = False, debug: bool = False) -> faiss.IndexFlatIP:
def initialize_index(
self, force_rebuild: bool = False, debug: bool = False
) -> faiss.IndexFlatIP:
"""Initialize FAISS index (load or build).

Behavior (minimal & strict):
Expand All @@ -260,7 +262,9 @@ def initialize_index(self, force_rebuild: bool = False, debug: bool = False) ->
## Issue: Embedding (npy) is not accessed. Cosine Calculations might be faster if npy is accessed.
try:
self.index = self.data_access.load_faiss_index(str(local_index_path))
self.metadata = self.data_access.load_skill_metadata(str(local_json_path))
self.metadata = self.data_access.load_skill_metadata(
str(local_json_path)
)
except Exception as e:
if debug:
logger.warning(f"[initialize_index] load attempt failed: {e}")
Expand Down Expand Up @@ -318,7 +322,9 @@ def initialize_index(self, force_rebuild: bool = False, debug: bool = False) ->
"taxonomy",
]:
# find actual column name (case-insensitive)
found = next((col for col in single_df.columns if col.lower() == c), None)
found = next(
(col for col in single_df.columns if col.lower() == c), None
)
if found:
keep_cols.append(found)
single_df = single_df[keep_cols].copy()
Expand Down Expand Up @@ -511,7 +517,9 @@ def _find(cols_substr):

# Persist metadata JSON and FAISS index (best-effort with debug warnings)
try:
self.data_access.save_skill_metadata_json(self.metadata, str(local_json_path))
self.data_access.save_skill_metadata_json(
self.metadata, str(local_json_path)
)
except Exception as e:
if debug:
logger.warning(f"[initialize_index] Failed to write metadata JSON: {e}")
Expand Down Expand Up @@ -554,10 +562,14 @@ def search_similar_skills(
ordered by similarity. If you want a safety limit, pass `max_results` (int).
"""
if self.index is None:
raise FAISSIndexError("FAISS index not initialized. Call initialize_index() first.")
raise FAISSIndexError(
"FAISS index not initialized. Call initialize_index() first."
)

if self.metadata is None:
raise FAISSIndexError("Metadata not initialized. Call initialize_index() first.")
raise FAISSIndexError(
"Metadata not initialized. Call initialize_index() first."
)

# Ensure skill_names come from metadata
if self.skill_names is None:
Expand Down Expand Up @@ -598,7 +610,9 @@ def search_similar_skills(
top_k = max(1, min(int(top_k), ntotal))
scores, indices = self.index.search(q, top_k)
results = []
for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1):
for rank, (score, idx) in enumerate(
zip(scores[0], indices[0]), start=1
):
if idx == -1:
continue
results.append(
Expand Down
17 changes: 13 additions & 4 deletions laiser/llm_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,10 @@

# Provide a fallback function
def llm_router(*args, **kwargs):
raise ImportError("llm_router is not available. Please check your installation.")
raise ImportError(
"llm_router is not available. Please check your installation."
)



torch.cuda.empty_cache()
Expand Down Expand Up @@ -517,7 +520,9 @@ def get_completion_vllm(input_text, text_columns, id_column, input_type, llm, ba
"""

try:
result = vllm_generate(llm, input_text, input_type=input_type, batch_size=batch_size)
result = vllm_generate(
llm, input_text, input_type=input_type, batch_size=batch_size
)
except Exception as e:
print(f"Error in vLLM generation: {e}")
return []
Expand All @@ -537,7 +542,9 @@ def get_completion_vllm(input_text, text_columns, id_column, input_type, llm, ba
parsed_output.extend(parsed)
except Exception as e:
print(f"Error parsing output for index {i}: {e}")
print(f"DataFrame shape: {input_text.shape}, trying to access index {i}")
print(
f"DataFrame shape: {input_text.shape}, trying to access index {i}"
)
print(f"Available indices: {list(input_text.index)}")
continue

Expand Down Expand Up @@ -601,7 +608,9 @@ def get_ksa_details(
raw_text = llm_router(prompt, model_id, use_gpu, llm, tokenizer, model, api_key)
json_match = re.search(r"\{.*\}", raw_text, re.DOTALL)
if not json_match:
print(f"[get_ksa_details] No JSON match found in response for skill '{skill}'")
print(
f"[get_ksa_details] No JSON match found in response for skill '{skill}'"
)
return [], []

parsed = json.loads(json_match.group())
Expand Down
4 changes: 3 additions & 1 deletion laiser/llm_models/hugging_face_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ def llm_generate(prompt: str, tokenizer, model, model_id: str, use_gpu: bool):

def llm_generate_vllm(prompt, llm):
if not VLLM_AVAILABLE:
raise ImportError("vLLM is not installed. Please install it to use this function.")
raise ImportError(
"vLLM is not installed. Please install it to use this function."
)

sampling_params = SamplingParams(max_tokens=200, seed=42)
result = llm.generate([prompt], sampling_params=sampling_params)
Expand Down
8 changes: 6 additions & 2 deletions laiser/llm_models/llama_cpp_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def __init__(

model_path = model_path or os.getenv("LAISER_LLAMA_CPP_MODEL_PATH")
if not model_path:
raise ValueError("Set LAISER_LLAMA_CPP_MODEL_PATH or pass model_path to LlamaCppBackend.")
raise ValueError(
"Set LAISER_LLAMA_CPP_MODEL_PATH or pass model_path to LlamaCppBackend."
)

model_path = str(Path(model_path))
# model_path = str(Path(model_path).expanduser().resolve())
Expand Down Expand Up @@ -107,7 +109,9 @@ def llama_cpp_chat(
) -> str:

if llama is None:
raise ValueError("llama is None; expected an initialized llama_cpp.Llama instance.")
raise ValueError(
"llama is None; expected an initialized llama_cpp.Llama instance."
)

messages = [
{"role": "system", "content": system},
Expand Down
9 changes: 7 additions & 2 deletions laiser/llm_models/llm_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,17 @@ def gemini_generate(*args, **kwargs):
print(f"Warning: HuggingFace LLM support not available: {e}")

def llm_generate_vllm(*args, **kwargs):
raise ImportError("HuggingFace LLM support is not available. Please install required packages.")
raise ImportError(
"HuggingFace LLM support is not available. Please install required packages."
)



class LLMRouter:

def __init__(self, model_id: str, use_gpu: bool, hf_token=None, api_key=None, backend=None):
def __init__(
self, model_id: str, use_gpu: bool, hf_token=None, api_key=None, backend=None
):
self.model_id = model_id
self.use_gpu = use_gpu
self.hf_token = hf_token
Expand Down
8 changes: 6 additions & 2 deletions laiser/llm_models/model_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,9 @@ def load_model_from_transformer(model_id: str = None, token: str = ""):
def load_model_from_vllm(model_id: str = None, token: str = None, dtype: str = None, quantization: str = None):

if not VLLM_AVAILABLE:
raise ImportError("vLLM is not installed. Cannot load model using vLLM backend.")
raise ImportError(
"vLLM is not installed. Cannot load model using vLLM backend."
)

model_id = model_id or DEFAULT_VLLM_MODEL_ID
dtype = dtype or "float16"
Expand All @@ -120,7 +122,9 @@ def load_model_from_vllm(model_id: str = None, token: str = None, dtype: str = N
llm = LLM(**llm_args)

quant_info = f" with {quantization} quantization" if quantization else ""
print(f"[INFO] Successfully loaded vLLM model: {model_id} with dtype: {dtype}{quant_info}")
print(
f"[INFO] Successfully loaded vLLM model: {model_id} with dtype: {dtype}{quant_info}"
)
except Exception as e:
print(f"[WARN] Failed to load model '{model_id}' with dtype '{dtype}': {e}")
print(f"[INFO] Falling back to default model: {DEFAULT_VLLM_MODEL_ID}")
Expand Down
42 changes: 32 additions & 10 deletions laiser/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ class PromptBuilder:
def build_skill_extraction_prompt(input_text: str, input_type: str) -> str:
"""Build prompt for basic skill extraction"""
if input_type == "job_desc":
extraction_prompt = SKILL_EXTRACTION_PROMPT_JOB.format(description=input_text)
extraction_prompt = SKILL_EXTRACTION_PROMPT_JOB.format(
description=input_text
)
return extraction_prompt
elif input_type == "syllabus":
return SKILL_EXTRACTION_PROMPT_SYLLABUS.format(
Expand Down Expand Up @@ -83,7 +85,9 @@ def build_ksa_extraction_prompt(
)

@staticmethod
def build_ksa_details_prompt(skill: str, description: str, num_key_kr: int = 3, num_key_tas: int = 3) -> str:
def build_ksa_details_prompt(
skill: str, description: str, num_key_kr: int = 3, num_key_tas: int = 3
) -> str:
"""Build prompt for getting detailed KSA information for a specific skill"""
return KSA_DETAILS_PROMPT.format(
skill=skill,
Expand Down Expand Up @@ -226,7 +230,9 @@ def parse_ksa_extraction_response(response: str) -> List[Dict[str, Any]]:
skill_data["Knowledge Required"] = [k.strip() for k in knowledge_raw.split(",") if k.strip()]

# Extract task abilities (multi-line support)
task_match = re.search(r"Task Abilities:\s*(.*?)(?=\s*$)", item, re.DOTALL)
task_match = re.search(
r"Task Abilities:\s*(.*?)(?=\s*$)", item, re.DOTALL
)
if task_match:
task_raw = task_match.group(1).strip()
skill_data["Task Abilities"] = [t.strip() for t in task_raw.split(",") if t.strip()]
Expand Down Expand Up @@ -293,19 +299,24 @@ def align_skills_to_taxonomy(
raw_skills_matched = []
taxonomy_descriptions = []
taxonomy_sources = []
source_urls = []
correlations = []

def log_debug(msg: str):
if debug:
logger.debug(msg)

log_debug(f"[align] raw_skills={len(raw_skills)} threshold={similarity_threshold} top_k={top_k}")
log_debug(
f"[align] raw_skills={len(raw_skills)} threshold={similarity_threshold} top_k={top_k}"
)

model = self.data_access.get_embedding_model()

# metadata loaded once
metadata = self.faiss_manager.get_metadata()
log_debug(f"[align] metadata type={type(metadata).__name__} len={len(metadata)}")
log_debug(
f"[align] metadata type={type(metadata).__name__} len={len(metadata)}"
)
if isinstance(metadata, pd.DataFrame) and not metadata.empty:
log_debug(f"[align] metadata columns={list(metadata.columns)}")

Expand All @@ -330,7 +341,9 @@ def log_debug(msg: str):
meta_idx = best.get("Index")
canonical_skill = str(best.get("Skill", "")).strip()

log_debug(f"[skill {i}] best='{canonical_skill}' sim={similarity:.4f} meta_idx={meta_idx}")
log_debug(
f"[skill {i}] best='{canonical_skill}' sim={similarity:.4f} meta_idx={meta_idx}"
)

if similarity < similarity_threshold:
log_debug(f"[skill {i}] below threshold -> skip")
Expand All @@ -340,9 +353,13 @@ def log_debug(msg: str):
log_debug(f"[skill {i}] empty canonical_skill -> skip")
continue
if meta_idx is None:
log_debug(f"[skill {i}] meta_idx is None (search_similar_skills may not return Index)")
log_debug(
f"[skill {i}] meta_idx is None (search_similar_skills may not return Index)"
)
elif int(meta_idx) >= len(metadata) or int(meta_idx) < 0:
log_debug(f"[skill {i}] meta_idx out of range: {meta_idx} (metadata len={len(metadata)})")
log_debug(
f"[skill {i}] meta_idx out of range: {meta_idx} (metadata len={len(metadata)})"
)
else:
# ✅ DataFrame row by position
meta = metadata.iloc[int(meta_idx)].to_dict()
Expand All @@ -358,6 +375,7 @@ def log_debug(msg: str):
raw_skills_matched.append(skill)
taxonomy_descriptions.append(taxonomy_description)
taxonomy_sources.append(taxonomy_source)
source_urls.append(meta.get("source_url", ""))
correlations.append(similarity)

log_debug(f"[align] matched={len(mapped_skills)} of {len(raw_skills)}")
Expand Down Expand Up @@ -426,7 +444,9 @@ def __init__(
self.nlp = None
self.data_access = DataAccessLayer()
self.faiss_manager = FAISSIndexManager(self.data_access)
self.alignment_service = SkillAlignmentService(self.data_access, self.faiss_manager)
self.alignment_service = SkillAlignmentService(
self.data_access, self.faiss_manager
)
self.prompt_builder = PromptBuilder()
self.llm_parser = ResponseParser()
self.response_parser = ResponseParser()
Expand Down Expand Up @@ -599,7 +619,9 @@ def align_extracted_skills(
)

if not isinstance(raw_skills, list):
print(f"Warning: raw_skills is not a list, converting from {type(raw_skills)}")
print(
f"Warning: raw_skills is not a list, converting from {type(raw_skills)}"
)
raw_skills = [str(raw_skills)] if raw_skills else []

return self.alignment_service.align_skills_to_taxonomy(
Expand Down
14 changes: 11 additions & 3 deletions laiser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ def build_faiss_index_esco():
# Embed ESCO skills using SentenceTransformer
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
print("Embedding ESCO skills...")
esco_embeddings = model.encode(skill_names, convert_to_numpy=True, show_progress_bar=True)
esco_embeddings = model.encode(
skill_names, convert_to_numpy=True, show_progress_bar=True
)

# ⚡ Normalize & Index using FAISS (cosine sim = L2 norm + dot product)
dimension = esco_embeddings.shape[1]
Expand Down Expand Up @@ -121,7 +123,9 @@ def load_faiss_index_esco():

index_path = os.path.join(os.path.dirname(__file__), "input/esco_faiss_index.index")
if not os.path.exists(index_path):
raise FileNotFoundError(f"FAISS index file not found at {index_path}. Please ensure the file exists.")
raise FileNotFoundError(
f"FAISS index file not found at {index_path}. Please ensure the file exists."
)
index = faiss.read_index(index_path)
print("FAISS index for ESCO skills loaded successfully.")
return index
Expand All @@ -144,7 +148,11 @@ def get_top_esco_skills(input_text, top_k=10):
emb = model.encode(input_text, convert_to_numpy=True)
faiss.normalize_L2(emb.reshape(1, -1))
scores, indices = index.search(emb.reshape(1, -1), top_k)
return [{"Skill": skill_names[i], "index": int(i), "score": float(scores[0][j])} for j, i in enumerate(indices[0])]
return [
{"Skill": skill_names[i], "index": int(i), "score": float(scores[0][j])}
for j, i in enumerate(indices[0])
]



def get_embedding(nlp, input_text):
Expand Down
4 changes: 3 additions & 1 deletion print_system_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
def _safe_run(cmd, timeout: float = 5.0):
"""Run a command safely with a timeout; return (code, stdout, stderr)."""
try:
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
p = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
)
try:
out, err = p.communicate(timeout=timeout)
except subprocess.TimeoutExpired:
Expand Down
4 changes: 3 additions & 1 deletion tests/test_alignment_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,9 @@ def test_align_skills_to_taxonomy_real_flow():

# Minimal, meaningful assertions
assert isinstance(df, pd.DataFrame)
assert set(["Raw Skill", "Taxonomy Skill", "Correlation Coefficient"]).issubset(df.columns)
assert set(["Raw Skill", "Taxonomy Skill", "Correlation Coefficient"]).issubset(
df.columns
)

# For these common skills, we usually expect at least one match if index is healthy.
# (We don't assert exact mapping because taxonomies may change.)
Expand Down
2 changes: 2 additions & 0 deletions tests/test_anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import pytest

import pytest

# ✅ import from wherever you saved that function
# Example:
# from laiser.llm_models.openai_helper import openai_generate
Expand Down
Loading
Loading