Skip to content

Commit 44fef2b

Browse files
author
Kavyansh Chourasia
committed
Shiksha API: Question paper API tolerant to typos in unit names
1 parent abdcec8 commit 44fef2b

File tree

2 files changed

+110
-39
lines changed

2 files changed

+110
-39
lines changed

shiksha-api/app-service/app/models/question_paper.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,44 @@ class QuestionBankResponse(BaseModel):
155155
questions: List[QuestionTypeResponse] = []
156156

157157

158+
# ============================
159+
# SERVICE INTERNAL MODELS
160+
# ============================
161+
162+
163+
class QuestionInfo(BaseModel):
    """Specification of one question requested inside a generation slot.

    Bundles the question type, the objective it targets, the marks it carries
    and a schema hint for the generated item.
    """

    type: QuestionType
    objective: str
    marks_per_question: int
    # NOTE(review): presumably a textual description of the expected item
    # shape; confirm that whatever produces this value really yields a str.
    schema_hint: str

    @computed_field
    @property
    def type_description(self) -> str:
        """Return the description attached to this question's type."""
        question_type = self.type
        return question_type.description
175+
176+
177+
class GenerationSlot(BaseModel):
    """A batch of questions to be generated together for a specific unit.

    ``question_count`` and ``unique_question_types`` are derived from
    ``questions`` and are exposed as computed fields.
    """

    # Unit that every question in this batch belongs to.
    unit_name: str
    # Learning outcomes associated with the unit.
    learning_outcomes: List[str]
    # Question specifications contained in this batch.
    questions: List[QuestionInfo]
    # NOTE(review): presumably identifies the retrieval index for the unit;
    # verify against the code that consumes it.
    index_path: str

    @computed_field
    @property
    def question_count(self) -> int:
        """Return the number of questions in this slot."""
        return len(self.questions)

    @computed_field
    @property
    def unique_question_types(self) -> List[QuestionType]:
        """Return the distinct question types in first-seen order.

        ``dict.fromkeys`` de-duplicates while keeping insertion order. A plain
        ``set`` would iterate in hash order, which is not stable between
        interpreter runs and would make this field (and anything built from
        it) non-deterministic.
        """
        return list(dict.fromkeys(q.type for q in self.questions))
194+
195+
158196
# ============================
159197
# REQUEST MODELS
160198
# ============================

shiksha-api/app-service/app/services/question_paper_service.py

Lines changed: 72 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,12 @@
2020
QuestionTypeResponse,
2121
QuestionType,
2222
Template,
23+
# Service internal models
24+
QuestionInfo,
25+
GenerationSlot,
2326
)
2427
from app.config import settings
28+
import re
2529

2630
logger = logging.getLogger(__name__)
2731

@@ -131,7 +135,7 @@ def _get_grammar_topics(self, request: QuestionBankPartsGenerationRequest) -> st
131135

132136
def _build_generation_slots(
133137
self, request: QuestionBankPartsGenerationRequest
134-
) -> List[Dict[str, Any]]:
138+
) -> List[GenerationSlot]:
135139
"""Build generation slots from template distributions, grouped by unit with max 20 questions per slot."""
136140
units_dict = self._get_unit_los_dict(request)
137141
units_index_path_dict = self._get_unit_index_path_dict(request)
@@ -152,12 +156,12 @@ def _build_generation_slots(
152156
f"Unit Name `{dist.unit_name}` is not present in the `chapters` attribute in request"
153157
)
154158

155-
question_info = {
156-
"type": template.type,
157-
"objective": dist.objective,
158-
"marks_per_question": template.marks_per_question,
159-
"schema_hint": template.type.schema_dict(),
160-
}
159+
question_info = QuestionInfo(
160+
type=template.type,
161+
objective=dist.objective,
162+
marks_per_question=template.marks_per_question,
163+
schema_hint=template.type.schema_dict(),
164+
)
161165

162166
if dist.unit_name not in unit_questions:
163167
unit_questions[dist.unit_name] = []
@@ -175,12 +179,12 @@ def _build_generation_slots(
175179
for i in range(0, len(questions), self.max_questions_per_slot):
176180
batch = questions[i : i + self.max_questions_per_slot]
177181
slots.append(
178-
{
179-
"unit_name": unit_name,
180-
"learning_outcomes": unit_los, # Same for all questions in unit
181-
"questions": batch,
182-
"index_path": index_path,
183-
}
182+
GenerationSlot(
183+
unit_name=unit_name,
184+
learning_outcomes=unit_los, # Same for all questions in unit
185+
questions=batch,
186+
index_path=index_path,
187+
)
184188
)
185189

186190
return slots
@@ -225,16 +229,16 @@ def _format_system_prompt(
225229
self,
226230
request: QuestionBankPartsGenerationRequest,
227231
existing_questions: List[str],
228-
slot: Dict[str, Any],
232+
slot: GenerationSlot,
229233
) -> str:
230234
"""Format the system prompt using YAML templates for a specific unit slot."""
231235
try:
232236
# Get the main template
233237
template = self.prompts.get("question_bank_parts_gen", "")
234238

235239
# Get unit-specific information from the slot
236-
unit_name = slot["unit_name"]
237-
learning_outcomes = slot["learning_outcomes"]
240+
unit_name = slot.unit_name
241+
learning_outcomes = slot.learning_outcomes
238242

239243
# Get Bloom's taxonomy guide
240244
blooms_guide = self.prompts.get("blooms-taxonomy", {}).get("general", "")
@@ -275,38 +279,43 @@ def _get_format_instruction_for_type(self, qtype: QuestionType) -> str:
275279
return f"- For {qtype}: {qtype.description}. Use format: {qtype.schema_dict()}"
276280

277281
async def _generate_questions_batch(
278-
self, system_prompt: str, slot: Dict[str, Any], rag_adapter: BaseRagAdapter
282+
self, system_prompt: str, slot: GenerationSlot, rag_adapter: BaseRagAdapter
279283
) -> List[Dict[str, Any]]:
280284
"""Generate questions for a batch of slots using RAG adapter."""
281285
# Build slot directives from the new slot structure
282-
slot_questions = slot["questions"]
286+
slot_questions = slot.questions
283287

284288
# Generate dynamic format rules from QuestionType enum
285-
unique_types = set(q["type"] for q in slot_questions)
289+
unique_types = slot.unique_question_types
286290
format_rules = [
287291
self._get_format_instruction_for_type(qtype) for qtype in unique_types
288292
]
289293
format_rules_text = "\n".join(format_rules)
290294

295+
# Convert QuestionInfo objects to dict for JSON serialization
296+
slot_questions_dict = [
297+
{
298+
"type": q.type,
299+
"objective": q.objective,
300+
"marks_per_question": q.marks_per_question,
301+
"schema_hint": q.schema_hint,
302+
}
303+
for q in slot_questions
304+
]
305+
291306
user_message = (
292307
"Generate questions for the following slots in a SINGLE JSON object with an `items` array. "
293308
"For each slot, return exactly ONE object with the following fields:\n "
294-
"`unit_name`, `type`, `objective`, `marks_per_question` and `item`\n"
309+
"`type`, `objective`, `marks_per_question` and `item`\n"
295310
"`item` field should adhere to the question's `schema_hint`.\n\n"
296311
"Format rules by question type:\n"
297312
f"{format_rules_text}\n"
298-
f"Question slots:\n{json.dumps(slot_questions, ensure_ascii=False)}"
313+
f"Question slots:\n{json.dumps(slot_questions_dict, ensure_ascii=False)}"
299314
)
300315

301316
# Build chat history with system message only (as per requirement)
302317
chat_history = [ChatMessage(role="system", content=system_prompt)]
303318

304-
print(
305-
"********************* SYSTEM PROMPT *********************",
306-
system_prompt,
307-
)
308-
print("********************* USER MESSAGE *********************", user_message)
309-
310319
# Use RAG adapter to chat with index
311320
response_content = await rag_adapter.chat_with_index(
312321
curr_message=user_message, chat_history=chat_history
@@ -318,7 +327,7 @@ async def _generate_questions_batch(
318327
# Parse response
319328
response_data = json.loads(content)
320329

321-
print(
330+
# The payload is passed as a lazy %-style argument, so the message needs a
# matching placeholder; without one, logging reports a formatting error
# instead of emitting the record.
logger.info(
    "********************** RESPONSE DATA **********************\n%s",
    json.dumps(response_data, indent=2),
)
@@ -329,12 +338,16 @@ async def _generate_questions_batch(
329338
logger.warning("No items found in completion response")
330339
return []
331340

341+
# Add unit_name to each generated item for proper key generation
342+
for item in items:
343+
item["unit_name"] = slot.unit_name
344+
332345
return items
333346

334347
async def _generate_questions_batch_async(
335348
self,
336349
system_prompt: str,
337-
slot: Dict[str, Any],
350+
slot: GenerationSlot,
338351
rag_adapter: BaseRagAdapter,
339352
delay_seconds: int = 0,
340353
) -> List[Dict[str, Any]]:
@@ -356,16 +369,29 @@ def _organize_questions_into_response(
356369
# Create a question directory to organize questions by their specification
357370
question_directory = {}
358371

372+
def normalize_key_component(text: str) -> str:
    """Normalize a key component so visually identical strings compare equal.

    Steps, in order:
      1. Unicode NFKC normalization, so composed/decomposed and compatibility
         variants (e.g. a non-breaking space) collapse to one spelling.
      2. Removal of control characters other than tab, newline and carriage
         return.
      3. Removal of every Unicode format character (category ``Cf``): the
         zero-width characters, BOM, soft hyphen, word joiner, bidi marks.
      4. Collapsing whitespace runs to a single space and trimming the ends.
      5. Case folding for caseless comparison.

    Args:
        text: Raw value; falsy values (``None``, ``""``) yield ``""``.
            Non-string truthy values are converted with ``str()``.

    Returns:
        The normalized string.
    """
    # Local import keeps the helper self-contained.
    import unicodedata

    if not text:
        return ""
    text = unicodedata.normalize("NFKC", str(text))
    # Remove control characters except tab, newline, carriage return
    text = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]", "", text)
    # Remove all invisible format characters, not just a hand-picked few
    text = "".join(ch for ch in text if unicodedata.category(ch) != "Cf")
    # Normalize whitespace and strip
    text = re.sub(r"\s+", " ", text).strip()
    # casefold() is the caseless-matching form of lower() (e.g. "ß" -> "ss")
    return text.casefold()
384+
359385
# Organize generated questions by (type, marks, unit_name, objective)
360386
for i, generated in enumerate(all_generated):
361387
qtype = QuestionType(generated.get("type"))
362-
unit_name = generated.get("unit_name")
363-
objective = generated.get("objective")
364-
marks_per_question = generated.get("marks_per_question")
388+
unit_name = normalize_key_component(generated.get("unit_name", ""))
389+
objective = normalize_key_component(generated.get("objective", ""))
390+
marks_per_question = generated.get("marks_per_question", 0)
365391
item = generated.get("item")
366392

367-
# Normalize key to lowercase to avoid case sensitivity issues
368-
key = f"{qtype.value}|{marks_per_question}|{unit_name}|{objective}".lower()
393+
# Create normalized key
394+
key = f"{qtype.value.lower()}|{marks_per_question}|{unit_name}|{objective}"
369395
if key not in question_directory:
370396
question_directory[key] = []
371397

@@ -387,8 +413,10 @@ def _organize_questions_into_response(
387413

388414
# Add questions in the order specified by question_distribution
389415
for q_dist in template.question_distribution or []:
390-
# Normalize key to lowercase to match the question_directory keys
391-
key = f"{template.type.value}|{template.marks_per_question}|{q_dist.unit_name}|{q_dist.objective}".lower()
416+
# Create normalized key to match the question_directory keys
417+
normalized_unit = normalize_key_component(q_dist.unit_name)
418+
normalized_objective = normalize_key_component(q_dist.objective)
419+
key = f"{template.type.value.lower()}|{template.marks_per_question}|{normalized_unit}|{normalized_objective}"
392420

393421
if key in question_directory and len(question_directory[key]) > 0:
394422
question = question_directory[key].pop(0)
@@ -401,6 +429,10 @@ def _organize_questions_into_response(
401429
logger.warning(
402430
f"--\nNo question found for Normalized key: {key}"
403431
)
432+
# Debug: show all available keys for troubleshooting
433+
logger.warning(
434+
f"Available keys: {list(question_directory.keys())}"
435+
)
404436

405437
response_questions.append(question_type_resp)
406438

@@ -453,9 +485,7 @@ async def generate_question_bank_by_parts(
453485
tasks = []
454486
for j, slot in enumerate(batch_slots):
455487
# Get or create cached RAG adapter instance
456-
rag_adapter = await self._get_or_create_rag_adapter(
457-
slot["index_path"]
458-
)
488+
rag_adapter = await self._get_or_create_rag_adapter(slot.index_path)
459489
# Initiate the index (download files for InMem, no-op for Qdrant)
460490
await rag_adapter.initiate_index()
461491

@@ -475,6 +505,9 @@ async def generate_question_bank_by_parts(
475505

476506
# Add all generated questions from this batch
477507
for raw_items in batch_results:
508+
if isinstance(raw_items, Exception):
509+
logger.error(f"Error in batch generation: {raw_items}")
510+
continue
478511
if raw_items:
479512
all_generated.extend(raw_items)
480513

0 commit comments

Comments
 (0)