Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ pipeline {
HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1'
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/01-12-26-0'
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/01-16-26-0'
DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
yr वर्ष
28 changes: 19 additions & 9 deletions nemo_text_processing/text_normalization/hi/taggers/cardinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space
from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_HI_DIGIT, GraphFst, insert_space
from nemo_text_processing.text_normalization.hi.utils import get_abs_path


Expand All @@ -41,6 +41,11 @@ def __init__(self, deterministic: bool = True, lm: bool = False):
self.zero = zero
self.teens_and_ties = teens_and_ties

# Single digit graph for digit-by-digit reading
# e.g., "०७३" -> "शून्य सात तीन"
single_digit_graph = digit | zero
self.single_digits_graph = single_digit_graph + pynini.closure(insert_space + single_digit_graph)

def create_graph_suffix(digit_graph, suffix, zeros_counts):
zero = pynutil.add_weight(pynutil.delete("०"), -0.1)
if zeros_counts == 0:
Expand Down Expand Up @@ -298,13 +303,8 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph):
graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 0, graph_ten_padmas)
graph_ten_shankhs.optimize()

# Only match exactly 2 digits to avoid interfering with telephone numbers, decimals, etc.
# e.g., "०५" -> "शून्य पाँच"
single_digit = digit | zero
graph_leading_zero = zero + insert_space + single_digit
graph_leading_zero = pynutil.add_weight(graph_leading_zero, 0.5)

final_graph = (
# Graph without leading zeros - used by other taggers like ordinal, decimal and measure
graph_without_leading_zeros = (
digit
| zero
| teens_and_ties
Expand All @@ -325,8 +325,18 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph):
| graph_ten_padmas
| graph_shankhs
| graph_ten_shankhs
| graph_leading_zero
)
self.graph_without_leading_zeros = graph_without_leading_zeros.optimize()

# Handle numbers with leading zeros by reading digit-by-digit
# e.g., "०७३" -> "शून्य सात तीन", "००५" -> "शून्य शून्य पाँच"
cardinal_with_leading_zeros = pynini.compose(
pynini.accep("०") + pynini.closure(NEMO_HI_DIGIT), self.single_digits_graph
)
cardinal_with_leading_zeros = pynutil.add_weight(cardinal_with_leading_zeros, 0.5)

# Full graph including leading zeros - for standalone cardinal matching
final_graph = graph_without_leading_zeros | cardinal_with_leading_zeros

optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
super().__init__(name="decimal", kind="classify", deterministic=deterministic)

graph_digit = cardinal.digit | cardinal.zero
cardinal_graph = cardinal.final_graph
cardinal_graph = cardinal.graph_without_leading_zeros

self.graph = graph_digit + pynini.closure(insert_space + graph_digit).optimize()

Expand Down
72 changes: 71 additions & 1 deletion nemo_text_processing/text_normalization/hi/taggers/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,14 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp
decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional
unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv"))

# Year unit variants for formal/informal handling
year_informal = pynini.string_map([("yr", "साल")])
year_formal = pynini.string_file(get_abs_path("data/measure/unit_year_formal.tsv"))

# All units EXCEPT year
unit_inputs_except_yr = pynini.difference(pynini.project(unit_graph, "input"), pynini.accep("yr"))
unit_graph_no_year = pynini.compose(unit_inputs_except_yr, unit_graph)

# Load quarterly units from separate files: map (FST) and list (FSA)
quarterly_units_map = pynini.string_file(get_abs_path("data/measure/quarterly_units_map.tsv"))
quarterly_units_list = pynini.string_file(get_abs_path("data/measure/quarterly_units_list.tsv"))
Expand All @@ -243,7 +251,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp
unit = (
pynutil.insert(NEMO_SPACE)
+ pynutil.insert("units: \"")
+ unit_graph
+ unit_graph_no_year
+ pynutil.insert("\"")
+ pynutil.insert(NEMO_SPACE)
)
Expand All @@ -255,6 +263,29 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp
+ pynutil.insert(NEMO_SPACE)
)

# Year-specific unit wrappers
unit_year_informal = (
pynutil.insert(NEMO_SPACE)
+ pynutil.insert("units: \"")
+ year_informal
+ pynutil.insert("\"")
+ pynutil.insert(NEMO_SPACE)
)
unit_year_formal = (
pynutil.insert(NEMO_SPACE)
+ pynutil.insert("units: \"")
+ year_formal
+ pynutil.insert("\"")
+ pynutil.insert(NEMO_SPACE)
)

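# Intended for cardinals >= 1000 -> formal year (वर्ष)
# NOTE: graph_without_leading_zeros accepts every cardinal (single digits up to shankhs), not only
# values >= 1000; numbers < 1000 are steered to the informal year graph by its lower weight in the final union.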
cardinal_large = cardinal.graph_without_leading_zeros

# Cardinal < 1000 -> informal year (साल)
cardinal_small = cardinal.zero | cardinal.digit | cardinal.teens_and_ties | cardinal.graph_hundreds

symbol_graph = pynini.string_map(
[
(LOWERCASE_X, HI_BY),
Expand Down Expand Up @@ -354,6 +385,42 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp
+ unit
)

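# Large numbers (>=1000) + yr -> formal (वर्ष); cardinal_large also accepts smaller values,
# but for those the lower-weighted informal year graph takes precedence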
graph_cardinal_year_formal = (
pynutil.insert("cardinal { ")
+ optional_graph_negative
+ pynutil.insert("integer: \"")
+ cardinal_large
+ pynutil.insert("\"")
+ pynutil.insert(NEMO_SPACE)
+ pynutil.insert("}")
+ delete_space
+ unit_year_formal
)

# Small numbers (<1000) + yr -> informal (साल)
graph_cardinal_year_informal = (
pynutil.insert("cardinal { ")
+ optional_graph_negative
+ pynutil.insert("integer: \"")
+ cardinal_small
+ pynutil.insert("\"")
+ pynutil.insert(NEMO_SPACE)
+ pynutil.insert("}")
+ delete_space
+ unit_year_informal
)

# Regular decimals (e.g., 16.07) + yr -> formal (वर्ष)
graph_decimal_year_formal = (
pynutil.insert("decimal { ")
+ optional_graph_negative
+ decimal_graph
+ pynutil.insert(" }")
+ delete_space
+ unit_year_formal
)

# Handle a cardinal joined directly to a symbol as a single token
graph_exceptions = (
pynutil.insert("cardinal { ")
Expand Down Expand Up @@ -381,7 +448,10 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp

graph = (
pynutil.add_weight(graph_decimal, 0.1)
| pynutil.add_weight(graph_decimal_year_formal, 0.1)
| pynutil.add_weight(graph_cardinal, 0.1)
| pynutil.add_weight(graph_cardinal_year_formal, 0.1)
| pynutil.add_weight(graph_cardinal_year_informal, -0.1) # Higher priority for small numbers
| pynutil.add_weight(graph_exceptions, 0.1)
| pynutil.add_weight(graph_dedh_dhai, -0.2)
| pynutil.add_weight(graph_savva, -0.1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,4 +144,7 @@
५१०२२३४५५६७~इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ
२ पॉइंट्स १२ गोल~दो पॉइंट्स बारह गोल
०५~शून्य पाँच
०१~शून्य एक
०१~शून्य एक
०७३~शून्य सात तीन
०००१~शून्य शून्य शून्य एक
०००~शून्य शून्य शून्य
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,10 @@
५x५ का सोफ़ा~पाँच बाई पाँच का सोफ़ा
२x२ रुबिक्स क्यूब~दो बाई दो रुबिक्स क्यूब
१३x१३ का घर~तेरह बाई तेरह का घर
१००० yr~एक हज़ार वर्ष
९९९९ yr~नौ हज़ार नौ सौ निन्यानबे वर्ष
१६.०७ yr~सोलह दशमलव शून्य सात वर्ष
५ yr~पाँच साल
१.५ yr~डेढ़ साल
२.५ yr~ढाई साल
३.५ yr~साढ़े तीन साल