From fb203bb26151a746888b26957ae87df02b8c06fb Mon Sep 17 00:00:00 2001 From: buxy Date: Tue, 24 Mar 2026 17:22:41 +0530 Subject: [PATCH] build:[EN Currency] Fix ISO-prefix value extraction; add ISO prefix and crore/lakh support (#3211, #3212) --- Patterns/English/English-NumbersWithUnit.yaml | 31 ++-- .../number_with_unit/models.py | 60 +++++++ .../resources/english_numeric_with_unit.py | 29 ++-- .../NumberWithUnit/English/CurrencyModel.json | 151 ++++++++++++++++++ 4 files changed, 251 insertions(+), 20 deletions(-) diff --git a/Patterns/English/English-NumbersWithUnit.yaml b/Patterns/English/English-NumbersWithUnit.yaml index bccfa80e3a..c845b6f145 100644 --- a/Patterns/English/English-NumbersWithUnit.yaml +++ b/Patterns/English/English-NumbersWithUnit.yaml @@ -658,7 +658,7 @@ FractionalUnitNameToCodeMap: !dictionary CompoundUnitConnectorRegex: !simpleRegex def: (?and) MultiplierRegex: !simpleRegex - def: \s*\b(thousand|million|billion|trillion)s?\b + def: \s*\b(thousand|million|billion|trillion|lakh|crore)s?\b CurrencyPrefixList: !dictionary types: [ string, string ] entries: @@ -666,10 +666,10 @@ CurrencyPrefixList: !dictionary Dobra: db|std Dollar: $ Brazilian Real: R$ - United States dollar: united states $|us$|us $|u.s. $|u.s $|usd$ + United States dollar: united states $|us$|us $|u.s. 
$|u.s $|usd$|usd East Caribbean dollar: east caribbean $ - Mexican peso: mxn$|mxn $|mex$ - Australian dollar: australian $|australia $ + Mexican peso: mxn$|mxn $|mex$|mxn + Australian dollar: australian $|australia $|a$|aud Bahamian dollar: bahamian $|bahamia $ Barbadian dollar: barbadian $|barbadin $ Belize dollar: belize $ @@ -677,15 +677,15 @@ CurrencyPrefixList: !dictionary British Virgin Islands dollar: british virgin islands $|bvi$|virgin islands $|virgin island $|british virgin island $ Brunei dollar: brunei $|b$ Sen: sen - Singapore dollar: singapore $|s$ - Canadian dollar: canadian $|can$|c$|c $|canada $ + Singapore dollar: singapore $|s$|sg$|sgd + Canadian dollar: canadian $|can$|c$|c $|canada $|cad$|cad Cayman Islands dollar: cayman islands $|ci$|cayman island $ New Zealand dollar: new zealand $|nz$|nz $ Cook Islands dollar: cook islands $|cook island $ Fijian dollar: fijian $|fiji $ Guyanese dollar: gy$|gy $|g$|g $ Hong Kong dollar: hong kong $|hk$|hkd|hk $ - Indian rupee: ₹ + Indian rupee: ₹|inr|rs Jamaican dollar: jamaican $|j$|jamaica $ Kiribati dollar: kiribati $ Liberian dollar: liberian $|liberia $ @@ -701,12 +701,23 @@ CurrencyPrefixList: !dictionary Trinidad and Tobago dollar: trinidad and tobago $|trinidad $|trinidadian $ Tuvaluan dollar: tuvaluan $ Samoan tālā: ws$ - Chinese yuan: ¥ - Japanese yen: ¥ - Euro: € + Chinese yuan: ¥|cny|rmb + Japanese yen: ¥|jpy + Euro: €|eur Pound: £ Costa Rican colón: ₡ Turkish lira: ₺ +# ISO 4217 prefix codes for currencies commonly used as leading prefixes +# in financial documents but absent from symbol-only entries above + British pound: gbp + Vietnamese dong: vnd + Swedish krona: sek + Norwegian krone: nok + Danish krone: dkk + Swiss franc: chf + South Korean won: krw + Brazilian real: brl + South African rand: zar #CC Bitcoin: ₿|btc|xbt AmbiguousCurrencyUnitList: !list diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/models.py 
b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/models.py index 61f19c18a7..c4cbcf2a40 100644 --- a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/models.py +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/models.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +import re from abc import abstractmethod from typing import List @@ -10,6 +11,26 @@ from recognizers_text.utilities import QueryProcessor from recognizers_number_with_unit.number_with_unit.parsers import UnitValue, CurrencyUnitValue +# Matches an uppercase ISO-style currency prefix (1–3 letters then '$', or exactly 3 letters) +# directly followed by a digit — e.g. 'USD34', 'VND4,927', 'A$100', 'SG$40', 'CAD$1'. +# Used in CurrencyModel.parse() to insert a separating space before +# QueryProcessor lowercases the query, preventing the internal number extractor +# from misreading patterns like 'usd34.6 million' as '6 million'. +_CURRENCY_ISO_CONCAT_RE = re.compile(r'\b([A-Z]{1,3}\$|[A-Z]{3})(?=\d)') + + +def _to_original_pos(normalised_pos: int, insertions: List[int]) -> int: + """Convert a position in the space-normalised string to the original string position. + + Each inserted space at original position insertions[k] shifts all subsequent + normalised positions by +k+1. To reverse: subtract the count of insertions + whose normalised position falls strictly before normalised_pos. 
+ """ + count = sum( + 1 for k, p in enumerate(insertions) if p + k < normalised_pos + ) + return normalised_pos - count + class ExtractorParserModel: def __init__(self, extractor: Extractor, parser: Parser): @@ -106,6 +127,45 @@ class CurrencyModel(AbstractNumberWithUnitModel): def model_type_name(self) -> str: return 'currency' + def parse(self, query: str) -> List[ModelResult]: + # Normalise uppercase ISO currency prefixes that are directly + # concatenated to digits before the base class calls + # QueryProcessor.preprocess() (which lowercases the query). + # + # Without this step the internal EnglishNumberExtractor (Unit mode) + # misreads patterns such as: + # 'USD34.6 million' -> extracts '6 million' (decimal boundary) + # 'VND4,927 billion' -> extracts '927 billion' (comma boundary) + # + # After inserting the space: + # 'USD34.6 million' -> 'USD 34.6 million' -> '34.6 million' ✓ + # 'VND4,927 billion' -> 'VND 4,927 billion' -> '4,927 billion' ✓ + # + # Uppercase-only matching avoids false positives on common English + # words ('can', 'try', 'nor', etc.) which are rarely written in all caps. + # + # When spaces are inserted the base-class results carry positions and + # text from the normalised string, not the original. We record the + # insertion points and map every result back to the original string so + # that callers always receive offsets that are valid against their input. + insertions = [m.end() for m in _CURRENCY_ISO_CONCAT_RE.finditer(query)] + + if not insertions: + # No concatenation found — no position adjustment needed. 
+ return super().parse(query) + + normalised = _CURRENCY_ISO_CONCAT_RE.sub(r'\1 ', query) + results = super().parse(normalised) + + for result in results: + orig_start = _to_original_pos(result.start, insertions) + orig_end = _to_original_pos(result.end, insertions) + result.start = orig_start + result.end = orig_end + result.text = query[orig_start:orig_end + 1] + + return results + class DimensionModel(AbstractNumberWithUnitModel): @property diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/resources/english_numeric_with_unit.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/resources/english_numeric_with_unit.py index a70f1811ba..be5ea68715 100644 --- a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/resources/english_numeric_with_unit.py +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/resources/english_numeric_with_unit.py @@ -546,14 +546,14 @@ class EnglishNumericWithUnit: ("Millibitcoin", "MILLIBITCOIN"), ("Satoshi", "SATOSHI")]) CompoundUnitConnectorRegex = f'(?and)' - MultiplierRegex = f'\\s*\\b(thousand|million|billion|trillion)s?\\b' + MultiplierRegex = f'\\s*\\b(thousand|million|billion|trillion|lakh|crore)s?\\b' CurrencyPrefixList = dict([("Dobra", "db|std"), ("Dollar", "$"), ("Brazilian Real", "R$"), - ("United States dollar", "united states $|us$|us $|u.s. $|u.s $|usd$"), + ("United States dollar", "united states $|us$|us $|u.s. 
$|u.s $|usd$|usd"), ("East Caribbean dollar", "east caribbean $"), - ("Mexican peso", "mxn$|mxn $|mex$"), - ("Australian dollar", "australian $|australia $"), + ("Mexican peso", "mxn$|mxn $|mex$|mxn"), + ("Australian dollar", "australian $|australia $|a$|aud"), ("Bahamian dollar", "bahamian $|bahamia $"), ("Barbadian dollar", "barbadian $|barbadin $"), ("Belize dollar", "belize $"), @@ -561,15 +561,15 @@ class EnglishNumericWithUnit: ("British Virgin Islands dollar", "british virgin islands $|bvi$|virgin islands $|virgin island $|british virgin island $"), ("Brunei dollar", "brunei $|b$"), ("Sen", "sen"), - ("Singapore dollar", "singapore $|s$"), - ("Canadian dollar", "canadian $|can$|c$|c $|canada $"), + ("Singapore dollar", "singapore $|s$|sg$|sgd"), + ("Canadian dollar", "canadian $|can$|c$|c $|canada $|cad$|cad"), ("Cayman Islands dollar", "cayman islands $|ci$|cayman island $"), ("New Zealand dollar", "new zealand $|nz$|nz $"), ("Cook Islands dollar", "cook islands $|cook island $"), ("Fijian dollar", "fijian $|fiji $"), ("Guyanese dollar", "gy$|gy $|g$|g $"), ("Hong Kong dollar", "hong kong $|hk$|hkd|hk $"), - ("Indian rupee", "₹"), + ("Indian rupee", "₹|inr|rs"), ("Jamaican dollar", "jamaican $|j$|jamaica $"), ("Kiribati dollar", "kiribati $"), ("Liberian dollar", "liberian $|liberia $"), @@ -585,12 +585,21 @@ class EnglishNumericWithUnit: ("Trinidad and Tobago dollar", "trinidad and tobago $|trinidad $|trinidadian $"), ("Tuvaluan dollar", "tuvaluan $"), ("Samoan tālā", "ws$"), - ("Chinese yuan", "¥"), - ("Japanese yen", "¥"), - ("Euro", "€"), + ("Chinese yuan", "¥|cny|rmb"), + ("Japanese yen", "¥|jpy"), + ("Euro", "€|eur"), ("Pound", "£"), ("Costa Rican colón", "₡"), ("Turkish lira", "₺"), + ("British pound", "gbp"), + ("Vietnamese dong", "vnd"), + ("Swedish krona", "sek"), + ("Norwegian krone", "nok"), + ("Danish krone", "dkk"), + ("Swiss franc", "chf"), + ("South Korean won", "krw"), + ("Brazilian real", "brl"), + ("South African rand", "zar"), 
("Bitcoin", "₿|btc|xbt")]) AmbiguousCurrencyUnitList = [r'din.', r'kiwi', r'kina', r'kobo', r'lari', r'lipa', r'napa', r'para', r'sfr.', r'taka', r'tala', r'toea', r'vatu', r'yuan', r'all', r'ang', r'ban', r'bob', r'btn', r'byr', r'cad', r'cop', r'cup', r'dop', r'gip', r'jod', r'kgs', r'lak', r'lei', r'mga', r'mop', r'nad', r'omr', r'pul', r'sar', r'sbd', r'scr', r'sdg', r'sek', r'sen', r'sol', r'sos', r'std', r'try', r'yer', r'yen', r'db', r'pen', r'ron', r'mad', r'zar', r'gel', r'satoshi', r'satoshis'] InformationSuffixList = dict([("Bit", "-bit|bit|bits"), diff --git a/Specs/NumberWithUnit/English/CurrencyModel.json b/Specs/NumberWithUnit/English/CurrencyModel.json index aba9f3a7e0..50aaa28ba1 100644 --- a/Specs/NumberWithUnit/English/CurrencyModel.json +++ b/Specs/NumberWithUnit/English/CurrencyModel.json @@ -2980,5 +2980,156 @@ } } ] + }, + { + "Input": "USD34.6 million deal", + "NotSupported": "java, javascript", + "Results": [ + { + "Text": "USD34.6 million", + "Start": 0, + "End": 14, + "TypeName": "currency", + "Resolution": { + "isoCurrency": "USD", + "unit": "United States dollar", + "value": "34600000" + } + } + ] + }, + { + "Input": "USD0.92 Million consideration", + "NotSupported": "java, javascript", + "Results": [ + { + "Text": "USD0.92 Million", + "Start": 0, + "End": 14, + "TypeName": "currency", + "Resolution": { + "isoCurrency": "USD", + "unit": "United States dollar", + "value": "920000" + } + } + ] + }, + { + "Input": "VND4,927 billion acquisition", + "NotSupported": "java, javascript", + "Results": [ + { + "Text": "VND4,927 billion", + "Start": 0, + "End": 15, + "TypeName": "currency", + "Resolution": { + "unit": "Vietnamese dong", + "value": "4927000000000" + } + } + ] + }, + { + "Input": "GBP 27 million", + "NotSupported": "java, javascript", + "Results": [ + { + "Text": "gbp 27 million", + "Start": 0, + "End": 13, + "TypeName": "currency", + "Resolution": { + "isoCurrency": "GBP", + "unit": "British pound", + "value": "27000000" + } + } + 
] + }, + { + "Input": "SEK 60,500,000", + "NotSupported": "java, javascript", + "Results": [ + { + "Text": "sek 60,500,000", + "Start": 0, + "End": 13, + "TypeName": "currency", + "Resolution": { + "isoCurrency": "SEK", + "unit": "Swedish krona", + "value": "60500000" + } + } + ] + }, + { + "Input": "CAD$1,700,000", + "NotSupported": "java, javascript", + "Results": [ + { + "Text": "CAD$1,700,000", + "Start": 0, + "End": 12, + "TypeName": "currency", + "Resolution": { + "isoCurrency": "CAD", + "unit": "Canadian dollar", + "value": "1700000" + } + } + ] + }, + { + "Input": "A$100,000 cash consideration", + "NotSupported": "java, javascript", + "Results": [ + { + "Text": "A$100,000", + "Start": 0, + "End": 8, + "TypeName": "currency", + "Resolution": { + "isoCurrency": "AUD", + "unit": "Australian dollar", + "value": "100000" + } + } + ] + }, + { + "Input": "SG$40 million", + "NotSupported": "java, javascript", + "Results": [ + { + "Text": "SG$40 million", + "Start": 0, + "End": 12, + "TypeName": "currency", + "Resolution": { + "isoCurrency": "SGD", + "unit": "Singapore dollar", + "value": "40000000" + } + } + ] + }, + { + "Input": "Rs 660 crore", + "NotSupported": "java, javascript", + "Results": [ + { + "Text": "rs 660 crore", + "Start": 0, + "End": 11, + "TypeName": "currency", + "Resolution": { + "unit": "Rupee", + "value": "6600000000" + } + } + ] } ]