Skip to content

Commit 8304135

Browse files
committed
Prioritize numbers next to currencies
1 parent 4d9c393 commit 8304135

File tree

2 files changed

+57
-18
lines changed

2 files changed

+57
-18
lines changed

price_parser/parser.py

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# -*- coding: utf-8 -*-
22
import re
33
import string
4-
from typing import Callable, Optional, Pattern, List, Tuple
4+
from typing import Callable, Match, Optional, Pattern, List, Tuple
55
from decimal import Decimal, InvalidOperation
66

77
import attr
@@ -36,11 +36,11 @@ def fromstring(cls, price: Optional[str],
3636
``price`` string, it could be **preferred** over a value extracted
3737
from ``currency_hint`` string.
3838
"""
39-
amount_text = extract_price_text(price) if price is not None else None
39+
currency, source = _extract_currency_symbol(price, currency_hint)
40+
amount_text = extract_price_text(price, currency if source == price else None) if price is not None else None
4041
amount_num = parse_number(amount_text) if amount_text is not None else None
41-
currency = extract_currency_symbol(price, currency_hint)
4242
if currency is not None:
43-
currency = currency.strip()
43+
currency = currency.group(0).strip()
4444
return Price(
4545
amount=amount_num,
4646
currency=currency,
@@ -120,11 +120,12 @@ def or_regex(symbols: List[str]) -> Pattern:
120120
_search_unsafe_currency = or_regex(OTHER_CURRENCY_SYMBOLS).search
121121

122122

123-
def extract_currency_symbol(price: Optional[str],
124-
currency_hint: Optional[str]) -> Optional[str]:
123+
def _extract_currency_symbol(price: Optional[str],
124+
currency_hint: Optional[str]) -> Optional[str]:
125125
"""
126-
Guess currency symbol from extracted price and currency strings.
127-
Return an empty string if symbol is not found.
126+
Guess the currency symbol from extracted price and currency strings.
127+
Return a (`match object`_, source_string) tuple with the symbol found and
128+
the string where it was found, or (None, None) if no symbol is found.
128129
"""
129130
methods: List[Tuple[Callable, Optional[str]]] = [
130131
(_search_safe_currency, price),
@@ -142,17 +143,32 @@ def extract_currency_symbol(price: Optional[str],
142143
for meth, attr in methods:
143144
m = meth(attr) if attr else None
144145
if m:
145-
return m.group(0)
146+
return m, attr
147+
148+
return None, None
146149

150+
151+
def extract_currency_symbol(price: Optional[str],
152+
currency_hint: Optional[str]) -> Optional[str]:
153+
"""
154+
Guess currency symbol from extracted price and currency strings.
155+
Return the matched symbol as a string, or None if no symbol is found.
156+
"""
157+
match, _ = _extract_currency_symbol(price, currency_hint)
158+
if match:
159+
return match.group(0)
147160
return None
148161

149162

150-
def extract_price_text(price: str) -> Optional[str]:
163+
def extract_price_text(price: str, currency_match: Optional[Match] = None) -> Optional[str]:
151164
"""
152165
Extract text of a price from a string which contains price and
153-
maybe some other text. If multiple price-looking substrings are present,
154-
the first is returned (FIXME: it is better to return a number
155-
which is near a currency symbol).
166+
maybe some other text.
167+
168+
If a match object of the currency within the `price` string is provided,
169+
amounts before or after the matched currency substring are prioritized.
170+
Otherwise, if multiple price-looking substrings are present, the first is
171+
returned.
156172
157173
>>> extract_price_text("price: $12.99")
158174
'12.99'
@@ -189,16 +205,39 @@ def extract_price_text(price: str) -> Optional[str]:
189205
""", price, re.VERBOSE)
190206
if m:
191207
return m.group(0).replace(' ', '')
208+
209+
def number_from_match(m):
210+
return m.group(1).strip(',.').strip()
211+
212+
if currency_match is not None:
213+
214+
m = re.search(r"""
215+
(\d[\d\s.,]*) # number, probably with thousand separators
216+
\s*$ # only match right before the currency symbol
217+
""", price[:currency_match.start(0)], re.VERBOSE)
218+
if m:
219+
return number_from_match(m)
220+
221+
m = re.search(r"""
222+
^\s* # only match right after the currency symbol
223+
(\d[\d\s.,]*) # number, probably with thousand separators
224+
\s* # skip whitespace
225+
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
226+
""", price[currency_match.end(0):], re.VERBOSE)
227+
if m:
228+
return number_from_match(m)
229+
192230
m = re.search(r"""
193231
(\d[\d\s.,]*) # number, probably with thousand separators
194232
\s* # skip whitespace
195233
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
196234
""", price, re.VERBOSE)
197-
198235
if m:
199-
return m.group(1).strip(',.').strip()
236+
return number_from_match(m)
237+
200238
if 'free' in price.lower():
201239
return '0'
240+
202241
return None
203242

204243

tests/test_price_parsing.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -618,7 +618,7 @@ def __eq__(self, other):
618618
Example('€', '€ 139.00',
619619
'€', '139.00', 139),
620620
Example('There are 163 products.', 'From 26 to 50 €',
621-
'€', '26', 26),
621+
'€', '50', 50),
622622
Example('Pris NOK 1 999,00', '139,00',
623623
'NOK', '139,00', 139),
624624
Example('/sqft', '1.52',
@@ -1901,13 +1901,13 @@ def __eq__(self, other):
19011901
'CHF', '19.90', 19.90),
19021902
Example('', '530,42 Zł',
19031903
'Zł', '530,42', 530.42),
1904+
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
1905+
'EUR', '14,85', 14.85),
19041906
]
19051907

19061908

19071909
PRICE_PARSING_EXAMPLES_XFAIL = [
19081910
# a quantity (not the actual price) is picked as the price
1909-
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
1910-
'EUR', '14,85', 14.85),
19111911
Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00',
19121912
'$', '60.00', 60),
19131913
Example('Цена: уточняйте (мин. заказ: 1 )', 'Цена: уточняйте (мин. заказ: 1 )',

0 commit comments

Comments
 (0)