Skip to content

Commit ec90991

Browse files
authored
Merge pull request #20 from OpenIsraeliSupermarkets/patch
v0.1.5
2 parents e9cdee5 + b16ea27 commit ec90991

7 files changed

Lines changed: 84 additions & 13 deletions

File tree

il_supermarket_parsers/parsers/hazi_hinam.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
from il_supermarket_parsers.engines import BaseFileConverter
2-
from il_supermarket_parsers.documents import XmlDataFrameConverter
2+
from il_supermarket_parsers.documents import (
3+
XmlDataFrameConverter,
4+
SubRootedXmlDataFrameConverter,
5+
)
36

47

58
class HaziHinamFileConverter(BaseFileConverter):
@@ -33,10 +36,12 @@ def __init__(self) -> None:
3336
roots=["ChainId", "SubChainId", "StoreId", "BikoretNo"],
3437
ignore_column=["XmlDocVersion", "DllVerNo"],
3538
),
36-
stores_parser=XmlDataFrameConverter(
39+
stores_parser=SubRootedXmlDataFrameConverter(
3740
id_field="StoreID",
38-
list_key="Stores",
39-
roots=["ChainId"],
41+
list_key="SubChains",
42+
roots=["ChainId", "ChainName", "LastUpdateDate", "LastUpdateTime"],
43+
list_sub_key="Stores",
44+
sub_roots=["SubChainID", "SubChainName"],
4045
ignore_column=[],
4146
),
4247
)

il_supermarket_parsers/parsers/other.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from il_supermarket_parsers.engines.base import BaseFileConverter
2+
from il_supermarket_parsers.documents import SubRootedXmlDataFrameConverter
3+
24
from .confix import CofixFileConverter
35

46

@@ -153,3 +155,26 @@ class WoltFileConverter(BaseFileConverter):
153155
"""
154156
wolt
155157
"""
158+
159+
def __init__(self) -> None:
    """Set up the Wolt converter with its promo-full parser configuration."""
    # Passed as ``roots``; presumably document-level fields repeated on every
    # row -- confirm against SubRootedXmlDataFrameConverter.
    document_roots = ["ChainId", "SubChainId", "StoreId", "BikoretNo"]

    # Passed as ``sub_roots``; presumably per-promotion fields carried down
    # to each entry under "PromotionItems" -- confirm against the converter.
    promotion_roots = [
        "Remarks",
        "AdditionalRestrictions",
        "ClubId",
        "PromotionEndHour",
        "PromotionUpdateTime",
        "PromotionId",
        "PromotionDescription",
        "PromotionStartDate",
        "PromotionStartHour",
        "PromotionEndDate",
    ]

    promo_full = SubRootedXmlDataFrameConverter(
        id_field="PromotionId",
        list_key="Promotions",
        list_sub_key="PromotionItems",
        roots=document_roots,
        sub_roots=promotion_roots,
        ignore_column=["XmlDocVersion", "DllVerNo"],
    )
    super().__init__(promofull_parser=promo_full)

il_supermarket_parsers/parsers/tests/test_case.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,12 @@ def __parser_validate(self, file_type, dump_path="temp"):
7575
files_types=[file_type],
7676
).load()
7777

78+
complete_file_loaded = list(map(lambda x: x.get_full_path(), files))
79+
files_from_folder = self.list_xml_files_recursive(sub_folder)
80+
assert sorted(complete_file_loaded) == sorted(files_from_folder), (
81+
f"dataloader failed, failed to load"
82+
f": {list(set(files_from_folder) - set(complete_file_loaded))}"
83+
)
7884
dfs = []
7985
for file in files:
8086

@@ -97,6 +103,15 @@ def __parser_validate(self, file_type, dump_path="temp"):
97103
if dfs:
98104
pd.concat(dfs)
99105

106+
def list_xml_files_recursive(self, directory):
    """Return the paths of all XML files found anywhere under *directory*.

    The tree is traversed with ``os.walk``; every returned path is the walk
    root joined with the file name. The order follows the traversal and is
    not sorted. A directory that does not exist yields an empty list.

    Only names ending in ``.xml`` (case-insensitive) are kept. A plain
    substring test would also report names that merely contain "xml", such
    as ``xml_notes.txt`` or ``dump.xml.gz``, which are not XML files.
    """
    return [
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
        if name.lower().endswith(".xml")
    ]
114+
100115
def test_parsing_store(self):
101116
"""scrape one file and make sure it exists"""
102117
self._parser_validate(FileTypesFilters.STORE_FILE.name)

il_supermarket_parsers/utils/data_loader.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,20 @@ def _format_datetime(self, date):
6666

6767
def _file_name_to_components(self, store_folder, file_name, empty_store_id="0000"):
6868
"""extract file name components"""
69+
70+
_file_name_split = file_name.split(".")[0].split("-")
6971
try:
70-
prefix_file_name, store_number, date, *_ = file_name.split(".")[0].split(
71-
"-"
72-
)
72+
# Promo7290700100008-000-207-20250224-103225
73+
if len(_file_name_split) == 5:
74+
prefix_file_name, _, store_number, date, time, *_ = _file_name_split
75+
extracted_datetime = date + time
76+
else:
77+
prefix_file_name, store_number, extracted_datetime, *_ = (
78+
_file_name_split
79+
)
7380
except ValueError:
7481
# global files
75-
prefix_file_name, date, *_ = file_name.split(".")[0].split("-")
82+
prefix_file_name, extracted_datetime, *_ = _file_name_split
7683
store_number = empty_store_id
7784

7885
file_type, chain_id = self._find_file_type_and_chain_id(prefix_file_name)
@@ -83,7 +90,7 @@ def _file_name_to_components(self, store_folder, file_name, empty_store_id="0000
8390
prefix_file_name=prefix_file_name,
8491
extracted_store_number=store_number,
8592
extracted_chain_id=chain_id,
86-
extracted_date=self._format_datetime(date),
93+
extracted_date=self._format_datetime(extracted_datetime),
8794
detected_filetype=file_type,
8895
)
8996

il_supermarket_parsers/utils/xml_utils.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import xml.etree.ElementTree as ET
2+
from lxml import etree
23

34

45
def count_tag_in_xml(xml_file_path, tag_to_count):
@@ -83,13 +84,31 @@ def change_xml_encoding(file_path):
8384
)
8485

8586

87+
def try_to_recover_xml(file_path):
    """Re-parse the file with lxml in recovery mode and rewrite it in place.

    The file at *file_path* is read as bytes, parsed with an lxml parser
    created with ``recover=True`` and ``encoding="utf-8"``, serialised back
    (pretty-printed, UTF-8) and written over the original file.
    """
    recovering_parser = etree.XMLParser(recover=True, encoding="utf-8")
    with open(file_path, "rb") as source:
        document = etree.parse(source, recovering_parser)

    # The whole document is serialised before the file is reopened for
    # writing, so a serialisation failure leaves the original untouched.
    serialized = etree.tostring(document, pretty_print=True, encoding="utf-8")
    repaired_text = serialized.decode("utf-8")

    with open(file_path, "w", encoding="utf-8") as target:
        target.write(repaired_text)
99+
100+
86101
def get_root(file):
    """Parse *file* and return the root element of its XML tree.

    Parsing is attempted in up to three stages:

    1. Parse the file as-is with ``xml.etree.ElementTree``.
    2. On a parse error, call ``try_to_recover_xml`` on the file and parse
       it again.
    3. If recovery itself fails, or the recovered file still does not
       parse, call ``change_xml_encoding`` and parse one last time.

    Both helpers are called with *file* directly; ``try_to_recover_xml``
    opens it, so *file* is presumably a filesystem path -- confirm against
    callers.

    Raises:
        xml.etree.ElementTree.ParseError: if the final parse attempt also
            fails.
    """
    try:
        tree = ET.parse(file)
    except ET.ParseError:
        try:
            try_to_recover_xml(file)
            tree = ET.parse(file)
        # The recovery step parses with lxml, whose XMLSyntaxError is not a
        # subclass of ET.ParseError. Catch both so a failed recovery still
        # falls through to the encoding fix instead of propagating.
        except (ET.ParseError, etree.XMLSyntaxError):
            change_xml_encoding(file)
            tree = ET.parse(file)

    return tree.getroot()
95114

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
pandas==2.2.2
22
lxml==5.2.1
33
pymongo==4.6.3
4-
il-supermarket-scraper>=0.5.6
4+
il-supermarket-scraper>=0.5.7
55
tqdm==4.66

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
tests_require=dev_required,
3232
extras_require={"test": ["pytest", "pytest-xdist"]},
3333
# *strongly* suggested for sharing
34-
version="0.1.4",
34+
version="0.1.5",
3535
# The license can be anything you like
3636
license="MIT",
3737
description="python package that process the data dumped by the israeli supermarket",

0 commit comments

Comments
 (0)