Skip to content

Commit 464b820

Browse files
committed
feat: db exports for developpers
1 parent 543d0fc commit 464b820

File tree

5 files changed

+299
-71
lines changed

5 files changed

+299
-71
lines changed

docker-compose.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ services:
1919
- "${POSTGRES_EXPOSE:-5512}:5432"
2020
volumes:
2121
- dbdata:/var/lib/postgresql/data
22+
# a shared folder makes it easy to share data for import / exports
23+
- ./data:/opt/data
2224
networks:
2325
- common_net
2426

query/tables/product_tags.py

Lines changed: 1 addition & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -2,77 +2,7 @@
22
The order of tags is not preserved"""
33

44
from ..database import create_record, get_rows_affected
5-
6-
COUNTRIES_TAG = "countries_tags"
7-
tag_tables_v1 = {
8-
"_keywords": "product_keywords_tag",
9-
"additives_tags": "product_additives_tag",
10-
"allergens_tags": "product_allergens_tag",
11-
"amino_acids_tags": "product_amino_acids_tag",
12-
"brands_tags": "product_brands_tag",
13-
"categories_properties_tags": "product_categories_properites_tag",
14-
"categories_tags": "product_categories_tag",
15-
"checkers_tags": "product_checkers_tag",
16-
"cities_tags": "product_cities_tag",
17-
"codes_tags": "product_codes_tag",
18-
"correctors_tags": "product_correctors_tag",
19-
COUNTRIES_TAG: "product_countries_tag",
20-
"data_quality_bugs_tags": "product_data_quality_bugs_tag",
21-
"data_quality_errors_tags": "product_data_quality_errors_tag",
22-
"data_quality_tags": "product_data_quality_tag",
23-
"data_quality_warnings_tags": "product_data_quality_warnings_tag",
24-
"data_sources_tags": "product_data_sources_tag",
25-
"debug_tags": "product_debug_tag",
26-
"ecoscore_tags": "product_ecoscore_tag",
27-
"editors_tags": "product_editors_tag",
28-
"emb_codes_tags": "product_emb_codes_tag",
29-
"entry_dates_tags": "product_entry_dates_tag",
30-
"food_groups_tags": "product_food_groups_tag",
31-
"informers_tags": "product_informers_tag",
32-
"ingredients_analysis_tags": "product_ingredients_analysis_tag",
33-
"ingredients_from_palm_oil_tags": "product_ingredients_from_palm_oil_tag",
34-
"ingredients_n_tags": "product_ingredients_ntag",
35-
"ingredients_original_tags": "product_ingredients_original_tag",
36-
"ingredients_tags": "product_ingredients_tag",
37-
"ingredients_that_may_be_from_palm_oil_tags": "product_ingredients_that_may_be_from_palm_oil_tag",
38-
"labels_tags": "product_labels_tag",
39-
"languages_tags": "product_languages_tag",
40-
"last_check_dates_tags": "product_last_check_dates_tag",
41-
"last_edit_dates_tags": "product_last_edit_dates_tag",
42-
"last_image_dates_tags": "product_latest_image_dates_tag",
43-
"manufacturing_places_tags": "product_manufacturing_places_tag",
44-
"minerals_tags": "product_minerals_tag",
45-
"misc_tags": "product_misc_tag",
46-
"nova_groups_tags": "product_nova_groups_tag",
47-
"nucleotides_tags": "product_nucleotides_tag",
48-
"nutrient_levels_tags": "product_nutrient_levels_tag",
49-
"nutriscore_2021_tags": "product_nutriscore2021tag",
50-
"nutriscore_2023_tags": "product_nutriscore2023tag",
51-
"nutriscore_tags": "product_nutriscore_tag",
52-
"nutrition_grades_tags": "product_nutrition_grades_tag",
53-
"origins_tags": "product_origins_tag",
54-
"other_nutritional_substances_tags": "product_other_nutritional_substances_tag",
55-
"packaging_materials_tags": "product_packaging_materials_tag",
56-
"packaging_recycling_tags": "product_packaging_recycling_tag",
57-
"packaging_shapes_tags": "product_packaging_shapes_tag",
58-
"packaging_tags": "product_packaging_tag",
59-
"periods_after_opening_tags": "product_periods_after_opening_tag",
60-
"photographers_tags": "product_photographers_tag",
61-
"pnns_groups_1_tags": "product_pnns_groups1tag",
62-
"pnns_groups_2_tags": "product_pnns_groups2tag",
63-
"popularity_tags": "product_popularity_tag",
64-
"purchase_places_tags": "product_purchase_places_tag",
65-
"states_tags": "product_states_tag",
66-
"stores_tags": "product_stores_tag",
67-
"teams_tags": "product_teams_tag",
68-
"traces_tags": "product_traces_tag",
69-
"unknown_nutrients_tags": "product_unknown_nutrients_tag",
70-
"vitamins_tags": "product_vitamins_tag",
71-
"weighers_tags": "product_weighers_tag",
72-
}
73-
74-
# Append additional tag tables to this list when we introduce them and then add a migration to create the new tables
75-
TAG_TABLES = tag_tables_v1
5+
from .product_tags_list import COUNTRIES_TAG, TAG_TABLES
766

777

788
async def create_tables(transaction, tag_tables):

query/tables/product_tags_list.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""List of tags tables
2+
"""
3+
# We keep this list in a simple, dependency-free module so that it is easy to import, e.g. from the export script.
4+
5+
# Key of the countries entry, kept as a named constant so that other modules
# can refer to it without repeating the literal.
COUNTRIES_TAG = "countries_tags"
# Mapping whose values are used as database table names (the export script
# iterates over TAG_TABLES.values() and selects from each of them).
# NOTE(review): keys look like the product tag field names - confirm.
# NOTE(review): a few values are spelled irregularly (e.g.
# "product_categories_properites_tag", or "product_latest_image_dates_tag"
# for "last_image_dates_tags"); presumably they match the existing table
# names, so do not "fix" them without a migration - confirm.
tag_tables_v1 = {
    "_keywords": "product_keywords_tag",
    "additives_tags": "product_additives_tag",
    "allergens_tags": "product_allergens_tag",
    "amino_acids_tags": "product_amino_acids_tag",
    "brands_tags": "product_brands_tag",
    "categories_properties_tags": "product_categories_properites_tag",
    "categories_tags": "product_categories_tag",
    "checkers_tags": "product_checkers_tag",
    "cities_tags": "product_cities_tag",
    "codes_tags": "product_codes_tag",
    "correctors_tags": "product_correctors_tag",
    COUNTRIES_TAG: "product_countries_tag",
    "data_quality_bugs_tags": "product_data_quality_bugs_tag",
    "data_quality_errors_tags": "product_data_quality_errors_tag",
    "data_quality_tags": "product_data_quality_tag",
    "data_quality_warnings_tags": "product_data_quality_warnings_tag",
    "data_sources_tags": "product_data_sources_tag",
    "debug_tags": "product_debug_tag",
    "ecoscore_tags": "product_ecoscore_tag",
    "editors_tags": "product_editors_tag",
    "emb_codes_tags": "product_emb_codes_tag",
    "entry_dates_tags": "product_entry_dates_tag",
    "food_groups_tags": "product_food_groups_tag",
    "informers_tags": "product_informers_tag",
    "ingredients_analysis_tags": "product_ingredients_analysis_tag",
    "ingredients_from_palm_oil_tags": "product_ingredients_from_palm_oil_tag",
    "ingredients_n_tags": "product_ingredients_ntag",
    "ingredients_original_tags": "product_ingredients_original_tag",
    "ingredients_tags": "product_ingredients_tag",
    "ingredients_that_may_be_from_palm_oil_tags": "product_ingredients_that_may_be_from_palm_oil_tag",
    "labels_tags": "product_labels_tag",
    "languages_tags": "product_languages_tag",
    "last_check_dates_tags": "product_last_check_dates_tag",
    "last_edit_dates_tags": "product_last_edit_dates_tag",
    "last_image_dates_tags": "product_latest_image_dates_tag",
    "manufacturing_places_tags": "product_manufacturing_places_tag",
    "minerals_tags": "product_minerals_tag",
    "misc_tags": "product_misc_tag",
    "nova_groups_tags": "product_nova_groups_tag",
    "nucleotides_tags": "product_nucleotides_tag",
    "nutrient_levels_tags": "product_nutrient_levels_tag",
    "nutriscore_2021_tags": "product_nutriscore2021tag",
    "nutriscore_2023_tags": "product_nutriscore2023tag",
    "nutriscore_tags": "product_nutriscore_tag",
    "nutrition_grades_tags": "product_nutrition_grades_tag",
    "origins_tags": "product_origins_tag",
    "other_nutritional_substances_tags": "product_other_nutritional_substances_tag",
    "packaging_materials_tags": "product_packaging_materials_tag",
    "packaging_recycling_tags": "product_packaging_recycling_tag",
    "packaging_shapes_tags": "product_packaging_shapes_tag",
    "packaging_tags": "product_packaging_tag",
    "periods_after_opening_tags": "product_periods_after_opening_tag",
    "photographers_tags": "product_photographers_tag",
    "pnns_groups_1_tags": "product_pnns_groups1tag",
    "pnns_groups_2_tags": "product_pnns_groups2tag",
    "popularity_tags": "product_popularity_tag",
    "purchase_places_tags": "product_purchase_places_tag",
    "states_tags": "product_states_tag",
    "stores_tags": "product_stores_tag",
    "teams_tags": "product_teams_tag",
    "traces_tags": "product_traces_tag",
    "unknown_nutrients_tags": "product_unknown_nutrients_tag",
    "vitamins_tags": "product_vitamins_tag",
    "weighers_tags": "product_weighers_tag",
}

# Append additional tag tables to this list when we introduce them
# and then add a migration to create the new tables.
# Note that TAG_TABLES is the very same dict object as tag_tables_v1,
# not a copy: mutating one mutates the other.
TAG_TABLES = tag_tables_v1

scripts/export_db_sample.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
import sys
4+
import datetime as dt
5+
from textwrap import dedent
6+
7+
sys.path.append(".")
8+
from query.tables import product_tags_list
9+
10+
11+
# Help text of the script; it is passed to argparse as the parser description.
DESCRIPTION = """
Generate a script to export some data from openfoodfacts-query database,
starting from the event table.

It is intended to get data for developers.

It can be piped into the docker psql, this will generate exports in a folder,
that you can then either
get from a shared folder (typically /opt/data mapped to ./data),
or `docker cp -a container_name:/path/to/folder local/path`

This script currently only exports part of the tables.
"""
# TODO: add product_country, product_scans_by_country, country, contributor
25+
26+
27+
def _psql_quote(value):
    """Return ``value`` as a single-quoted psql meta-command argument.

    Inside single quotes psql interprets backslash escapes and needs a single
    quote to be doubled, so both characters are escaped here.
    """
    escaped = str(value).replace("\\", "\\\\").replace("'", "''")
    return f"'{escaped}'"


def generate_export_script(from_date, to_date, export_dir, tag_tables=None):
    """Generate the psql commands exporting a time slice of the database as CSV.

    The slice is driven by ``product_update_event.received_at``: events in the
    range are exported, then the product updates attached to those events,
    then the products (and their tag rows) referenced by those updates.

    :param from_date: lower bound of ``received_at`` (inclusive, SQL ``between``)
    :param to_date: upper bound of ``received_at`` (inclusive)
    :param export_dir: directory, as seen by the process running psql / the
        postgres server, where the CSV files are written
    :param tag_tables: optional iterable of tag table names to export;
        defaults to every table of ``product_tags_list.TAG_TABLES``
    :return: list of script fragments, meant to be joined with newlines
        and piped into psql
    """
    # local import: shlex is only needed by this function
    import shlex

    if tag_tables is None:
        tag_tables = product_tags_list.TAG_TABLES.values()

    outputs = []

    def cmd(sql):
        outputs.append(dedent(sql))

    # The remainder of a `\!` line is handed to the shell as is, so the
    # directory must be shell-quoted to survive spaces or special characters.
    shell_dir = shlex.quote(export_dir)
    cmd(
        f"""
        \\! mkdir -p {shell_dir}
        -- enable writing by postgres process
        \\! chmod a+rwX {shell_dir}
        \\set export_dir {_psql_quote(export_dir)}
        \\set from_date {_psql_quote(from_date)}
        \\set to_date {_psql_quote(to_date)}
        """
    )

    # events received in the requested range
    cmd(
        """
        \\set export_path :export_dir/product_update_event.csv

        COPY (
            select * from product_update_event as e
            where
            e.received_at::timestamp between :'from_date' and :'to_date'
        )
        TO :'export_path' DELIMITER ',' CSV HEADER;
        """
    )

    # product updates attached to those events
    # (the filter on e.received_at discards updates without a matching event)
    cmd(
        """
        \\set export_path :export_dir/product_update.csv

        COPY (
            select p.* from product_update as p
            left join product_update_event as e
            on p.event_id = e.id
            where
            e.received_at::timestamp between :'from_date' and :'to_date'
        )
        TO :'export_path' DELIMITER ',' CSV HEADER;
        """
    )

    # products referenced by those updates
    cmd(
        """
        \\set export_path :export_dir/product.csv
        COPY (
            select * from product
            where id in
            (
                select p.product_id from product_update as p
                left join product_update_event as e
                on p.event_id = e.id
                where e.received_at::timestamp between :'from_date' and :'to_date'
            )
        )
        TO :'export_path' DELIMITER ',' CSV HEADER;
        """
    )

    # tag rows of the same products, one CSV file per tag table
    for table_name in tag_tables:
        cmd(
            f"""
            \\set table_name {table_name}
            \\set export_path :export_dir/:table_name.csv

            COPY (
                select * from :table_name
                where product_id in
                (
                    select p.product_id from product_update as p
                    left join product_update_event as e
                    on p.event_id = e.id
                    where e.received_at::timestamp between :'from_date' and :'to_date'
                )
            )
            TO :'export_path' DELIMITER ',' CSV HEADER;
            """
        )
    return outputs
109+
110+
111+
def get_parser():
    """Build the command line parser of the export script.

    Defaults are computed when the parser is built, from the local clock
    (``dt.datetime.now()``, a naive datetime): the exported range defaults to
    the last 20 minutes, and the destination folder is named after the
    current time.

    :return: the configured ``argparse.ArgumentParser``
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    now = dt.datetime.now()
    default_start = (now - dt.timedelta(minutes=20)).strftime(timestamp_format)
    default_end = now.strftime(timestamp_format)
    default_folder = "/opt/data/exports/" + now.strftime("%Y-%m-%d_%H:%M:%S")
    parser = argparse.ArgumentParser(
        description=DESCRIPTION,
        # keep the paragraphs of DESCRIPTION as written,
        # the default formatter would merge them into a single one
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "from_date",
        default=default_start,
        type=str,
        nargs="?",
        help=(
            "Start date in iso format, e.g. 2025-11-21 11:00:00. "
            "If empty, it's now - 20 minutes"
        ),
    )
    parser.add_argument(
        "to_date",
        default=default_end,
        type=str,
        nargs="?",
        help=(
            "End date in iso format, e.g. 2025-11-21 11:20:00. "
            "If empty, it's now"
        ),
    )
    parser.add_argument(
        "--dest",
        default=default_folder,
        type=str,
        help=(
            "Target directory, it will be created if it does not exist, "
            f"defaults to {default_folder} (according to current time)"
        ),
    )
    return parser
139+
140+
141+
if __name__ == "__main__":
    # Parse the command line, build the psql script and write it to stdout,
    # so that it can be piped into psql.
    cli_args = get_parser().parse_args()
    script_parts = generate_export_script(
        cli_args.from_date, cli_args.to_date, cli_args.dest
    )
    print("\n".join(script_parts))
146+
147+

scripts/import_db_sample.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
import csv
4+
import glob
5+
import sys
6+
from textwrap import dedent as _d
7+
8+
DESCRIPTION = """
9+
Generate a script to import some data into openfoodfacts-query database,
10+
from data exported thanks to export_db_sample.py.
11+
12+
It can be piped into psql, for example:
13+
```
14+
python3 import_db_sample.py data/exports/test /opt/data/exports/test | \
15+
docker compose exec -T query_postgres psql -U productopener -d query
16+
```
17+
18+
It is intended to have data for local development.
19+
"""
20+
21+
def generate_import_script(source_dir, docker_dir):
    """Generate the SQL ``COPY ... FROM`` commands importing exported CSV files.

    Every ``*.csv`` file of ``source_dir`` is imported into the table named
    after the file (text before the first dot of the file name). The column
    list is read from the CSV header, because columns might not be in the
    same order as in the target table.

    :param source_dir: directory containing the CSV files, on the host machine
    :param docker_dir: path of that same directory as seen from the postgres
        container, used in the generated ``COPY`` commands
    :return: list of SQL fragments, meant to be joined with newlines
        and piped into psql
    """
    # local import: os is only needed by this function
    import os

    # import order: product, product_update, product_update_event,
    # then tag tables, then anything else
    ranked_suffixes = (
        ("product.csv", 0),
        ("product_update.csv", 1),
        ("product_update_event.csv", 2),
        ("_tag.csv", 10),
    )

    def priority(file_name):
        for suffix, rank in ranked_suffixes:
            if file_name.endswith(suffix):
                return rank
        return 100

    # glob returns files in arbitrary order: break priority ties on the file
    # name so that the generated script is deterministic
    paths = sorted(
        glob.glob(f"{source_dir}/*.csv"),
        key=lambda path: (priority(path), os.path.basename(path)),
    )
    container_dir = docker_dir.rstrip("/")

    outputs = []
    for path in paths:
        file_name = os.path.basename(path)
        table_name = file_name.split(".")[0]
        # only the header line is needed; the file is closed right away
        with open(path, newline="", encoding="utf-8") as csv_file:
            header = next(csv.reader(csv_file, delimiter=","), None)
        if not header:
            # nothing to import, leave a trace in the generated script
            outputs.append(f"\n-- skipped {file_name}: no header line\n")
            continue
        colnames = ",".join(header)
        # Build the container path from the file name rather than with
        # str.replace on the host path, which rewrites every occurrence of
        # source_dir (e.g. each "." when source_dir is ".").
        # Quotes are doubled as the path ends up in a SQL string literal.
        docker_path = f"{container_dir}/{file_name}".replace("'", "''")
        outputs.append(
            f"\nCOPY {table_name} ({colnames}) FROM '{docker_path}' "
            "WITH (FORMAT CSV, DELIMITER ',',HEADER MATCH);\n"
        )
    return outputs
55+
56+
def get_parser():
    """Build the command line parser of the import script.

    Two positional arguments are expected: the directory containing the CSV
    files on the host, and the path of that directory inside the postgres
    container.

    :return: the configured ``argparse.ArgumentParser``
    """
    parser = argparse.ArgumentParser(
        description=DESCRIPTION,
        # keep the paragraphs and the example command of DESCRIPTION as
        # written, the default formatter would merge them into one paragraph
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "source_dir",
        type=str,
        help="path to source directory, containing csv, on the host machine",
    )
    parser.add_argument(
        "docker_dir",
        type=str,
        help="path to source directory, containing csv in the postgres docker container",
    )
    return parser
69+
70+
if __name__ == "__main__":
    # Parse the command line, build the SQL script and write it to stdout,
    # so that it can be piped into psql.
    cli_args = get_parser().parse_args()
    script_parts = generate_import_script(cli_args.source_dir, cli_args.docker_dir)
    print("\n".join(script_parts))

0 commit comments

Comments
 (0)