Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 46 additions & 75 deletions src/openfoodfacts/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,82 +21,53 @@
# Default directory used to cache downloaded taxonomy files when the caller
# does not supply an explicit cache directory.
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "openfoodfacts" / "taxonomy"


TAXONOMY_URLS = {
Flavor.off: {
TaxonomyType.category: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/categories.full.json",
TaxonomyType.ingredient: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/ingredients.full.json",
TaxonomyType.label: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/labels.full.json",
TaxonomyType.brand: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/brands.full.json",
TaxonomyType.packaging_shape: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/packaging_shapes.full.json",
TaxonomyType.packaging_material: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/packaging_materials.full.json",
TaxonomyType.packaging_recycling: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/packaging_recycling.full.json",
TaxonomyType.country: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/countries.full.json",
TaxonomyType.store: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/stores.full.json",
TaxonomyType.nova_group: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/nova_groups.full.json",
TaxonomyType.additive: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/additives.full.json",
TaxonomyType.vitamin: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/vitamins.full.json",
TaxonomyType.mineral: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/minerals.full.json",
TaxonomyType.amino_acid: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/amino_acids.full.json",
TaxonomyType.nucleotide: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/nucleotides.full.json",
TaxonomyType.allergen: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/allergens.full.json",
TaxonomyType.state: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/states.full.json",
TaxonomyType.data_quality: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/data_quality.full.json",
TaxonomyType.origin: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/origins.full.json",
TaxonomyType.language: URLBuilder.static(Flavor.off, Environment.org)
+ "/data/taxonomies/languages.full.json",
TaxonomyType.other_nutritional_substance: URLBuilder.static(
Flavor.off, Environment.org
)
+ "/data/taxonomies/other_nutritional_substances.full.json",
},
Flavor.obf: {
TaxonomyType.category: URLBuilder.static(Flavor.obf, Environment.org)
+ "/data/taxonomies/categories.full.json",
TaxonomyType.ingredient: URLBuilder.static(Flavor.obf, Environment.org)
+ "/data/taxonomies/ingredients.full.json",
TaxonomyType.label: URLBuilder.static(Flavor.obf, Environment.org)
+ "/data/taxonomies/labels.full.json",
TaxonomyType.brand: URLBuilder.static(Flavor.obf, Environment.org)
+ "/data/taxonomies/brands.full.json",
TaxonomyType.allergen: URLBuilder.static(Flavor.obf, Environment.org)
+ "/data/taxonomies/allergens.full.json",
},
Flavor.opff: {
TaxonomyType.category: URLBuilder.static(Flavor.opff, Environment.org)
+ "/data/taxonomies/categories.full.json",
TaxonomyType.ingredient: URLBuilder.static(Flavor.opff, Environment.org)
+ "/data/taxonomies/ingredients.full.json",
},
Flavor.opf: {
TaxonomyType.category: URLBuilder.static(Flavor.opf, Environment.org)
+ "/data/taxonomies/categories.full.json",
TaxonomyType.label: URLBuilder.static(Flavor.opf, Environment.org)
+ "/data/taxonomies/labels.full.json",
TaxonomyType.brand: URLBuilder.static(Flavor.opf, Environment.org)
+ "/data/taxonomies/brands.full.json",
},
# Taxonomy types associated with each flavor.
TAXONOMY_MAPPING = {
    Flavor.off: (
        TaxonomyType.category,
        TaxonomyType.ingredient,
        TaxonomyType.label,
        TaxonomyType.brand,
        TaxonomyType.packaging_shape,
        TaxonomyType.packaging_material,
        TaxonomyType.packaging_recycling,
        TaxonomyType.country,
        TaxonomyType.store,
        TaxonomyType.nova_group,
        TaxonomyType.additive,
        TaxonomyType.vitamin,
        TaxonomyType.mineral,
        TaxonomyType.amino_acid,
        TaxonomyType.nucleotide,
        TaxonomyType.allergen,
        TaxonomyType.state,
        TaxonomyType.data_quality,
        TaxonomyType.origin,
        TaxonomyType.language,
        TaxonomyType.other_nutritional_substance,
    ),
    Flavor.obf: (
        TaxonomyType.category,
        TaxonomyType.ingredient,
        TaxonomyType.label,
        TaxonomyType.brand,
        TaxonomyType.allergen,
    ),
    Flavor.opff: (
        TaxonomyType.category,
        TaxonomyType.ingredient,
    ),
    Flavor.opf: (
        TaxonomyType.category,
        TaxonomyType.label,
        TaxonomyType.brand,
    ),
}


def _generate_file_path(taxonomy_type: TaxonomyType, flavor: Flavor) -> str:
    """Build the URL of a taxonomy JSON file.

    Despite the function's name, the result is a URL rather than a local
    path: the base URL returned by ``URLBuilder.static`` for ``flavor`` in
    the ``org`` environment, followed by the taxonomy's ``dataset_path``.

    :param taxonomy_type: the taxonomy to locate; a plain string holding a
        taxonomy type value (e.g. ``"category"``) is also accepted and is
        converted to the matching ``TaxonomyType`` member
    :param flavor: the flavor passed to ``URLBuilder.static``
    :raises ValueError: if ``taxonomy_type`` is not a valid taxonomy type
    :return: the URL of the taxonomy file
    """
    # Normalise first: a bare `str` has no `dataset_path` attribute, so a
    # string argument would otherwise fail with an AttributeError. Looking up
    # an existing member by value simply returns that member.
    taxonomy_type = TaxonomyType(taxonomy_type)
    base_url = URLBuilder.static(flavor, Environment.org)
    return f"{base_url}/{taxonomy_type.dataset_path}"


class TaxonomyNode:
"""A taxonomy element.

Expand Down Expand Up @@ -466,7 +437,7 @@ def from_type(
:param flavor: The data source, defaults to Flavor.off
:return: a Taxonomy
"""
url = TAXONOMY_URLS[flavor][TaxonomyType[taxonomy_type]]
url = _generate_file_path(taxonomy_type, flavor)
return cls.from_url(url)


Expand Down Expand Up @@ -501,7 +472,7 @@ def get_taxonomy(

cache_dir = DEFAULT_CACHE_DIR if cache_dir is None else cache_dir
taxonomy_path = cache_dir / filename
url = TAXONOMY_URLS[flavor][taxonomy_type]
url = _generate_file_path(taxonomy_type, flavor)

if not should_download_file(url, taxonomy_path, force_download, download_newer):
return Taxonomy.from_path(taxonomy_path)
Expand Down
63 changes: 41 additions & 22 deletions src/openfoodfacts/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,28 +891,47 @@ class DatasetType(str, enum.Enum):


class TaxonomyType(str, enum.Enum):
category = "category"
ingredient = "ingredient"
label = "label"
brand = "brand"
packaging_shape = "packaging_shape"
packaging_material = "packaging_material"
packaging_recycling = "packaging_recycling"
country = "country"
store = "store"
nova_group = "nova_group"
packaging = "packaging"
additive = "additive"
vitamin = "vitamin"
mineral = "mineral"
amino_acid = "amino_acid"
nucleotide = "nucleotide"
allergen = "allergen"
state = "state"
data_quality = "data_quality"
origin = "origin"
language = "language"
other_nutritional_substance = "other_nutritional_substance"
dataset_filename: str
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added to avoid mypy error

src/openfoodfacts/types.py:901: error: "TaxonomyType" has no attribute "dataset_filename"  [attr-defined]

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@raphodn I'm really not a fan of storing additional data in an Enum. TaxonomyType is used by other projects such as Robotoff, which don't really care about the dataset_filename. Could we instead have a dict mapping each TaxonomyType to its dataset_filename?
In order to ensure every TaxonomyType has an associated dataset_filename, we could add a unit test that fails in case one is missing.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aaah, I shouldn't have merged. Can I force-push to drop the commit from the develop branch and reopen the PR?

Copy link
Copy Markdown
Member Author

@raphodn raphodn May 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It'd be better if every filename mapped perfectly to the enum, but there's at least one plural exception 😅

  • no plural: packaging_recycling, packaging, data_quality
  • different plural: category, country

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok did a PR instead: #479


category = ("category", "categories.full.json")
ingredient = ("ingredient", "ingredients.full.json")
label = ("label", "labels.full.json")
brand = ("brand", "brands.full.json")
packaging_shape = ("packaging_shape", "packaging_shapes.full.json")
packaging_material = ("packaging_material", "packaging_materials.full.json")
packaging_recycling = ("packaging_recycling", "packaging_recycling.full.json")
country = ("country", "countries.full.json")
store = ("store", "stores.full.json")
nova_group = ("nova_group", "nova_groups.full.json")
packaging = ("packaging", "packaging.full.json")
additive = ("additive", "additives.full.json")
vitamin = ("vitamin", "vitamins.full.json")
mineral = ("mineral", "minerals.full.json")
amino_acid = ("amino_acid", "amino_acids.full.json")
nucleotide = ("nucleotide", "nucleotides.full.json")
allergen = ("allergen", "allergens.full.json")
state = ("state", "states.full.json")
data_quality = ("data_quality", "data_quality.full.json")
origin = ("origin", "origins.full.json")
language = ("language", "languages.full.json")
other_nutritional_substance = (
"other_nutritional_substance",
"other_nutritional_substances.full.json",
)

def __new__(cls, value: str, dataset_filename: str):
    """Create a member from its ``(value, dataset_filename)`` definition.

    Only ``value`` is used as the member's string content and enum value;
    ``dataset_filename`` is stored as a regular attribute on the member.
    """
    member = str.__new__(cls, value)
    # Set the enum value explicitly so it is the plain string rather than
    # the whole (value, dataset_filename) tuple.
    member._value_ = value
    member.dataset_filename = dataset_filename
    return member

@property
def dataset_path(self) -> str:
    """Return ``data/taxonomies/<dataset_filename>`` for this member."""
    filename = self.dataset_filename
    return f"data/taxonomies/{filename}"


class NutritionV3NutrientAggregated(BaseModel):
Expand Down
Loading