FAIRmat-NFDI · mkuehbach · Jun 11, 2026 · Apr 8, 2026 · Apr 8, 2026 · May 21, 2026
diff --git a/.cspell/custom-dictionary.txt b/.cspell/custom-dictionary.txt
@@ -9,6 +9,7 @@ Apreo
 Aragonit
 Arfvedsonite
 Arvfedsonite
+BEAMKV
 BYTOWNITE
 Backscatter
 Benedikt
@@ -58,6 +59,7 @@ ESBD
 ESEM
 ESOURCE
 Elstar
+Emsa
 Erdmann
 Erlangen
 Eucentric
@@ -85,6 +87,7 @@ ICSD
 IUPAC
 IVAS
 Imager
+Iseconds
 JEOL
 JOMTAA
 Kernke
@@ -107,6 +110,7 @@ Merrillite
 Minicondenser
 Morawiec
 Mself
+NCOLUMNS
 NDATA
 NFDI
 NIAC
@@ -171,6 +175,7 @@ Wadsleyite
 Wehnelt
 Whitlockite
 XSTEP
+XUNITS
 YBCO
 YSTEP
 Zirc
@@ -180,22 +185,26 @@ affil
 amcsd
 analysisset
 angstroem
+annotable
 appdef
 arange
 asarray
 atomtypes
 authorships
+automaticstyles
 autorecover
 autostem
 backscatter
 bibfile
 bibtex
 bibtexparser
 bijective
+bitdepth
 caplog
 childs
 chunker
 cnts
+columnwidth
 continously
 cought
 couhatched
@@ -219,6 +228,7 @@ edaxh
 edgecolor
 eiger
 emapp
+emsa
 emsoft
 endianness
 equi
@@ -357,7 +367,9 @@ skiprows
 sothree
 spotsize
 strct
+stringvalue
 stwo
+stylename
 superalloy
 surrogateescape
 sversion
@@ -370,6 +382,7 @@ tzone
 unipd
 unitless
 ureg
+valuetype
 varphi
 visititems
 voronoi

diff --git a/.gitignore b/.gitignore
@@ -204,6 +204,7 @@ tests/run_tests.sh
 .virtual_documents
 tests/data
 tests/prod
+examples/oasisb/04_batch_process.sh
 examples/oasisb/aaa_legacy_*
 examples/oasisb/openalex
 examples/oasisb/*.yaml

diff --git a/CITATION.cff b/CITATION.cff
@@ -22,7 +22,7 @@ authors:
   - given-names: Heiko B.
     family-names: Weber
     orcid: 'https://orcid.org/0000-0002-6403-9022'
-  - given-names: Christoph
+  - given-names: Christoph T.
     family-names: Koch
     orcid: 'https://orcid.org/0000-0002-3984-1523'
   - given-names: Claudia

diff --git a/examples/oasisb/02_unpack_archived_legacy_data_to_enable_parsing.ipynb b/examples/oasisb/02_unpack_archived_legacy_data_to_enable_parsing.ipynb
@@ -7,14 +7,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import glob\n",
     "import os\n",
     "import sys\n",
     "\n",
+    "import numpy as np\n",
     "import pandas as pd\n",
     "import yaml\n",
     "\n",
     "from pynxtools_em import get_pynxtools_em_version\n",
     "from pynxtools_em.examples.oasisb.oasisb_utils import (\n",
+    "    EM_EDAX_MIME_TYPES_SIDECAR,  # edax\n",
+    "    EM_EDAX_MIME_TYPES_SOLITARY,\n",
     "    # APM_MIME_TYPES_SIDECAR,  # only for cross-referencing between apm and em collections\n",
     "    # APM_MIME_TYPES_SOLITARY,\n",
     "    EM_HFIVE_MIME_TYPES_SIDECAR,  # hdf\n",
@@ -29,6 +33,7 @@
     "    EM_MTEX_MIME_TYPES_SOLITARY,\n",
     "    get_project_id,\n",
     "    prepare_parsing,\n",
+    "    prepare_parsing_via_config_file,\n",
     ")\n",
     "\n",
     "print(os.getcwd())\n",
@@ -71,7 +76,7 @@
     "    dtype=str,\n",
     ").fillna(\"\")\n",
     "\n",
-    "project_range: tuple[int, int] = (360, 880)\n",
+    "project_range: tuple[int, int] = (1, 880)\n",
     "\n",
     "with open(f\"{src_directory}{os.sep}aaa_em_nomad_project_names.yaml\") as fp:\n",
     "    nomad_project_names: dict[str, str] = yaml.safe_load(fp)\n",
@@ -91,10 +96,10 @@
     "                        project_id,\n",
     "                        trg_directory,\n",
     "                        report=True,\n",
-    "                        write=True,\n",
-    "                        mime_type=\"mixed\",  # \"image\",  # \"mtex\", \"hdf\", \"image\", \"mixed\"\n",
-    "                        mime_type_solitary=EM_MIXED_MIME_TYPES_SOLITARY,\n",
-    "                        mime_type_sidecar=EM_MIXED_MIME_TYPES_SIDECAR,\n",
+    "                        write=False,\n",
+    "                        mime_type=\"image\",  # \"image\",  # \"mtex\", \"hdf\", \"image\", \"mixed\"\n",
+    "                        mime_type_solitary=EM_IMAGE_MIME_TYPES_SOLITARY,\n",
+    "                        mime_type_sidecar=EM_IMAGE_MIME_TYPES_SIDECAR,\n",
     "                    )\n",
     "                    for key, obj in status.items():\n",
     "                        if obj[\"n\"] > 0:\n",
@@ -117,7 +122,8 @@
    "id": "5",
    "metadata": {},
    "source": [
-    "Programmatic identification of atom_types from file names."
+    "Programmatic identification of atom_types from file names. The collection has more than 300k files.<br>\n",
+    "So we sample at most 10 hashes per project and collect all files, including sidecar files, that share a sampled hash."
    ]
   },
   {
@@ -127,12 +133,64 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "pattern = os.path.join(\n",
-    "    f\"{trg_directory}{os.sep}049.mixed.decompressed.csv\", f\".mixed.decompressed.csv\"\n",
-    ")\n",
-    "decompression_logfiles: list[str] = glob.glob(pattern)\n",
-    "for file in decompression_logfiles:\n",
-    "    print(file)"
+    "from odf.opendocument import OpenDocumentSpreadsheet\n",
+    "from odf.style import Style, TableColumnProperties\n",
+    "from odf.table import Table, TableCell, TableColumn, TableRow\n",
+    "from odf.text import P\n",
+    "\n",
+    "\n",
+    "def make_text_cell(text):\n",
+    "    cell = TableCell(valuetype=\"string\")\n",
+    "    cell.setAttribute(\"stringvalue\", text)\n",
+    "    cell.addElement(P(text=text))\n",
+    "    return cell\n",
+    "\n",
+    "\n",
+    "def generate_human_annotable_spreadsheet(\n",
+    "    file_path_prefix: str,\n",
+    "    project_id: str,\n",
+    "    candidates: list[tuple[str, str]],\n",
+    "    mime_type: str = \"\",\n",
+    ") -> None:\n",
+    "    doc = OpenDocumentSpreadsheet()\n",
+    "\n",
+    "    table = Table(name=f\"{project_id}\")\n",
+    "\n",
+    "    row = TableRow()\n",
+    "    for col_name, col_width in [\n",
+    "        # (\"atom_types\", 10),\n",
+    "        (\"src\", 16),\n",
+    "        (\"trg\", 16),\n",
+    "    ]:\n",
+    "        col_style = Style(name=col_name, family=\"table-column\")\n",
+    "        col_style.addElement(TableColumnProperties(columnwidth=f\"{col_width}in\"))\n",
+    "        doc.automaticstyles.addElement(col_style)\n",
+    "        table.addElement(TableColumn(stylename=col_style))\n",
+    "\n",
+    "        # cell = TableCell()\n",
+    "        # cell.addElement()  # P(text=col_name))\n",
+    "        row.addElement(make_text_cell(col_name))\n",
+    "    table.addElement(row)\n",
+    "\n",
+    "    for src, trg in candidates:\n",
+    "        row = TableRow()\n",
+    "\n",
+    "        # cell = TableCell()\n",
+    "        # cell.addElement(P(text=\"\"))\n",
+    "        # row.addElement(cell)\n",
+    "        # cell = TableCell()\n",
+    "        # cell.addElement(make_text_cell(src))  # P(text=src))\n",
+    "        row.addElement(make_text_cell(src))  # cell)\n",
+    "        # cell = TableCell()\n",
+    "        # cell.addElement(make_text_cell(trg))  # P(text=trg))\n",
+    "        row.addElement(make_text_cell(trg))  # cell)\n",
+    "\n",
+    "        table.addElement(row)\n",
+    "\n",
+    "    doc.spreadsheet.addElement(table)\n",
+    "    doc.save(\n",
+    "        f\"{file_path_prefix}{os.sep}{project_id}{f'''.{mime_type}''' if mime_type != '' else ''}.decompressed.csv.subset.ods\"\n",
+    "    )"
    ]
   },
   {
@@ -141,7 +199,136 @@
    "id": "7",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# pattern = os.path.join(f\"{trg_directory}{os.sep}049.mixed.decompressed.csv\")\n",
+    "pattern = os.path.join(f\"{trg_directory}{os.sep}*.image.decompressed.csv\")\n",
+    "decompression_logfiles: list[str] = glob.glob(pattern)\n",
+    "statistics: dict[str, int] = {}\n",
+    "maximum: int = 10\n",
+    "total: int = 0\n",
+    "for file in sorted(decompression_logfiles):\n",
+    "    project_id = file.rsplit(os.sep, 1)[1].split(\".\", 1)[0]\n",
+    "    print(project_id)\n",
+    "    with open(file, encoding=\"utf-8\") as fp:\n",
+    "        # statistics[file] = sum(1 for _ in fp) - 3  # three header lines\n",
+    "        # fp.seek(0)\n",
+    "\n",
+    "        # hashing required because of sidecar files appearing also in the list\n",
+    "        # by convention sidecar files have the same filename stem as the main file\n",
+    "        # e.g. (\"jeol.tif\", \"jeol.txt\"), (\"jeol.bmp\", \"jeol.txt\") but there are also trickier cases\n",
+    "        # like these (\"tescan.tif\", \"tescan-tif.hdr\")\n",
+    "        candidate_lookup: dict[str, list[tuple[str, str]]] = {}\n",
+    "        hashes: set[str] = set()\n",
+    "        for line in fp:\n",
+    "            parts = line.strip().split(\";\")\n",
+    "            if len(parts) == 5:\n",
+    "                hsh = parts[4].rsplit(os.sep, 1)[1][4:].split(\".\")[0]\n",
+    "                hashes.add(hsh)\n",
+    "                if hsh not in candidate_lookup:\n",
+    "                    candidate_lookup[hsh] = [(parts[2], parts[4])]\n",
+    "                else:\n",
+    "                    candidate_lookup[hsh].append((parts[2], parts[4]))\n",
+    "\n",
+    "        rng = np.random.default_rng(seed=int(project_id))  # deterministic seed\n",
+    "        candidate_hashes = list(hashes)\n",
+    "        sample_n: int = (\n",
+    "            maximum if len(candidate_hashes) > maximum else len(candidate_hashes)\n",
+    "        )\n",
+    "        print(f\"sample_n {sample_n}\")\n",
+    "        selected_hashes = rng.choice(list(hashes), size=sample_n, replace=False)\n",
+    "\n",
+    "        # collect all files with the selected hashes, given the above-mentioned assumptions\n",
+    "        # that will include again sidecar files if present\n",
+    "        selected: list[tuple[str, str]] = []\n",
+    "        for hsh in selected_hashes:\n",
+    "            for src, trg in candidate_lookup[hsh]:\n",
+    "                selected.append((src, trg))\n",
+    "                # print(f\">>>>{src}\")\n",
+    "                # print(f\"<<<<{trg}\")\n",
+    "        total += len(selected_hashes)\n",
+    "\n",
+    "        generate_human_annotable_spreadsheet(\n",
+    "            trg_directory, project_id, selected, mime_type=\"image\"\n",
+    "        )\n",
+    "\n",
+    "        del (\n",
+    "            candidate_lookup,\n",
+    "            hashes,\n",
+    "            parts,\n",
+    "            hsh,\n",
+    "            rng,\n",
+    "            candidate_hashes,\n",
+    "            sample_n,\n",
+    "            selected_hashes,\n",
+    "            selected,\n",
+    "        )\n",
+    "\n",
+    "        prepare_parsing_via_config_file(\n",
+    "            f\"{trg_directory}{os.sep}{project_id}.image.decompressed.csv.subset.ods\",\n",
+    "            src_directory,\n",
+    "            project_id,\n",
+    "            trg_directory,\n",
+    "            report=True,\n",
+    "            write=True,\n",
+    "            mime_type=\"image\",  # \"image\",  # \"mtex\", \"hdf\", \"image\", \"mixed\"\n",
+    "        )\n",
+    "\n",
+    "print(f\"total {total}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project_id = \"009\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9",
+   "metadata": {},
+   "source": [
+    "***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config_file = pd.read_excel(\n",
+    "    f\"{os.getcwd()}{os.sep}009.config.image.ods\",\n",
+    "    sheet_name=\"009\",\n",
+    "    engine=\"odf\",\n",
+    "    dtype=str,\n",
+    ").fillna(\"\")\n",
+    "print(config_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "count: int = 0\n",
+    "for key, value in sorted(statistics.items(), key=lambda item: item[1], reverse=True):\n",
+    "    print(f\"{value};{key.replace(f'''{trg_directory}{os.sep}''', '')}\")\n",
+    "    if value > maximum:\n",
+    "        total += maximum\n",
+    "    else:\n",
+    "        total += value\n",
+    "    count += 1\n",
+    "\n",
+    "\"\"\""
+   ]
   }
  ],
  "metadata": {