Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
b15116e
bump versions precommit and pyproject toml
atomprobe-tc Apr 8, 2026
72fe06c
constrain rosettasciio always to use the latest
atomprobe-tc Apr 8, 2026
2a2e11e
skeleton rsciio_msa
atomprobe-tc May 21, 2026
a1cb2df
Merge branch 'main' into issues_74_and_149_msa_spc_spd
atomprobe-tc May 28, 2026
c276d71
Merge branch 'main' into issues_74_and_149_msa_spc_spd
atomprobe-tc Jun 5, 2026
8bf48bf
tweaking atom type handling and bibliography handling for oasisb example
atomprobe-tc Jun 5, 2026
8baaad9
iso3_city_firstauthor naming information collected, starting mtex.h5 …
atomprobe-tc Jun 8, 2026
1a9fb4b
remove NeXus/MTex parser as this one has been updated and will be mai…
atomprobe-tc Jun 9, 2026
8998926
proper return values when file already exists and using logger
atomprobe-tc Jun 9, 2026
b3bea29
reorganize location of the code for examples related to data ingestio…
atomprobe-tc Jun 9, 2026
cef3603
spellchecking minor
atomprobe-tc Jun 9, 2026
5c83b3f
fixing batch_process queue, next steps i) test and run with msa, emd,…
atomprobe-tc Jun 9, 2026
2504460
explicit chunking for all parsers were applicable
atomprobe-tc Jun 9, 2026
35b513e
explicit chunking for all parsers were applicable
atomprobe-tc Jun 9, 2026
0e03188
Merge branch 'fairmat1_final_pm' into issues_74_and_149_msa_spc_spd
atomprobe-tc Jun 9, 2026
752b6fd
Merge branch 'main' into issues_74_and_149_msa_spc_spd
atomprobe-tc Jun 9, 2026
a0dc5b1
working version of the msa parser
atomprobe-tc Jun 9, 2026
3bfeb18
spellchecking
atomprobe-tc Jun 9, 2026
0ed6ebf
fix the issue that the running long ipynb session drain the main memo…
atomprobe-tc Jun 9, 2026
501b942
configure rosettasciio so that eds-streams are included
atomprobe-tc Jun 10, 2026
56c4704
adding cli executable script to overcome the problem of the significa…
atomprobe-tc Jun 10, 2026
985a748
adding parsing of NXuser
atomprobe-tc Jun 10, 2026
b73b733
Adding a deterministic sampling to select random subsets and then dec…
atomprobe-tc Jun 11, 2026
bd5fc36
apply the patch that so far was used only in the tiff_zeiss parser al…
atomprobe-tc Jun 11, 2026
0bc1aeb
apply the patch that so far was used only in the tiff_zeiss parser al…
atomprobe-tc Jun 11, 2026
66395e6
minor fix spellchecking to prepare merging this feature branch, furth…
atomprobe-tc Jun 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .cspell/custom-dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Apreo
Aragonit
Arfvedsonite
Arvfedsonite
BEAMKV
BYTOWNITE
Backscatter
Benedikt
Expand Down Expand Up @@ -58,6 +59,7 @@ ESBD
ESEM
ESOURCE
Elstar
Emsa
Erdmann
Erlangen
Eucentric
Expand Down Expand Up @@ -85,6 +87,7 @@ ICSD
IUPAC
IVAS
Imager
Iseconds
JEOL
JOMTAA
Kernke
Expand All @@ -107,6 +110,7 @@ Merrillite
Minicondenser
Morawiec
Mself
NCOLUMNS
NDATA
NFDI
NIAC
Expand Down Expand Up @@ -171,6 +175,7 @@ Wadsleyite
Wehnelt
Whitlockite
XSTEP
XUNITS
YBCO
YSTEP
Zirc
Expand All @@ -180,22 +185,26 @@ affil
amcsd
analysisset
angstroem
annotable
appdef
arange
asarray
atomtypes
authorships
automaticstyles
autorecover
autostem
backscatter
bibfile
bibtex
bibtexparser
bijective
bitdepth
caplog
childs
chunker
cnts
columnwidth
continously
cought
couhatched
Expand All @@ -219,6 +228,7 @@ edaxh
edgecolor
eiger
emapp
emsa
emsoft
endianness
equi
Expand Down Expand Up @@ -357,7 +367,9 @@ skiprows
sothree
spotsize
strct
stringvalue
stwo
stylename
superalloy
surrogateescape
sversion
Expand All @@ -370,6 +382,7 @@ tzone
unipd
unitless
ureg
valuetype
varphi
visititems
voronoi
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ tests/run_tests.sh
.virtual_documents
tests/data
tests/prod
examples/oasisb/04_batch_process.sh
examples/oasisb/aaa_legacy_*
examples/oasisb/openalex
examples/oasisb/*.yaml
Expand Down
2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ authors:
- given-names: Heiko B.
family-names: Weber
orcid: 'https://orcid.org/0000-0002-6403-9022'
- given-names: Christoph
- given-names: Christoph T.
family-names: Koch
orcid: 'https://orcid.org/0000-0002-3984-1523'
- given-names: Claudia
Expand Down
213 changes: 200 additions & 13 deletions examples/oasisb/02_unpack_archived_legacy_data_to_enable_parsing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,18 @@
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"import os\n",
"import sys\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import yaml\n",
"\n",
"from pynxtools_em import get_pynxtools_em_version\n",
"from pynxtools_em.examples.oasisb.oasisb_utils import (\n",
" EM_EDAX_MIME_TYPES_SIDECAR, # edax\n",
" EM_EDAX_MIME_TYPES_SOLITARY,\n",
" # APM_MIME_TYPES_SIDECAR, # only for cross-referencing between apm and em collections\n",
" # APM_MIME_TYPES_SOLITARY,\n",
" EM_HFIVE_MIME_TYPES_SIDECAR, # hdf\n",
Expand All @@ -29,6 +33,7 @@
" EM_MTEX_MIME_TYPES_SOLITARY,\n",
" get_project_id,\n",
" prepare_parsing,\n",
" prepare_parsing_via_config_file,\n",
")\n",
"\n",
"print(os.getcwd())\n",
Expand Down Expand Up @@ -71,7 +76,7 @@
" dtype=str,\n",
").fillna(\"\")\n",
"\n",
"project_range: tuple[int, int] = (360, 880)\n",
"project_range: tuple[int, int] = (1, 880)\n",
"\n",
"with open(f\"{src_directory}{os.sep}aaa_em_nomad_project_names.yaml\") as fp:\n",
" nomad_project_names: dict[str, str] = yaml.safe_load(fp)\n",
Expand All @@ -91,10 +96,10 @@
" project_id,\n",
" trg_directory,\n",
" report=True,\n",
" write=True,\n",
" mime_type=\"mixed\", # \"image\", # \"mtex\", \"hdf\", \"image\", \"mixed\"\n",
" mime_type_solitary=EM_MIXED_MIME_TYPES_SOLITARY,\n",
" mime_type_sidecar=EM_MIXED_MIME_TYPES_SIDECAR,\n",
" write=False,\n",
" mime_type=\"image\", # \"image\", # \"mtex\", \"hdf\", \"image\", \"mixed\"\n",
" mime_type_solitary=EM_IMAGE_MIME_TYPES_SOLITARY,\n",
" mime_type_sidecar=EM_IMAGE_MIME_TYPES_SIDECAR,\n",
" )\n",
" for key, obj in status.items():\n",
" if obj[\"n\"] > 0:\n",
Expand All @@ -117,7 +122,8 @@
"id": "5",
"metadata": {},
"source": [
"Programmatic identification of atom_types from file names."
"Programmatic identification of atom_types from file names. The collection has more than 300k files.<br>\n",
"So we sample at most 10 file groups (a main file plus its sidecar files) per project."
]
},
{
Expand All @@ -127,12 +133,64 @@
"metadata": {},
"outputs": [],
"source": [
"pattern = os.path.join(\n",
" f\"{trg_directory}{os.sep}049.mixed.decompressed.csv\", f\".mixed.decompressed.csv\"\n",
")\n",
"decompression_logfiles: list[str] = glob.glob(pattern)\n",
"for file in decompression_logfiles:\n",
" print(file)"
"from odf.opendocument import OpenDocumentSpreadsheet\n",
"from odf.style import Style, TableColumnProperties\n",
"from odf.table import Table, TableCell, TableColumn, TableRow\n",
"from odf.text import P\n",
"\n",
"\n",
"def make_text_cell(text):\n",
"    \"\"\"Return a string-typed table cell that holds ``text``.\n",
"\n",
"    The text is set as the ``stringvalue`` attribute of the cell and is also\n",
"    added as a paragraph child element.\n",
"    \"\"\"\n",
"    text_cell = TableCell(valuetype=\"string\")\n",
"    text_cell.setAttribute(\"stringvalue\", text)\n",
"    paragraph = P(text=text)\n",
"    text_cell.addElement(paragraph)\n",
"    return text_cell\n",
"\n",
"\n",
"def generate_human_annotable_spreadsheet(\n",
"    file_path_prefix: str,\n",
"    project_id: str,\n",
"    candidates: list[tuple[str, str]],\n",
"    mime_type: str = \"\",\n",
") -> None:\n",
"    \"\"\"Save (src, trg) pairs as a two-column OpenDocument spreadsheet.\n",
"\n",
"    The sheet is named after ``project_id``, starts with a header row\n",
"    (\"src\", \"trg\") and gets one row per entry of ``candidates``.\n",
"    The file is saved as\n",
"    ``<file_path_prefix>/<project_id>[.<mime_type>].decompressed.csv.subset.ods``;\n",
"    the ``.<mime_type>`` part is left out when ``mime_type`` is empty.\n",
"    \"\"\"\n",
"    columns = ((\"src\", 16), (\"trg\", 16))  # (header text, column width in inch)\n",
"\n",
"    document = OpenDocumentSpreadsheet()\n",
"    sheet = Table(name=f\"{project_id}\")\n",
"\n",
"    # one styled column declaration per header, added before any row\n",
"    for header, width in columns:\n",
"        column_style = Style(name=header, family=\"table-column\")\n",
"        column_style.addElement(TableColumnProperties(columnwidth=f\"{width}in\"))\n",
"        document.automaticstyles.addElement(column_style)\n",
"        sheet.addElement(TableColumn(stylename=column_style))\n",
"\n",
"    header_row = TableRow()\n",
"    for header, _ in columns:\n",
"        header_row.addElement(make_text_cell(header))\n",
"    sheet.addElement(header_row)\n",
"\n",
"    for src, trg in candidates:\n",
"        data_row = TableRow()\n",
"        for value in (src, trg):\n",
"            data_row.addElement(make_text_cell(value))\n",
"        sheet.addElement(data_row)\n",
"\n",
"    document.spreadsheet.addElement(sheet)\n",
"\n",
"    suffix = f\".{mime_type}\" if mime_type != \"\" else \"\"\n",
"    document.save(\n",
"        f\"{file_path_prefix}{os.sep}{project_id}{suffix}.decompressed.csv.subset.ods\"\n",
"    )"
]
},
{
Expand All @@ -141,7 +199,136 @@
"id": "7",
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Draw a reproducible random subset of the decompressed files for every\n",
"# *.image.decompressed.csv logfile, store the subset as a spreadsheet and hand\n",
"# that spreadsheet to prepare_parsing_via_config_file.\n",
"pattern = f\"{trg_directory}{os.sep}*.image.decompressed.csv\"\n",
"decompression_logfiles: list[str] = glob.glob(pattern)\n",
"statistics: dict[str, int] = {}\n",
"maximum: int = 10  # sample at most this many hashes (file groups) per logfile\n",
"total: int = 0\n",
"for file in sorted(decompression_logfiles):\n",
"    # the logfile name starts with the project id\n",
"    project_id = file.rsplit(os.sep, 1)[1].split(\".\", 1)[0]\n",
"    print(project_id)\n",
"\n",
"    # Group the (src, trg) pairs by a hash taken from the target file name\n",
"    # (name without its first four characters, cut at the first dot).\n",
"    # Grouping is required because sidecar files appear in the list too; by\n",
"    # convention sidecar files have the same filename stem as the main file,\n",
"    # e.g. (\"jeol.tif\", \"jeol.txt\"), (\"jeol.bmp\", \"jeol.txt\"), but there are\n",
"    # also trickier cases like (\"tescan.tif\", \"tescan-tif.hdr\").\n",
"    candidate_lookup: dict[str, list[tuple[str, str]]] = {}\n",
"    with open(file, encoding=\"utf-8\") as fp:\n",
"        for line in fp:\n",
"            parts = line.strip().split(\";\")\n",
"            if len(parts) == 5:  # lines with another field count are skipped\n",
"                hsh = parts[4].rsplit(os.sep, 1)[1][4:].split(\".\")[0]\n",
"                candidate_lookup.setdefault(hsh, []).append((parts[2], parts[4]))\n",
"\n",
"    # Sort the hashes before sampling: a set of str has no stable iteration\n",
"    # order across interpreter sessions (str hash randomization), so a seeded\n",
"    # draw from an unsorted set would not be reproducible.\n",
"    candidate_hashes = sorted(candidate_lookup)\n",
"    rng = np.random.default_rng(seed=int(project_id))  # deterministic seed\n",
"    sample_n: int = min(maximum, len(candidate_hashes))\n",
"    print(f\"sample_n {sample_n}\")\n",
"    selected_hashes = rng.choice(candidate_hashes, size=sample_n, replace=False)\n",
"\n",
"    # collect all files with the selected hashes, given the above-mentioned\n",
"    # assumptions that will include again sidecar files if present\n",
"    selected: list[tuple[str, str]] = [\n",
"        pair for key in selected_hashes for pair in candidate_lookup[key]\n",
"    ]\n",
"    total += len(selected_hashes)\n",
"\n",
"    generate_human_annotable_spreadsheet(\n",
"        trg_directory, project_id, selected, mime_type=\"image\"\n",
"    )\n",
"\n",
"    # Release the per-logfile containers before the next step. Only names that\n",
"    # are bound in every iteration are listed, so a logfile without any\n",
"    # matching line does not raise a NameError here.\n",
"    del (\n",
"        candidate_lookup,\n",
"        candidate_hashes,\n",
"        rng,\n",
"        sample_n,\n",
"        selected_hashes,\n",
"        selected,\n",
"    )\n",
"\n",
"    prepare_parsing_via_config_file(\n",
"        f\"{trg_directory}{os.sep}{project_id}.image.decompressed.csv.subset.ods\",\n",
"        src_directory,\n",
"        project_id,\n",
"        trg_directory,\n",
"        report=True,\n",
"        write=True,\n",
"        mime_type=\"image\",  # \"image\", # \"mtex\", \"hdf\", \"image\", \"mixed\"\n",
"    )\n",
"\n",
"print(f\"total {total}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8",
"metadata": {},
"outputs": [],
"source": [
"project_id = \"009\""
]
},
{
"cell_type": "markdown",
"id": "9",
"metadata": {},
"source": [
"***"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10",
"metadata": {},
"outputs": [],
"source": [
"# load sheet \"009\" of the config spreadsheet from the current working directory\n",
"# with every cell read as str\n",
"config_file = pd.read_excel(\n",
"    f\"{os.getcwd()}{os.sep}009.config.image.ods\",\n",
"    sheet_name=\"009\",\n",
"    engine=\"odf\",\n",
"    dtype=str,\n",
")\n",
"config_file = config_file.fillna(\"\")  # missing cells become empty strings\n",
"print(config_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11",
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"count: int = 0\n",
"for key, value in sorted(statistics.items(), key=lambda item: item[1], reverse=True):\n",
" print(f\"{value};{key.replace(f'''{trg_directory}{os.sep}''', '')}\")\n",
" if value > maximum:\n",
" total += maximum\n",
" else:\n",
" total += value\n",
" count += 1\n",
"\n",
"\"\"\""
]
}
],
"metadata": {
Expand Down
Loading
Loading