Skip to content

Commit d858ec1

Browse files
committed
feat(hub): expose 14 more DuckDB tables + fix edge-proxy healthcheck
DuckDB browser
--------------
GME 2.1.0rc1 ships 14 high-value tables the hub didn't surface yet:

- openalex.work_github_urls (1.1k rows) — per-paper GitHub link that
  bridges OpenAlex works to repos already in our github index.
- openalex.work_authors / work_institutions / work_references — the
  relational joins that let the row browser pivot from a work to its
  co-authors, host institutions, or downstream citers.
- snsf.output_publications / output_datasets / output_academic_events /
  output_knowledge_transfers / output_public_communications /
  output_collaborations / output_use_inspired — every grant output keyed
  by grant_number so consumers can chain back to the funding source.
- snsf.persons (146k) — the researcher registry every grant + output
  references.
- swissubase.persons + swissubase.institutions — the author / org sides
  of the studies table, browsable independently.
- epfl_graph.category_concepts (40k) — the discipline → concept edges
  underlying the EPFL Graph taxonomy.

All sit in _AUTO_TABLES + _AUTO_SEARCH_EXAMPLES; the heuristic
_AUTO_STAT_PATTERNS picks contextual stats per table automatically.

Container limits
----------------
HUB_MEM_LIMIT bumped from 512m to 2g. Default 512m was enough for the
original six DuckDBs but the new inventory keeps ~14 more files open in
DuckDB connection state; the cumulative working set blew past 512m and
the kernel OOM-killed the hub mid-request on the first sweep of the new
collections. 2g matches the openalex working set and leaves headroom for
the SQLite history db + Jinja2 cache.

Edge-proxy healthcheck
----------------------
wget --spider http://localhost:80/ followed Caddy's permanent HTTP→HTTPS
redirect into a real TLS handshake and choked on SNI (localhost ≠
openpulse.epfl.ch). Every check failed for hours; the container was
marked unhealthy even though external traffic worked fine. Hit Caddy's
admin API on :2019 instead — it answers in plain HTTP and only when the
server process is alive.
1 parent b75a8f7 commit d858ec1

2 files changed

Lines changed: 106 additions & 1 deletion

File tree

infra/open-pulse-stack/docker-compose.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,13 @@ services:
461461
- ${OPEN_PULSE_DATA_DIR:-./data}/edge-proxy/data:/data
462462
- ${OPEN_PULSE_DATA_DIR:-./data}/edge-proxy/config:/config
463463
healthcheck:
464-
test: ["CMD-SHELL", "wget -q --spider http://localhost:80/ || exit 1"]
464+
# Caddy serves a permanent 308 HTTP→HTTPS redirect on :80 which
465+
# wget --spider follows into a real TLS handshake and chokes on
466+
# SNI (localhost ≠ openpulse.epfl.ch) — every check failed even
467+
# though external traffic worked fine. Hit Caddy's admin API on
468+
# :2019 instead: it returns the running config as JSON over plain
469+
# HTTP and only answers when the server process is alive.
470+
test: ["CMD-SHELL", "wget -q -O- http://localhost:2019/config/ >/dev/null || exit 1"]
465471
interval: 15s
466472
timeout: 5s
467473
retries: 5

src/open_pulse/gui/hub/knowledge/duckdb_browser.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,11 +321,39 @@ class Backing:
321321
_DATA_ROOT / "index/openalex/duckdb/openalex.duckdb",
322322
"works",
323323
),
324+
# OpenAlex relational + GitHub-extracted tables. ``work_github_urls``
325+
# is the highest-signal addition for cross-corpus joins — every row
326+
# is a (research paper, GitHub repo) edge our pipeline can use to
327+
# anchor a Person's contributions to their published work.
328+
"openalex_work_authors": (
329+
_DATA_ROOT / "index/openalex/duckdb/openalex.duckdb",
330+
"work_authors",
331+
),
332+
"openalex_work_institutions": (
333+
_DATA_ROOT / "index/openalex/duckdb/openalex.duckdb",
334+
"work_institutions",
335+
),
336+
"openalex_work_references": (
337+
_DATA_ROOT / "index/openalex/duckdb/openalex.duckdb",
338+
"work_references",
339+
),
340+
"openalex_work_github_urls": (
341+
_DATA_ROOT / "index/openalex/duckdb/openalex.duckdb",
342+
"work_github_urls",
343+
),
324344
# EPFL Graph
325345
"epfl_graph_disciplines": (
326346
_DATA_ROOT / "index/epfl_graph/duckdb/epfl_graph.duckdb",
327347
"categories",
328348
),
349+
# 39k discipline → concept edges. Browses the same EPFL discipline
350+
# taxonomy as ``epfl_graph_disciplines`` but at the concept level
351+
# (Wikipedia-grounded sub-topics), so the row count matches the
352+
# number of (category, concept) pairs.
353+
"epfl_graph_concepts": (
354+
_DATA_ROOT / "index/epfl_graph/duckdb/epfl_graph.duckdb",
355+
"category_concepts",
356+
),
329357
# ETH-Z Research Collection
330358
"ethz_research_collection_articles": (
331359
_DATA_ROOT
@@ -429,11 +457,57 @@ class Backing:
429457
_DATA_ROOT / "index/snsf/duckdb/snsf.duckdb",
430458
"grants",
431459
),
460+
# SNSF — research outputs broken out per type. Every row is keyed
461+
# by ``grant_number`` so the row browser can pivot from any output
462+
# back to the funding grant that produced it. ``persons`` is the
463+
# 146k researcher registry every grant + output references.
464+
"snsf_persons": (_DATA_ROOT / "index/snsf/duckdb/snsf.duckdb", "persons"),
465+
"snsf_output_publications": (
466+
_DATA_ROOT / "index/snsf/duckdb/snsf.duckdb",
467+
"output_publications",
468+
),
469+
"snsf_output_datasets": (
470+
_DATA_ROOT / "index/snsf/duckdb/snsf.duckdb",
471+
"output_datasets",
472+
),
473+
"snsf_output_academic_events": (
474+
_DATA_ROOT / "index/snsf/duckdb/snsf.duckdb",
475+
"output_academic_events",
476+
),
477+
"snsf_output_knowledge_transfers": (
478+
_DATA_ROOT / "index/snsf/duckdb/snsf.duckdb",
479+
"output_knowledge_transfers",
480+
),
481+
"snsf_output_public_communications": (
482+
_DATA_ROOT / "index/snsf/duckdb/snsf.duckdb",
483+
"output_public_communications",
484+
),
485+
"snsf_output_collaborations": (
486+
_DATA_ROOT / "index/snsf/duckdb/snsf.duckdb",
487+
"output_collaborations",
488+
),
489+
"snsf_output_use_inspired": (
490+
_DATA_ROOT / "index/snsf/duckdb/snsf.duckdb",
491+
"output_use_inspired",
492+
),
432493
# SwissUBase — studies is the largest non-empty table.
433494
"swissubase_entities": (
434495
_DATA_ROOT / "index/swissubase/duckdb/swissubase.duckdb",
435496
"studies",
436497
),
498+
# SwissUBase author + institution registries. Cross-walked into the
499+
# studies via ``study_persons`` / ``study_institutions`` join
500+
# tables — kept here for direct lookup ("which Swiss social-sciences
501+
# ORCIDs appear in our corpus?", "which institutions submit studies
502+
# via SwissUBase?").
503+
"swissubase_persons": (
504+
_DATA_ROOT / "index/swissubase/duckdb/swissubase.duckdb",
505+
"persons",
506+
),
507+
"swissubase_institutions": (
508+
_DATA_ROOT / "index/swissubase/duckdb/swissubase.duckdb",
509+
"institutions",
510+
),
437511
# Zenodo — ``zenodo_records`` is hand-tuned in ``_BACKING`` above
438512
# (it joins ``record_communities`` to expose a per-record community
439513
# list). ``communities`` + ``creators`` are exposed here so the
@@ -461,8 +535,13 @@ class Backing:
461535
"institutions": ("EPFL", "ETH", "MIT", "Switzerland"),
462536
"sources": ("Nature", "Science", "IEEE", "ACM"),
463537
"topics": ("artificial intelligence", "genetics", "climate", "robotics"),
538+
"openalex_work_authors": ("W2", "A5", "first", "corresponding"),
539+
"openalex_work_institutions": ("EPFL", "ETH", "MIT", "I"),
540+
"openalex_work_references": ("W2", "W3", "W4", "W5"),
541+
"openalex_work_github_urls": ("torvalds", "tensorflow", "pytorch", "epfl"),
464542
# EPFL Graph
465543
"epfl_graph_disciplines": ("physics", "computer", "biology", "mathematics"),
544+
"epfl_graph_concepts": ("machine", "neural", "protein", "quantum"),
466545
# ETH-Z Research Collection
467546
"ethz_research_collection_articles": (
468547
"deep learning",
@@ -501,8 +580,28 @@ class Backing:
501580
"snsf_epfl": ("EPFL", "machine learning", "physics", "Lausanne"),
502581
"snsf_ethz": ("ETH", "Zurich", "robotics", "quantum"),
503582
"snsf_switzerland": ("Swiss", "professor", "biology", "chemistry"),
583+
"snsf_persons": ("Patrick", "Müller", "EPFL", "ETH"),
584+
"snsf_output_publications": (
585+
"machine learning", "quantum", "nature", "epfl",
586+
),
587+
"snsf_output_datasets": ("dataset", "zenodo", "10.5281", "swiss"),
588+
"snsf_output_academic_events": ("conference", "workshop", "ICML", "NeurIPS"),
589+
"snsf_output_knowledge_transfers": (
590+
"patent", "spin-off", "industry", "transfer",
591+
),
592+
"snsf_output_public_communications": (
593+
"media", "talk", "press", "interview",
594+
),
595+
"snsf_output_collaborations": (
596+
"Switzerland", "Germany", "United States", "industry",
597+
),
598+
"snsf_output_use_inspired": ("application", "industry", "use", "EPFL"),
504599
# SwissUBase
505600
"swissubase_entities": ("survey", "FORS", "Switzerland", "households"),
601+
"swissubase_persons": ("Müller", "Schmidt", "Patrick", "Anna"),
602+
"swissubase_institutions": (
603+
"FORS", "Lausanne", "Bern", "Switzerland",
604+
),
506605
# Zenodo
507606
"zenodo_records": ("dataset", "epfl", "10.5281", "machine learning"),
508607
"zenodo_communities": ("epfl", "swiss", "open", "research"),

0 commit comments

Comments
 (0)