Skip to content

Commit 888210c

Browse files
committed
Taxonomy filters
1 parent 8361f6a commit 888210c

7 files changed

Lines changed: 243 additions & 42 deletions

File tree

CLAUDE.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# jb2hubs
2+
3+
Monorepo that converts UCSC GenArk and UCSC browser hubs into JBrowse 2 configs, and serves them via a static website.
4+
5+
## Packages
6+
7+
- `website/` — Astro + React static site (pages: search, recently-updated, accession, taxonomy, hubs, synteny, etc.)
8+
- `genark2jbrowse/` — scripts + TS that process GenArk hubs into JBrowse configs
9+
- `ucsc2jbrowse/` — scripts + TS that convert UCSC track hubs into JBrowse configs
10+
- `hubtools/` — shared TS library used by the converter packages
11+
12+
## Key website internals
13+
14+
- `src/components/SearchPage.tsx` — client-side search over `public/searchIndex.json`
15+
- `src/pages/recently-updated.astro` — server-rendered table with category dropdown filter
16+
- `src/hooks/useSearchIndex.ts` — SWR fetch of the search index; `IndexEntry = [accession, commonName, scientificName, assemblyName, assemblyStatus, source, taxonId]`
17+
- `src/recentlyUpdated.json` — build-time generated data for recently-updated page

website/generateTaxonomyFilter.ts

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import fs from 'fs'
2+
import readline from 'readline'
3+
import path from 'path'
4+
import { fileURLToPath } from 'url'
5+
import { execSync } from 'child_process'
6+
7+
// ES modules have no built-in __filename/__dirname, so derive them from the
// module URL.
const __filename = fileURLToPath(import.meta.url)
const __dirname = path.dirname(__filename)

// Both the input search index and the generated filter live in public/.
const publicDir = path.join(__dirname, 'public')

// Taxonomy node dump read by this script (fetched on demand when missing).
const NODES_DMP_PATH = path.join(__dirname, 'taxonomyBuilder', 'nodes.dmp')
// Input: search index whose entries carry a taxon id in their last slot.
const SEARCH_INDEX_PATH = path.join(publicDir, 'searchIndex.json')
// Output: JSON object keyed by clade label.
const OUTPUT_PATH = path.join(publicDir, 'taxonomyFilter.json')

// Root taxon id for each clade that gets a precomputed membership list.
// `label` becomes the key in the output JSON. Ordered from specific to broad
// for display purposes.
const CURATED_CLADES = [
  { label: 'Mammalia', taxonId: 40674 },
  { label: 'Aves', taxonId: 8782 },
  { label: 'Actinopterygii', taxonId: 7898 },
  { label: 'Vertebrata', taxonId: 7742 },
  { label: 'Viridiplantae', taxonId: 33090 },
  { label: 'Fungi', taxonId: 4751 },
  { label: 'Bacteria', taxonId: 2 },
  { label: 'Archaea', taxonId: 2157 },
  { label: 'Viruses', taxonId: 10239 },
]
26+
27+
/**
 * Make sure the taxonomy `nodes.dmp` file exists at NODES_DMP_PATH.
 *
 * Returns immediately when the file is already present. Otherwise runs a
 * small `python3` script that downloads `taxdmp.zip` from the NCBI FTP site
 * and extracts only `nodes.dmp` into the directory of NODES_DMP_PATH.
 *
 * @throws whatever `execSync` throws, e.g. when `python3` is unavailable or
 *   the script exits with a non-zero status.
 */
function ensureNodesDmp() {
  if (fs.existsSync(NODES_DMP_PATH)) {
    return
  }
  console.log('nodes.dmp not found, downloading from NCBI...')
  // The destination directory is handed to Python through the environment
  // rather than spliced into the command text. That way a path containing
  // quotes, `$`, backticks or backslashes cannot break (or inject into) the
  // shell command or the Python string literal.
  execSync(
    `python3 -c "
import os, urllib.request, zipfile, io
print('Downloading taxdmp.zip from NCBI...')
data = urllib.request.urlopen('https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip').read()
z = zipfile.ZipFile(io.BytesIO(data))
z.extract('nodes.dmp', os.environ['NODES_DMP_DEST_DIR'])
print('Extracted nodes.dmp')
"`,
    {
      stdio: 'inherit',
      env: {
        ...process.env,
        NODES_DMP_DEST_DIR: path.dirname(NODES_DMP_PATH),
      },
    },
  )
}
45+
46+
/**
 * Stream a `nodes.dmp`-style file and build a parent → children lookup.
 *
 * Every line is split on the `\t|\t` field separator. The first field is
 * parsed as the node's taxon id and the second as its parent's taxon id.
 * Lines with fewer than two fields, with non-numeric ids, or where a node is
 * listed as its own parent are skipped.
 *
 * @param filePath path of the file to read
 * @returns map from a parent taxon id to its direct child ids, in file order
 */
async function buildChildrenMap(filePath: string): Promise<Map<number, number[]>> {
  const childrenMap = new Map<number, number[]>()
  const input = fs.createReadStream(filePath)
  const rl = readline.createInterface({ input, crlfDelay: Infinity })
  try {
    for await (const line of rl) {
      // Only the first two fields are used, so cap the split instead of
      // allocating every remaining column for each line of a large file.
      const [taxField, parentField] = line.split('\t|\t', 2)
      if (taxField === undefined || parentField === undefined) {
        continue
      }
      // Explicit radix: ids are decimal and must never be read as hex.
      const taxId = Number.parseInt(taxField, 10)
      const parentId = Number.parseInt(parentField, 10)
      if (Number.isNaN(taxId) || Number.isNaN(parentId) || taxId === parentId) {
        continue
      }
      const siblings = childrenMap.get(parentId)
      if (siblings) {
        siblings.push(taxId)
      } else {
        childrenMap.set(parentId, [taxId])
      }
    }
  } finally {
    // Release the reader and the underlying file handle even when the loop
    // exits early.
    rl.close()
    input.destroy()
  }
  return childrenMap
}
71+
72+
/**
 * Gather `rootId` together with every id reachable below it.
 *
 * Walks `childrenMap` depth-first with an explicit stack (no recursion). Ids
 * that were already visited are skipped, so a cycle in the map cannot loop
 * forever.
 *
 * @param rootId id to start from; always part of the returned set
 * @param childrenMap parent id → direct child ids
 * @returns the set containing `rootId` and all of its descendants
 */
function collectDescendants(
  rootId: number,
  childrenMap: Map<number, number[]>,
): Set<number> {
  const seen = new Set<number>()
  const pending: number[] = [rootId]
  for (let node = pending.pop(); node !== undefined; node = pending.pop()) {
    if (!seen.has(node)) {
      seen.add(node)
      for (const kid of childrenMap.get(node) ?? []) {
        pending.push(kid)
      }
    }
  }
  return seen
}
93+
94+
// Step 1: make sure the taxonomy dump is available locally.
ensureNodesDmp()

// Step 2: load the search index and collect the distinct positive taxon ids
// found in the last slot of each entry.
const searchIndexText = fs.readFileSync(SEARCH_INDEX_PATH, 'utf-8')
const searchIndex = JSON.parse(searchIndexText) as [
  string,
  string,
  string,
  string,
  string,
  string,
  number,
][]

const indexTaxonIds = new Set<number>()
for (const entry of searchIndex) {
  const taxonId = entry[6]
  if (taxonId > 0) {
    indexTaxonIds.add(taxonId)
  }
}
console.log(`Search index: ${searchIndex.length} entries, ${indexTaxonIds.size} unique taxonIds`)

// Step 3: build the parent → children lookup from nodes.dmp.
console.log('Parsing nodes.dmp...')
const childrenMap = await buildChildrenMap(NODES_DMP_PATH)
console.log(`Built children map with ${childrenMap.size} parent nodes`)

// Step 4: for every curated clade, keep only the taxon ids that actually occur
// in the search index and sit at or below the clade's root.
const result: Record<string, number[]> = {}
for (const { label, taxonId } of CURATED_CLADES) {
  const allDescendants = collectDescendants(taxonId, childrenMap)
  const members = [...indexTaxonIds].filter(id => allDescendants.has(id))
  result[label] = members
  console.log(`  ${label} (taxonId ${taxonId}): ${members.length} matching taxonIds`)
}

// Step 5: write the filter and report its size on disk.
fs.writeFileSync(OUTPUT_PATH, JSON.stringify(result))
const outputBytes = fs.statSync(OUTPUT_PATH).size
const sizeKB = (outputBytes / 1024).toFixed(0)
console.log(`Wrote ${OUTPUT_PATH} (${sizeKB} KB)`)

website/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
"generate-taxonomy": "cd taxonomyBuilder && ./generate_taxonomy.sh",
99
"generate-recently-updated": "node generateRecentlyUpdated.ts",
1010
"generate-search-index": "node generateSearchIndex.ts",
11-
"prebuild": "yarn clean && yarn generate-taxonomy && yarn generate-recently-updated && yarn generate-search-index",
11+
"generate-taxonomy-filter": "node generateTaxonomyFilter.ts",
12+
"prebuild": "yarn clean && yarn generate-taxonomy && yarn generate-recently-updated && yarn generate-search-index && yarn generate-taxonomy-filter",
1213
"predev": "aws s3 sync s3://jbrowse.org/processedHubJson/ processedHubJson/",
1314
"dev": "astro dev",
1415
"build": "astro build",

website/public/taxonomyFilter.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

website/src/components/SearchPage.module.css

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
11
.searchWrapper {
2-
position: relative;
32
display: inline-flex;
43
align-items: center;
5-
max-width: 400px;
4+
gap: 0.5rem;
65
margin-bottom: 1rem;
76
}
87

8+
.inputWrapper {
9+
position: relative;
10+
display: inline-flex;
11+
align-items: center;
12+
}
13+
914
.searchIcon {
1015
position: absolute;
1116
left: 0.5rem;
@@ -17,7 +22,7 @@
1722
padding: 0.5rem 2.5rem 0.5rem 2rem;
1823
border: 1px solid #ccc;
1924
border-radius: 0.25rem;
20-
max-width: 400px;
25+
width: 400px;
2126
}
2227

2328
.clearButton {
@@ -30,6 +35,13 @@
3035
color: #666;
3136
}
3237

38+
.categorySelect {
39+
padding: 4px 8px;
40+
font-size: 0.9rem;
41+
border: 1px solid #ccc;
42+
border-radius: 0.25rem;
43+
}
44+
3345
.resultCount {
3446
margin-bottom: 0.5rem;
3547
font-size: 0.9rem;

website/src/components/SearchPage.tsx

Lines changed: 61 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,14 @@ import { Search } from 'lucide-react'
44

55
import styles from './SearchPage.module.css'
66
import { useSearchIndex } from '../hooks/useSearchIndex.ts'
7+
import { useTaxonomyFilter, CURATED_CLADES } from '../hooks/useTaxonomyFilter.ts'
78

89
import type { IndexEntry } from '../hooks/useSearchIndex.ts'
910

1011
const PAGE_SIZE = 100
1112

12-
function getQueryFromURL() {
13-
if (typeof window === 'undefined') {
14-
return ''
15-
}
16-
return new URLSearchParams(window.location.search).get('q') ?? ''
13+
/**
 * Read one query-string parameter from the current page URL.
 *
 * Returns '' when the parameter is absent. Also returns '' when there is no
 * `window` global (i.e. the code is running outside a browser), so calling it
 * from a `useState` initializer cannot throw a ReferenceError there.
 *
 * @param key name of the query parameter to read
 */
function getURLParam(key: string) {
  if (typeof window === 'undefined') {
    return ''
  }
  return new URLSearchParams(window.location.search).get(key) ?? ''
}
1816

1917
function scoreTerm(term: string, field: string) {
@@ -82,11 +80,9 @@ function highlightMatch(text: string, query: string) {
8280
.split(/\s+/)
8381
.filter(Boolean)
8482
.map(t => t.replaceAll(/[.*+?^${}()|[\]\\]/g, '\\$&'))
85-
const regex = new RegExp(`(${terms.join('|')})`, 'gi')
86-
const parts = text.split(regex)
87-
return parts.map((part, i) =>
88-
regex.test(part) ? <mark key={i}>{part}</mark> : part,
89-
)
83+
return text
84+
.split(new RegExp(`(${terms.join('|')})`, 'gi'))
85+
.map((part, i) => (i % 2 === 1 ? <mark key={i}>{part}</mark> : part))
9086
}
9187

9288
function entryHref(entry: IndexEntry) {
@@ -95,7 +91,9 @@ function entryHref(entry: IndexEntry) {
9591

9692
export default function SearchPage() {
9793
const { index, loading } = useSearchIndex()
98-
const [query, setQuery] = useState(getQueryFromURL)
94+
const cladeSets = useTaxonomyFilter()
95+
const [query, setQuery] = useState(() => getURLParam('q'))
96+
const [clade, setClade] = useState(() => getURLParam('clade'))
9997
const [page, setPage] = useState(0)
10098

10199
useEffect(() => {
@@ -105,28 +103,34 @@ export default function SearchPage() {
105103
} else {
106104
url.searchParams.delete('q')
107105
}
106+
if (clade) {
107+
url.searchParams.set('clade', clade)
108+
} else {
109+
url.searchParams.delete('clade')
110+
}
108111
window.history.replaceState({}, '', url.toString())
109-
}, [query])
110-
111-
useEffect(() => {
112112
setPage(0)
113-
}, [query])
113+
}, [query, clade])
114114

115115
const results = useMemo(() => {
116116
const terms = query.trim().toLowerCase().split(/\s+/).filter(Boolean)
117117
if (terms.length === 0) {
118118
return []
119119
}
120+
const cladeSet = clade && cladeSets ? cladeSets.get(clade) : undefined
120121
const scored: { entry: IndexEntry; score: number }[] = []
121122
for (const entry of index) {
123+
if (cladeSet && !cladeSet.has(entry[6])) {
124+
continue
125+
}
122126
const score = scoreEntry(entry, terms)
123127
if (score >= 0) {
124128
scored.push({ entry, score })
125129
}
126130
}
127131
scored.sort((a, b) => b.score - a.score)
128132
return scored.map(s => s.entry)
129-
}, [index, query])
133+
}, [index, query, clade, cladeSets])
130134

131135
const pageCount = Math.max(1, Math.ceil(results.length / PAGE_SIZE))
132136
const clampedPage = Math.min(page, pageCount - 1)
@@ -142,30 +146,47 @@ export default function SearchPage() {
142146
return (
143147
<div>
144148
<div className={styles.searchWrapper}>
145-
<Search size={16} className={styles.searchIcon} />
146-
<input
147-
type="text"
148-
value={query}
149+
<div className={styles.inputWrapper}>
150+
<Search size={16} className={styles.searchIcon} />
151+
<input
152+
type="text"
153+
value={query}
154+
onChange={e => {
155+
setQuery(e.target.value)
156+
}}
157+
placeholder="Search by name, species, or accession..."
158+
autoComplete="off"
159+
autoFocus
160+
className={styles.input}
161+
/>
162+
{query && (
163+
<button
164+
type="button"
165+
onClick={() => {
166+
setQuery('')
167+
}}
168+
className={styles.clearButton}
169+
aria-label="Clear search"
170+
>
171+
x
172+
</button>
173+
)}
174+
</div>
175+
<select
176+
id="clade-filter"
177+
value={clade}
149178
onChange={e => {
150-
setQuery(e.target.value)
179+
setClade(e.target.value)
151180
}}
152-
placeholder="Search by name, species, or accession..."
153-
autoComplete="off"
154-
autoFocus
155-
className={styles.input}
156-
/>
157-
{query && (
158-
<button
159-
type="button"
160-
onClick={() => {
161-
setQuery('')
162-
}}
163-
className={styles.clearButton}
164-
aria-label="Clear search"
165-
>
166-
x
167-
</button>
168-
)}
181+
className={styles.categorySelect}
182+
>
183+
<option value="">All clades</option>
184+
{CURATED_CLADES.map(({ label, display }) => (
185+
<option key={label} value={label}>
186+
{display}
187+
</option>
188+
))}
189+
</select>
169190
</div>
170191
{query.trim() && (
171192
<div className={styles.resultCount}>
@@ -180,6 +201,7 @@ export default function SearchPage() {
180201
<th>Scientific name</th>
181202
<th>Common name</th>
182203
<th>Accession</th>
204+
<th>Assembly name</th>
183205
<th>Assembly status</th>
184206
<th>Category</th>
185207
</tr>
@@ -194,6 +216,7 @@ export default function SearchPage() {
194216
</td>
195217
<td>{highlightMatch(entry[1], query)}</td>
196218
<td>{highlightMatch(entry[0], query)}</td>
219+
<td>{highlightMatch(entry[3], query)}</td>
197220
<td>{entry[4]}</td>
198221
<td>{entry[5]}</td>
199222
</tr>
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import useSWRImmutable from 'swr/immutable'
2+
3+
// Ordered from specific to broad — must match generateTaxonomyFilter.ts
4+
export const CURATED_CLADES = [
5+
{ label: 'Mammalia', display: 'Mammalia (mammals)' },
6+
{ label: 'Aves', display: 'Aves (birds)' },
7+
{ label: 'Actinopterygii', display: 'Actinopterygii (ray-finned fish)' },
8+
{ label: 'Vertebrata', display: 'Vertebrata' },
9+
{ label: 'Viridiplantae', display: 'Viridiplantae (plants)' },
10+
{ label: 'Fungi', display: 'Fungi' },
11+
{ label: 'Bacteria', display: 'Bacteria' },
12+
{ label: 'Archaea', display: 'Archaea' },
13+
{ label: 'Viruses', display: 'Viruses' },
14+
] as const
15+
16+
/**
 * Fetch the taxonomy filter JSON and index it for fast membership checks.
 *
 * The response body is expected to be a JSON object whose keys are clade
 * labels and whose values are arrays of taxon ids; each array is turned into
 * a Set.
 *
 * @param url location of the JSON file
 * @returns map from clade label to the set of taxon ids in that clade
 * @throws Error when the response status is not OK, so a missing or broken
 *   file surfaces as a clear fetch error rather than as bogus data or an
 *   unrelated JSON parse failure
 */
async function fetcher(url: string): Promise<Map<string, Set<number>>> {
  const res = await fetch(url)
  if (!res.ok) {
    throw new Error(`Failed to fetch ${url}: ${res.status} ${res.statusText}`)
  }
  const data = (await res.json()) as Record<string, number[]>
  const cladeSets = new Map<string, Set<number>>()
  for (const [label, taxonIds] of Object.entries(data)) {
    cladeSets.set(label, new Set(taxonIds))
  }
  return cladeSets
}
21+
22+
/**
 * React hook exposing the taxonomy filter loaded from `/taxonomyFilter.json`
 * through SWR's immutable variant.
 *
 * @returns the clade label → taxon id set map, or `undefined` while no data
 *   is available
 */
export function useTaxonomyFilter() {
  return useSWRImmutable('/taxonomyFilter.json', fetcher).data
}

0 commit comments

Comments
 (0)