-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetchSubjectsFromDnb_prod_sru.flux
More file actions
76 lines (64 loc) · 2.16 KB
/
fetchSubjectsFromDnb_prod_sru.flux
File metadata and controls
76 lines (64 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
// File locations used by the flows below. Each value is declared with `default`,
// so it only applies when the variable has not been supplied from outside.

// Intermediate file that collects the SRU responses inside one <harvest> element.
default sruHarvest   = FLUX_DIR + "prod/sru_records.xml";
// Final result: gzip-compressed MARCXML produced by the subject transformation.
default outfile      = FLUX_DIR + "prod/dnbSubjects.xml.gz";
// JSON lines dump downloaded from lobid.
default lobidHarvest = FLUX_DIR + "prod/dnbSubjects.jsonl";
// Not referenced by any flow in this script; presumably read by the fix files,
// which receive all variables via `*` - TODO confirm.
default version      = FLUX_DIR + "prod/";
// Step 1: download the relevant lobid resources as JSON lines into `lobidHarvest`.
"Start harvesting lobid." | print;

// Decoded query: _exists_:dnbId AND NOT subject.type:"ComplexSubject"
//   AND inCollection.id:"http://lobid.org/organisations/DE-655#!" AND NOT _exists_:zdbId
"https://lobid.org/resources/search?q=_exists_%3AdnbId+AND+NOT+subject.type%3A%22ComplexSubject%22+AND+inCollection.id%3A%22http%3A%2F%2Flobid.org%2Forganisations%2FDE-655%23%21%22+AND+NOT+_exists_%3AzdbId&format=jsonl"
    | open-http(header="User-Agent: hbz/dnb-subject-harvester")
    | as-lines
    | write(lobidHarvest);
// Step 2: derive a tab-separated table holding the almaMmsId and dnbId fields
// of every record in the lobid dump.
// The message names the map that is actually written (almaMmsId2dnbId).
"Harvesting lobid finished. Start creating almaMmsId2dnbId map." | print;

// Read the dump through the `lobidHarvest` variable so this flow always
// consumes the file written by the harvest step, even when the variable
// is overridden from outside.
lobidHarvest
    | open-file
    | as-lines
    | decode-json
    // Drop every field except the two ids that make up the table.
    | fix("retain('almaMmsId','dnbId')")
    | encode-csv(noQuotes="true", separator="\t")
    // Anchor the output at FLUX_DIR like every other file of this workflow;
    // a bare relative path would resolve against the current working directory.
    | write(FLUX_DIR + "prod/almaMmsId2dnbId.tsv");
// Step 3: request the MARC21-xml record of every dnbId from the DNB SRU service
// and collect all responses in `sruHarvest`.
"Map finished. Start harvesting sru." | print;

lobidHarvest
    | open-file
    | as-lines
    | decode-json
    // Keep only the dnbId and hand its value on as a plain object.
    | fix("retain('dnbId')")
    | literal-to-object
    // ${o} is replaced by the incoming object, i.e. the dnbId.
    | template("https://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=dnb.idn=${o}&recordSchema=MARC21-xml")
    // Presumably keeps a single failing request from aborting the whole run - TODO confirm.
    | catch-object-exception
    | open-http(header="User-Agent: hbz/dnbSubjectHarvester", accept="application/xml")
    | as-records
    // Merge the many SRU responses into one XML document: remove the XML
    // declaration of each response here, then let `write` emit a single
    // declaration plus an enclosing <harvest> element.
    | match(pattern="<\\?xml version=.*?>", replacement="")
    | object-batch-log(batchSize="100")
    | write(sruHarvest, header="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<harvest>", footer="</harvest>");
// Step 4: run the harvested MARCXML records through subject.fix and write the
// result as gzip-compressed MARCXML to `outfile`.
"SRU Harvest finished. Start harvesting dnb subject data." | print;

sruHarvest
    | open-file
    | decode-xml
    | handle-marcxml
    | batch-log
    // `*` passes all Flux variables on to the fix.
    | fix(FLUX_DIR + "subject.fix", *)
    // Second progress log after the fix, currently disabled:
    //| batch-log
    | encode-marcxml
    | object-batch-log
    // Compressed output, because the resulting file is big.
    | write(outfile, compression="gzip");
// Step 5: report the dnbIds whose SRU response contained no records.
"Create a list of broken dnbIds." | print;

sruHarvest
    | open-file
    | as-lines
    // Let only those lines through that contain an empty <records/> element.
    | filter-strings("<records/>", passmatches="true")
    // Reduce each remaining line to the id that follows `dnb.idn=` in its <query> element.
    | match(pattern=".*dnb.idn=(.+)</query>.+$", replacement="$1")
    | decode-csv(separator="\t")
    // `*` passes all Flux variables on to the fix.
    | fix(FLUX_DIR + "failed.fix", *)
    | batch-log(batchSize="10")
    | encode-csv(separator="\t", includeheader="true", noQuotes="true")
    | write(FLUX_DIR + "prod/failed.tsv");