forked from marco-bolo/csv-to-json-ld
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
182 lines (142 loc) · 7.29 KB
/
Copy pathMakefile
File metadata and controls
182 lines (142 loc) · 7.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
.PHONY: dockersetup output-directories jsonld clean bulk-ttl bulk-jsonld all init remove-orphaned shacl-report
WORKING_DIR := $(shell pwd)
UID := $(shell id -u)
GID := $(shell id -g)
CSVW_CHECK_DOCKER := roblinksdata/csvw-check:latest
CSV2RDF_DOCKER := europe-west2-docker.pkg.dev/swirrl-devops-infrastructure-1/public/csv2rdf:v0.7.1
JENA_CLI_DOCKER := gsscogs/gss-jvm-build-tools:latest
MBO_TOOLS_DOCKER := ghcr.io/marco-bolo/csv-to-json-ld-tools:latest
JQ := jq
DOCKER := docker
EXPECTED_COMMANDS := "$(DOCKER)" "sed" "awk" "xargs" "realpath" "printf" "dirname" "basename" "$(JQ)" "rm" "echo" "id" "pwd"
CSVW_CHECK := $(DOCKER) run --rm -v "$(WORKING_DIR)":/work -u $(UID):$(GID) -w /work $(CSVW_CHECK_DOCKER) -s
CSV2RDF := $(DOCKER) run --rm -v "$(WORKING_DIR)":/work -u $(UID):$(GID) -w /work $(CSV2RDF_DOCKER) csv2rdf -m minimal -u
RIOT := $(DOCKER) run --rm -v "$(WORKING_DIR)":/work -u $(UID):$(GID) -w /work $(JENA_CLI_DOCKER) riot
SPARQL := $(DOCKER) run --rm -v "$(WORKING_DIR)":/work -u $(UID):$(GID) -w /work $(JENA_CLI_DOCKER) sparql
MBO_TOOLS_DOCKER_RUN := $(DOCKER) run -i --rm -v "$(WORKING_DIR)":/work -u $(UID):$(GID) -w /work "$(MBO_TOOLS_DOCKER)"
CONVERT_LIST_VALUES_TO_NODES := $(MBO_TOOLS_DOCKER_RUN) listcolumnsasnodes
LIST_COLUMN_FOREIGN_KEY_CHECK := $(MBO_TOOLS_DOCKER_RUN) listcolumnforeignkeycheck
UNION_UNIQUE_IDENTIFIERS := $(MBO_TOOLS_DOCKER_RUN) unionuniqueidentifiers
JSONLD_CLI := $(MBO_TOOLS_DOCKER_RUN) jsonld
SHACL_CLI := $(MBO_TOOLS_DOCKER_RUN) pyshacl
CSVW_METADATA_FILES := $(wildcard remote/*.csv-metadata.json)
CSVW_METADATA_VALIDATION_FILES := $(CSVW_METADATA_FILES:remote/%.csv-metadata.json=out/validation/%.success.log)
BULK_TTL_FILES := $(CSVW_METADATA_FILES:remote/%.csv-metadata.json=out/bulk/%.ttl)
EXPECTED_BULK_OUT_FILES := $(BULK_TTL_FILES)
include remote/foreign-keys.mk
check:
@EXIT_CODE=0; \
for program in $(EXPECTED_COMMANDS); \
do \
command -v "$$program" > /dev/null || EXIT_CODE=1; \
command -v "$$program" > /dev/null || echo "\n\nError: '$$program' must be installed\n\n"; \
done; \
exit "$$EXIT_CODE";
dockersetup:
@echo "=============================== Pulling & Building required docker images. ==============================="
@docker pull $(CSVW_CHECK_DOCKER)
@docker pull $(CSV2RDF_DOCKER)
@docker pull $(JENA_CLI_DOCKER)
@docker pull $(MBO_TOOLS_DOCKER)
@echo "" ;
check-csv-format:
@echo "Checking CSV files for incomplete lines..."
@python3 scripts/check_csv_integrity.py
out/validation/person-or-organization.csv: data/Person.csv data/Organization.csv
@mkdir -p out/validation
@$(UNION_UNIQUE_IDENTIFIERS) --out out/validation/person-or-organization.csv --column-name "MBO Permanent Identifier*" data/Person.csv data/Organization.csv
validate: check-csv-format $(CSVW_METADATA_VALIDATION_FILES) $(MANUAL_FOREIGN_KEY_VALIDATION_LOGS)
@EXIT_CODE=0; \
for file in $$(find out/validation/*.err.log); \
do \
EXIT_CODE=1; \
printf '[0;31m'; # Red \
echo "Errors detected:"; \
echo ""; \
cat "$$file"; \
printf '[0m'; # Reset colour \
done; \
exit "$$EXIT_CODE";
out/bulk/%.json: out/bulk/%.ttl
@echo "=============================== Converting $< to JSON-LD $@ ===============================" ;
@$(RIOT) --syntax ttl --out json-ld "$<" > "$@";
@echo "";
out/bulk/All.trig: $(BULK_TTL_FILES)
@$(SPARQL) --quiet $(^:out/bulk/%.ttl=--namedGraph out/bulk/%.ttl) "CONSTRUCT { GRAPH ?g { ?s ?p ?o. } } WHERE { GRAPH ?g { ?s ?p ?o. } }" > out/bulk/All.trig
shacl-report: out/bulk/All.trig
@echo "The SHACL Report:"
@echo ""
@echo "First looking for any violations:"
@echo ""
@# This will cause the build process to fail if there are any violations.
@$(SHACL_CLI) --format table --shapes remote/shacl.ttl --allow-info --allow-warning out/bulk/All.trig
@echo ""
@echo "Now looking for any warnings or info:"
@echo ""
@# We don't want the build process to fail because of warnings/infos.
@$(SHACL_CLI) --format table --shapes remote/shacl.ttl out/bulk/All.trig || true
bulk-ttl: $(BULK_TTL_FILES) remove-orphaned
jsonld: $(BULK_TTL_FILES)
@$(MAKE) -f remote/split.mk jsonld
init:
@$(MAKE) check
@$(MAKE) dockersetup
@$(MAKE) -f remote/split.mk init
all:
@$(MAKE) init validate shacl-report jsonld
define DELETE_UNEXPECTED_BULK_FILES
ifeq ($$(filter $$(file),$(EXPECTED_BULK_OUT_FILES)),)
$$(shell rm -f "$$(file)")
endif
endef
# Remove orphaned outputs which should no longer be present.
remove-orphaned: $(wildcard out/bulk/*.ttl) $(wildcard out/bulk/*.json)
$(foreach file,$^, $(eval $(DELETE_UNEXPECTED_BULK_FILES)))
clean:
@$(MAKE) -f remote/split.mk clean
@rm -rf out
.DEFAULT_GOAL := all
define CSVW_TO_TARGETS =
# Defines the target to convert a CSV-W into TTL
# Importantly it makes sure that its local CSV files are listed as dependencies for make.
$(eval CSVW_FILE_NAME := $(shell basename "$(1)"))
$(eval TTL_FILE_$(1) := $(CSVW_FILE_NAME:%.csv-metadata.json=out/bulk/%.ttl))
$(eval CSVW_LOG_FILE_$(1) := $(CSVW_FILE_NAME:%.csv-metadata.json=out/validation/%.success.log))
$(eval CSVW_LOG_FILE_ERR_$(1) := $(CSVW_FILE_NAME:%.csv-metadata.json=out/validation/%.err.log))
$(eval CSVW_DIR_NAME_$(1) := $(shell dirname $$(realpath $(1))))
$(eval TABLE_SCHEMA_DEPENDENCIES_COMMAND_$(1) := cat "$(1)" \
| $(JQ) '.tables[] | .tableSchema' \
| sed 's/"\(.*\)"/\1/g' \
| awk '{print "$(CSVW_DIR_NAME_$(1))/" $$$$0}' \
| xargs -l realpath --canonicalize-missing --relative-to "$(WORKING_DIR)" \
| xargs;)
$(eval TABLE_SCHEMA_DEPENDENCIES_$(1) = $(shell $(TABLE_SCHEMA_DEPENDENCIES_COMMAND_$(1)) ))
$(eval CSVCHECK_CSV_DEPENDENCIES_COMMAND_$(1) := cat "$(1)" \
| $(JQ) '.tables[] | .url' \
| sed 's/"\(.*\)"/\1/g' \
| awk '{print "$(CSVW_DIR_NAME_$(1))/" $$$$0}' \
| xargs -l realpath --canonicalize-missing --relative-to "$(WORKING_DIR)" \
| xargs;)
$(eval CSVCHECK_CSV_DEPENDENCIES_$(1) = $(shell $(CSVCHECK_CSV_DEPENDENCIES_COMMAND_$(1)) ))
$(CSVW_LOG_FILE_$(1)): $(1) $(CSVCHECK_CSV_DEPENDENCIES_$(1)) $(TABLE_SCHEMA_DEPENDENCIES_$(1))
@echo "=============================== Validating $$< ==============================="
@rm -f "$(CSVW_LOG_FILE_$(1))" "$(CSVW_LOG_FILE_ERR_$(1))"
@RES=$$$$($(CSVW_CHECK) "$$<") && echo "$$$$RES" && touch "$(CSVW_LOG_FILE_$(1))" || (echo "$$$$RES" && echo "When validating $$<\n$$$$RES" > "$(CSVW_LOG_FILE_ERR_$(1))")
@echo ""
$(eval CSV2RDF_CSV_DEPENDENCIES_COMMAND_$(1) := cat "$(1)" \
| $(JQ) '.tables[] | select(.suppressOutput != true) | .url' \
| sed 's/"\(.*\)"/\1/g' \
| awk '{print "$(CSVW_DIR_NAME_$(1))/" $$$$0}' \
| xargs -l realpath --canonicalize-missing --relative-to "$(WORKING_DIR)" \
| xargs;)
$(eval CSV2RDF_CSV_DEPENDENCIES_$(1) = $(shell $(CSV2RDF_CSV_DEPENDENCIES_COMMAND_$(1)) ))
$(TTL_FILE_$(1)): $(1) $(CSV2RDF_CSV_DEPENDENCIES_$(1)) $(TABLE_SCHEMA_DEPENDENCIES_$(1)) out/validation/person-or-organization.csv
@mkdir -p out/bulk
@echo "=============================== Converting $$< to ttl $$@ ==============================="
@# Unfortunately csv2rdf returns a non-zero status code if it produces no triples (even if this is to be expected).
@# So for the time being we'll ignore any errors which come from it.
@$$(CSV2RDF) "$$<" -o "$$@" || true
@$$(CONVERT_LIST_VALUES_TO_NODES) "$$@"
@echo ""
endef
$(foreach file,$(CSVW_METADATA_FILES),$(eval $(call CSVW_TO_TARGETS,$(file))))