10
10
# Host entries used as expected values below, selected from ``test_dv.hosts``
# by their "name" field. ``next`` without a default raises StopIteration at
# import time if no entry carries the given name.
harvard_dv = next(host for host in test_dv.hosts if host["name"] == "Harvard Dataverse")
cimmyt_dv = next(host for host in test_dv.hosts if host["name"] == "CIMMYT Research Data")
12
12
13
+
13
14
@pytest.mark.parametrize(
    ("doi", "resolved"),
    [
        # Inputs expected to resolve to the Harvard Dataverse host
        (
            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            {
                "host": harvard_dv,
                "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            },
        ),
        (
            "10.7910/DVN/6ZXAGT/3YRRYJ",
            {
                "host": harvard_dv,
                "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            },
        ),
        (
            "10.7910/DVN/TJCLKP",
            {
                "host": harvard_dv,
                "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
            },
        ),
        (
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
            {
                "host": harvard_dv,
                "url": "https://dataverse.harvard.edu/api/access/datafile/3323458",
            },
        ),
        # Input expected to resolve to the CIMMYT host
        (
            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
            {
                "host": cimmyt_dv,
                "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
            },
        ),
        # Inputs that must not be detected at all
        ("/some/random/string", None),
        ("https://example.com/path/here", None),
        # Non dataverse DOIs
        ("https://doi.org/10.21105/joss.01277", None),
    ],
)
def test_detect(doi, resolved):
    """Check that ``Dataverse().detect`` returns the expected value for each input.

    ``resolved`` is either a dict with the matching ``host`` entry and the
    ``url`` the input resolves to, or ``None`` when the input should not be
    detected.
    """
    assert Dataverse().detect(doi) == resolved
@@ -31,37 +62,61 @@ def test_detect(doi, resolved):
31
62
@pytest.mark.parametrize(
    ("url", "persistent_id"),
    [
        # A file-level URL is expected to map to the DOI without the file suffix
        (
            "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            "doi:10.7910/DVN/6ZXAGT",
        ),
        (
            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
            "doi:10.7910/DVN/TJCLKP",
        ),
        # The URL carries no persistent id itself
        # NOTE(review): presumably resolved via a remote lookup - confirm
        (
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
            "doi:10.7910/DVN/3MJ7IR",
        ),
        # Handle (hdl:) identifier rather than a DOI
        (
            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
            "hdl:11529/10016",
        ),
    ],
)
def test_get_persistent_id(url, persistent_id):
    """Check that ``get_persistent_id_from_url`` yields the expected persistent id."""
    assert Dataverse().get_persistent_id_from_url(url) == persistent_id
42
85
43
@pytest.mark.parametrize(
    ("spec", "md5tree"),
    [
        (
            "doi:10.7910/DVN/TJCLKP",
            {
                "data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694",
                "data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917",
                "code/language.py": "9d61582bcf497c83bbd1ed0eed3c772e",
            },
        ),
        (
            # A citation targeting a single file
            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            {
                "ARCHAEOGLOBE_CONSENSUS_ASSESSMENT.tab": "17a91888ed8e91dfb63acbbab6127ac5",
            },
        ),
    ],
)
def test_fetch(spec, md5tree):
    """Fetch ``spec`` into a temporary directory and verify the files it produced.

    ``spec`` is the raw input handed to ``Dataverse.detect``; ``md5tree`` maps
    each expected path (relative to the output directory) to the hex MD5 digest
    of that file's contents.
    """
    dv = Dataverse()

    with TemporaryDirectory() as d:
        # fetch() is consumed as an iterator and its items are not inspected.
        # NOTE(review): presumably a generator that does the download while
        # being iterated - it has to be drained before the files are checked.
        for _ in dv.fetch(dv.detect(spec), d):
            pass

        # Verify md5 sum of the files we expect to find
        # We are using md5 instead of something more secure because that is what
        # dataverse itself uses
        for subpath, expected_md5 in md5tree.items():
            with open(os.path.join(d, subpath), "rb") as f:
                actual_md5 = hashlib.md5(f.read()).hexdigest()
            assert actual_md5 == expected_md5, f"md5 mismatch for {subpath}"
0 commit comments