Skip to content

Commit 1260a5a

Browse files
committed
Support fetching single files in dataverse
1 parent 172f8b0 commit 1260a5a

File tree

2 files changed

+118
-37
lines changed

2 files changed

+118
-37
lines changed

repo2docker/contentproviders/dataverse.py

+34-8
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def get_persistent_id_from_url(self, url: str) -> str:
106106
)
107107
return data["items"][0]["dataset_persistent_id"]
108108
elif parsed_url.path.startswith("/file.xhtml"):
109-
file_persistent_id = qs['persistentId'][0]
109+
file_persistent_id = qs["persistentId"][0]
110110
dataset_persistent_id = file_persistent_id.rsplit("/", 1)[0]
111111
if file_persistent_id == dataset_persistent_id:
112112
# We can't figure this one out, throw an error
@@ -115,6 +115,38 @@ def get_persistent_id_from_url(self, url: str) -> str:
115115

116116
raise ValueError(f"Could not determine persistent id for dataverse URL {url}")
117117

118+
def get_datafiles(self, host: str, persistent_id: str) -> list[dict]:
119+
"""
120+
Return a list of dataFiles for given persistent_id
121+
"""
122+
dataset_url = f"{host}/api/datasets/:persistentId?persistentId={persistent_id}"
123+
124+
resp = self._request(dataset_url, headers={"accept": "application/json"})
125+
# Assume it's a dataset
126+
is_dataset = True
127+
if resp.status_code == 404:
128+
# It's possible this is a *file* persistent_id, not a dataset one
129+
file_url = f"{host}/api/files/:persistentId?persistentId={persistent_id}"
130+
resp = self._request(file_url, headers={"accept": "application/json"})
131+
132+
if resp.status_code == 404:
133+
# This persistent id is just not here
134+
raise ValueError(f"{persistent_id} on {host} is not found")
135+
136+
# It's not a dataset, it's a file!
137+
is_dataset = False
138+
139+
# We already handled 404, raise error for everything else
140+
resp.raise_for_status()
141+
142+
data = resp.json()["data"]
143+
144+
if is_dataset:
145+
return data["latestVersion"]["files"]
146+
else:
147+
# Only one file object
148+
return [data]
149+
118150
def fetch(self, spec, output_dir, yield_output=False):
119151
"""Fetch and unpack a Dataverse dataset."""
120152
url = spec["url"]
@@ -123,13 +155,8 @@ def fetch(self, spec, output_dir, yield_output=False):
123155
persistent_id = self.get_persistent_id_from_url(url)
124156

125157
yield f"Fetching Dataverse record {persistent_id}.\n"
126-
url = f'{host["url"]}/api/datasets/:persistentId?persistentId={persistent_id}'
127-
128-
resp = self.urlopen(url, headers={"accept": "application/json"})
129-
print(resp.json())
130-
record = resp.json()["data"]
131158

132-
for fobj in deep_get(record, "latestVersion.files"):
159+
for fobj in self.get_datafiles(host["url"], persistent_id):
133160
file_url = (
134161
# without format=original you get the preservation format (plain text, tab separated)
135162
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
@@ -155,7 +182,6 @@ def fetch(self, spec, output_dir, yield_output=False):
155182
copytree(os.path.join(output_dir, d), output_dir)
156183
shutil.rmtree(os.path.join(output_dir, d))
157184

158-
159185
# Save persistent id
160186
self.persitent_id = persistent_id
161187

tests/contentproviders/test_dataverse.py

+84-29
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,50 @@
1010
harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse")
1111
cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data")
1212

13+
1314
@pytest.mark.parametrize(
1415
("doi", "resolved"),
1516
[
16-
("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}),
17-
("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}),
18-
("10.7910/DVN/TJCLKP", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP"}),
19-
("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/api/access/datafile/3323458"}),
20-
("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016"}),
17+
(
18+
"doi:10.7910/DVN/6ZXAGT/3YRRYJ",
19+
{
20+
"host": harvard_dv,
21+
"url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
22+
},
23+
),
24+
(
25+
"10.7910/DVN/6ZXAGT/3YRRYJ",
26+
{
27+
"host": harvard_dv,
28+
"url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
29+
},
30+
),
31+
(
32+
"10.7910/DVN/TJCLKP",
33+
{
34+
"host": harvard_dv,
35+
"url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
36+
},
37+
),
38+
(
39+
"https://dataverse.harvard.edu/api/access/datafile/3323458",
40+
{
41+
"host": harvard_dv,
42+
"url": "https://dataverse.harvard.edu/api/access/datafile/3323458",
43+
},
44+
),
45+
(
46+
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
47+
{
48+
"host": cimmyt_dv,
49+
"url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
50+
},
51+
),
2152
("/some/random/string", None),
2253
("https://example.com/path/here", None),
2354
# Non dataverse DOIs
24-
("https://doi.org/10.21105/joss.01277", None)
25-
]
55+
("https://doi.org/10.21105/joss.01277", None),
56+
],
2657
)
2758
def test_detect(doi, resolved):
2859
assert Dataverse().detect(doi) == resolved
@@ -31,37 +62,61 @@ def test_detect(doi, resolved):
3162
@pytest.mark.parametrize(
3263
("url", "persistent_id"),
3364
[
34-
("https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", "doi:10.7910/DVN/6ZXAGT"),
35-
("https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", "doi:10.7910/DVN/TJCLKP"),
36-
("https://dataverse.harvard.edu/api/access/datafile/3323458", "doi:10.7910/DVN/3MJ7IR"),
37-
("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", "hdl:11529/10016"),
38-
]
65+
(
66+
"https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
67+
"doi:10.7910/DVN/6ZXAGT",
68+
),
69+
(
70+
"https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
71+
"doi:10.7910/DVN/TJCLKP",
72+
),
73+
(
74+
"https://dataverse.harvard.edu/api/access/datafile/3323458",
75+
"doi:10.7910/DVN/3MJ7IR",
76+
),
77+
(
78+
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
79+
"hdl:11529/10016",
80+
),
81+
],
3982
)
4083
def test_get_persistent_id(url, persistent_id):
4184
assert Dataverse().get_persistent_id_from_url(url) == persistent_id
4285

43-
def test_dataverse_fetch():
4486

87+
@pytest.mark.parametrize(
88+
("spec", "md5tree"),
89+
[
90+
(
91+
"doi:10.7910/DVN/TJCLKP",
92+
{
93+
"data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694",
94+
"data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917",
95+
"code/language.py": "9d61582bcf497c83bbd1ed0eed3c772e",
96+
},
97+
),
98+
(
99+
# A citation targeting a single file
100+
"https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
101+
{
102+
"ARCHAEOGLOBE_CONSENSUS_ASSESSMENT.tab": "17a91888ed8e91dfb63acbbab6127ac5"
103+
}
104+
)
105+
],
106+
)
107+
def test_fetch(spec, md5tree):
45108
dv = Dataverse()
46-
spec = dv.detect("doi:10.7910/DVN/TJCLKP")
47109

48110
with TemporaryDirectory() as d:
49111
output = []
50-
for l in dv.fetch(spec, d):
112+
for l in dv.fetch(dv.detect(spec), d):
51113
output.append(l)
52114

53-
# Verify two directories
54-
assert set(os.listdir(d)) == {"data", "code"}
55-
56-
# Verify sha256sum of three files
57-
expected_sha = {
58-
'data/primary/primary-data.zip': '880f99a1e1d54a2553be61301f92e06b29236785b8d4d1b7ad0b4595d9d7512b',
59-
'data/2023-01-03.tsv': 'cc9759e8e6bc076dd7c1a8eb53a7ea3d38e8697fa9f544d15768db308516cc5f',
60-
'code/language.py': '1ffb3b3cdc9de01279779f3fc88824672c8ec3ab1c41ecdd5c1b59a9b0202215'
61-
}
62-
63-
for subpath, expected_sha in expected_sha.items():
64-
with open(os.path.join(d, subpath), 'rb') as f:
65-
h = hashlib.sha256()
115+
# Verify md5 sum of the files we expect to find
116+
# We are using md5 instead of something more secure because that is what
117+
# dataverse itself uses
118+
for subpath, expected_sha in md5tree.items():
119+
with open(os.path.join(d, subpath), "rb") as f:
120+
h = hashlib.md5()
66121
h.update(f.read())
67-
assert h.hexdigest() == expected_sha
122+
assert h.hexdigest() == expected_sha

0 commit comments

Comments
 (0)