Commit 52eeb8f

Stop mocking dataverse contentprovider test
As I was debugging jupyterhub#1388, I realized that PR actually broke the dataverse provider, but the existing test was mocking so much that we didn't catch it!

IMO, since the point of contentproviders is to integrate with external content providers, their tests should be integration tests so we can catch issues with them more easily. Integration tests would have caught https://jupyter.zulipchat.com/#narrow/channel/103349-ask-anything/topic/Binder.20Dataverse.20error more cleanly than how it actually played out, for example.

This PR removes all mocks from the dataverse test, and we immediately benefit: it shows us that the dataverse provider *only* actually handles DOIs, not direct URLs. So even though we technically had tests earlier showing our dataverse provider supporting direct dataverse URLs, that was simply not true. Now we actually catch the failure.

I will also try to see if we can use a demo or test instance for the fetch test, so we don't screw up download stats even more for the existing test DOI we use.
1 parent f0b1c0c commit 52eeb8f
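To make the difference concrete, here is a minimal sketch (not part of this commit) contrasting the old mocked style with the integration style adopted below. The patched attribute name urlopen is an assumption about the provider's internals and may not match what the deleted test actually mocked; the DOI and expected record are the ones used in the new test.

from unittest import mock

from repo2docker.contentproviders import Dataverse

# Look up the Harvard Dataverse entry the same way the new test does.
harvard_dv = next(_ for _ in Dataverse().hosts if _["name"] == "Harvard Dataverse")


def test_detect_mocked_sketch():
    # Old style: the HTTP layer is replaced, so detect() is only exercised
    # against whatever the mock returns. A regression in the real URL-handling
    # path can slip through unnoticed. (Assumes the Dataverse provider inherits
    # an urlopen method; that attribute name is a guess, not from this commit.)
    with mock.patch.object(Dataverse, "urlopen") as fake_urlopen:
        fake_urlopen.return_value.url = (
            "https://dataverse.harvard.edu/dataset.xhtml"
            "?persistentId=doi:10.7910/DVN/6ZXAGT"
        )
        Dataverse().detect("doi:10.7910/DVN/6ZXAGT/3YRRYJ")


def test_detect_integration_sketch():
    # Integration style: detect() talks to the real Harvard Dataverse
    # installation, so a broken code path actually fails the test.
    assert Dataverse().detect("doi:10.7910/DVN/6ZXAGT/3YRRYJ") == {
        "host": harvard_dv,
        "record": "doi:10.7910/DVN/6ZXAGT",
    }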

File tree

3 files changed: +55, -160 lines

.github/workflows/test.yml

+1

@@ -60,6 +60,7 @@ jobs:
           - r
           - unit
           - venv
+          - contentproviders
         include:
           # The actions/setup-python action with Python version 3.6 isn't
           # possible to use with the ubuntu-22.04 runner, so we use ubuntu-20.04
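To reproduce the new matrix entry locally, something like the sketch below should be roughly equivalent. The tests/contentproviders path is an assumption inferred from how the other matrix entries name subdirectories under tests/; it is not shown in this diff.

# Rough local equivalent of the new "contentproviders" CI matrix entry.
# The tests/contentproviders path is an assumption; adjust it if the
# un-mocked test lives elsewhere.
import sys

import pytest

if __name__ == "__main__":
    sys.exit(pytest.main(["-v", "tests/contentproviders"]))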
+54

@@ -0,0 +1,54 @@
import hashlib
import os
from tempfile import TemporaryDirectory

import pytest

from repo2docker.contentproviders import Dataverse

test_dv = Dataverse()
harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse")
cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data")


@pytest.mark.parametrize(
    ("doi", "resolved"),
    [
        ("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}),
        ("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}),
        ("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}),
        ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "record": "doi:10.7910/DVN/6ZXAGT"}),
        ("/some/random/string", None),
        ("https://example.com/path/here", None),
        # Non-Dataverse DOIs
        ("https://doi.org/10.21105/joss.01277", None),
    ],
)
def test_detect(doi, resolved):
    assert Dataverse().detect(doi) == resolved


def test_dataverse_fetch():
    spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/TJCLKP"}

    dv = Dataverse()

    with TemporaryDirectory() as d:
        output = []
        for line in dv.fetch(spec, d):
            output.append(line)

        # Verify the two top-level directories
        assert set(os.listdir(d)) == {"data", "code"}

        # Verify the sha256sum of three files
        expected_sha = {
            "data/primary/primary-data.zip": "880f99a1e1d54a2553be61301f92e06b29236785b8d4d1b7ad0b4595d9d7512b",
            "data/2023-01-03.tsv": "cc9759e8e6bc076dd7c1a8eb53a7ea3d38e8697fa9f544d15768db308516cc5f",
            "code/language.py": "1ffb3b3cdc9de01279779f3fc88824672c8ec3ab1c41ecdd5c1b59a9b0202215",
        }

        for subpath, expected in expected_sha.items():
            with open(os.path.join(d, subpath), "rb") as f:
                h = hashlib.sha256()
                h.update(f.read())
                assert h.hexdigest() == expected

tests/unit/contentproviders/test_dataverse.py

-160
This file was deleted.
