Skip to content

Commit 6578915

Browse files
authored
feat: lower min intervals, reduce CI work
1 parent 666fcaf commit 6578915

File tree

3 files changed

+63
-24
lines changed

3 files changed

+63
-24
lines changed

cdx_toolkit/myrequests.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ def dns_fatal(hostname):
2727
},
2828
'index.commoncrawl.org': {
2929
'next_fetch': 0,
30-
'minimum_interval': 3.0,
30+
'minimum_interval': 1.0,
3131
},
3232
'data.commoncrawl.org': {
3333
'next_fetch': 0,
34-
'minimum_interval': 3.0,
34+
'minimum_interval': 0.55,
3535
},
3636
'web.archive.org': {
3737
'next_fetch': 0,

tests/test_cli.py

+57-20
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,33 @@
1-
from cdx_toolkit.cli import main
2-
31
import json
42
import sys
3+
import os
4+
import platform
5+
import logging
56

67
import pytest
7-
import requests
8+
9+
from cdx_toolkit.cli import main
10+
11+
LOGGER = logging.getLogger(__name__)
12+
13+
14+
def slow_ci():
    '''
    Report whether the calling test should cut its work short.

    For Github Actions, the windows and macos runners are very slow.
    Detect those runners, so that we can cut testing short. Jobs running
    under Github Actions on any Python other than 3.12 are limited as well.

    A non-empty FAKE_GITHUB_ACTION environment variable forces the limited
    mode regardless of platform or Python version.

    Returns True if testing should be limited, False for a full run.
    '''
    if os.environ.get('FAKE_GITHUB_ACTION'):
        LOGGER.error('limiting pytest because FAKE_GITHUB_ACTION')
        return True

    # Every remaining "limit" decision only applies inside Github Actions.
    if not os.environ.get('GITHUB_ACTION'):
        LOGGER.error('full pytest')
        return False

    if platform.system() in {'Darwin', 'Windows'}:
        LOGGER.error('limiting pytest because GITHUB_ACTION')
        return True

    v = sys.version_info
    if v.major == 3 and v.minor != 12:
        LOGGER.error('limiting pytest because GITHUB_ACTION and py != 3.12')
        # This branch used to return False, contradicting the message above
        # and leaving these jobs running the full suite.
        return True

    LOGGER.error('full pytest')
    return False
831

932

1033
def test_basics(capsys):
@@ -80,6 +103,8 @@ def test_multi_cc1(capsys, caplog):
80103

81104
for t in tests:
82105
multi_helper(t, capsys, caplog)
106+
if slow_ci():
107+
break
83108

84109

85110
def test_multi_cc2(capsys, caplog):
@@ -101,9 +126,10 @@ def test_multi_cc2(capsys, caplog):
101126

102127
for t in tests:
103128
multi_helper(t, capsys, caplog)
129+
if slow_ci():
130+
break
104131

105132

106-
@pytest.mark.skip(reason='needs some ratelimit love XXX')
107133
def test_multi_ia(capsys, caplog):
108134
tests = [
109135
[{'service': '--ia', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
@@ -120,12 +146,11 @@ def test_multi_ia(capsys, caplog):
120146

121147
for t in tests:
122148
multi_helper(t, capsys, caplog)
149+
break # XXX minimize IA for ratelimit purposes
123150

124151

125-
def test_multi_misc_notia(capsys, caplog):
152+
def test_multi_misc_not_ia(capsys, caplog):
126153
tests = [
127-
[{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
128-
{'count': 10, 'linefgrep': 'commoncrawl.org'}],
129154
[{'service': '-v -v --source https://web.arc4567hive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
130155
{'exception': ValueError}],
131156
[{'service': '-v -v --source https://example.com/404', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
@@ -142,11 +167,14 @@ def test_multi_misc_notia(capsys, caplog):
142167

143168
for t in tests:
144169
multi_helper(t, capsys, caplog)
170+
if slow_ci():
171+
break
145172

146173

147-
@pytest.mark.skip(reason='needs some ratelimit love XXX')
148174
def test_multi_misc_ia(capsys, caplog):
149175
tests = [
176+
[{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
177+
{'count': 10, 'linefgrep': 'commoncrawl.org'}],
150178
[{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': 'commoncrawl.org/*'},
151179
{'count': 1, 'is_int': True}],
152180
[{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': '--details commoncrawl.org/*'},
@@ -157,35 +185,44 @@ def test_multi_misc_ia(capsys, caplog):
157185

158186
for t in tests:
159187
multi_helper(t, capsys, caplog)
188+
break # XXX minimize IA for ratelimit reasons
160189

161190

162191
def test_warc(tmpdir, caplog):
163192
# crash testing only, so far
164193

165-
base = ' --limit 10 warc commoncrawl.org/*'
194+
base = ' --limit 1 warc commoncrawl.org/*'
166195

167-
prefixes = ('-v -v --cc', '--ia',
168-
'--cc --cc-mirror https://index.commoncrawl.org/',
169-
'--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web')
170-
suffixes = ('--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar',
171-
'--prefix EMPTY --size 1 --url-fgrep bar',
172-
'--prefix EMPTY --size 1 --url-fgrepv common')
196+
prefixes = ( # note limit 2 below
197+
'-v -v --cc', # only case run by slow_ci
198+
'--ia',
199+
'--cc --cc-mirror https://index.commoncrawl.org/',
200+
'--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web',
201+
)
202+
suffixes = (
203+
'--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar',
204+
'--prefix EMPTY --size 1 --url-fgrep bar',
205+
'--prefix EMPTY --size 1 --url-fgrepv common'
206+
)
173207

174208
with tmpdir.as_cwd():
175209
for p in prefixes:
176-
if '--ia' in p or 'archive.org' in p:
177-
# XXX skip
178-
continue
179210
cmdline = p + base
211+
if 'cc' in cmdline:
212+
cmdline = cmdline.replace(' 1', ' 2')
180213
print(cmdline, file=sys.stderr)
181214
args = cmdline.split()
182215
main(args=args)
216+
if slow_ci():
217+
break
183218

184219
for s in suffixes:
185220
cmdline = prefixes[0] + base + ' ' + s
186221
print(cmdline, file=sys.stderr)
187222
args = cmdline.split()
188223
main(args=args)
224+
if slow_ci():
225+
break
189226

190227
assert True
191228

@@ -195,11 +232,11 @@ def one_ia_corner(tmpdir, cmdline):
195232
main(args=cmdline.split())
196233

197234

198-
@pytest.mark.skip(reason='needs some ratelimit love XXX')
235+
@pytest.mark.skip(reason='needs some ratelimit love')
199236
def test_warc_ia_corners(tmpdir, caplog):
200237
'''
201238
To test these more properly, need to add a --exact-warcname and then postprocess.
202-
For now, these tests show up in the coverage report
239+
For now, these are only crash tests.
203240
'''
204241

205242
# revisit vivification

tests/unit/test_capture_object.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
def test_capture_object():
88
cdx_cc = cdx_toolkit.CDXFetcher(source='cc')
9-
cdx_ia = cdx_toolkit.CDXFetcher(source='ia')
9+
#XXX cdx_ia = cdx_toolkit.CDXFetcher(source='ia')
1010
cdx_only = cdx_toolkit.CDXFetcher(source='https://web.archive.org/cdx/search/cdx', loglevel='DEBUG')
1111

1212
url = 'example.com'
@@ -16,10 +16,12 @@ def test_capture_object():
1616
for obj in cdx_only.iter(url, **kwargs):
1717
got_one = True
1818
with pytest.raises(ValueError):
19+
# we don't know how to fetch the content in this situation
1920
_ = obj.content
2021
assert got_one, 'found a capture cdx_only'
2122

22-
for cdx in (cdx_cc, cdx_ia):
23+
#XXX for cdx in (cdx_cc, cdx_ia):
24+
for cdx in (cdx_cc,):
2325
got_one = False
2426
for obj in cdx.iter(url, **kwargs):
2527
got_one = True

0 commit comments

Comments
 (0)