Skip to content

Commit de2cb7f

Browse files
Merge pull request #794 from aboutcode-org/minecode-pipeline-pypi
Migrate pypi pipeline to MineCodeBasePipeline
2 parents 9445f0d + 8636aa0 commit de2cb7f

File tree

7 files changed

+153
-130
lines changed

7 files changed

+153
-130
lines changed

minecode_pipelines/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@
88
#
99

1010

11-
VERSION = "0.0.1b59"
11+
VERSION = "0.0.1b60"

minecode_pipelines/pipelines/__init__.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ def _mine_and_publish_packageurls(
151151
commit_msg_func: Callable,
152152
logger: Callable,
153153
batch_size: int = 4000,
154+
checkpoint_on_commit: bool = False,
154155
checkpoint_func: Callable = None,
155156
checkpoint_freq: int = 30,
156157
):
@@ -192,19 +193,26 @@ def _mine_and_publish_packageurls(
192193
)
193194
checkout["file_to_commit"].add(purl_file)
194195
checkout["file_processed_count"] += 1
196+
if logger:
197+
logger(f"{checkout['repo'].working_dir}: {checkout['file_processed_count']} / {batch_size}")
195198

196199
if len(checkout["file_to_commit"]) > batch_size:
200+
if logger:
201+
logger(f"Trying to commit PackageURLs.")
197202
pipes.commit_and_push_checkout(
198203
local_checkout=checkout,
199204
commit_message=commit_msg_func(checkout["commit_count"] + 1),
200205
logger=logger,
201206
)
202-
203-
time_now = time.time()
204-
checkpoint_due = time_now - last_checkpoint_call >= checkpoint_interval
205-
if checkpoint_func and checkpoint_due:
206-
checkpoint_func()
207-
last_checkpoint_call = time_now
207+
if checkpoint_on_commit and checkpoint_func:
208+
checkpoint_func()
209+
210+
if not checkpoint_on_commit:
211+
time_now = time.time()
212+
checkpoint_due = time_now - last_checkpoint_call >= checkpoint_interval
213+
if checkpoint_func and checkpoint_due:
214+
checkpoint_func()
215+
last_checkpoint_call = time_now
208216

209217
for checkout in checked_out_repos.values():
210218
final_commit_count = checkout["commit_count"] + 1

minecode_pipelines/pipelines/mine_maven.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def steps(cls):
4242
cls.create_federatedcode_working_dir,
4343
cls.fetch_federation_config,
4444
cls.fetch_checkpoint_and_maven_index,
45-
cls.mine_and_publish_alpine_packageurls,
45+
cls.mine_and_publish_maven_packageurls,
4646
cls.delete_working_dir,
4747
)
4848

@@ -61,7 +61,7 @@ def fetch_checkpoint_and_maven_index(self):
6161
self.log(f"last_incremental: {last_incremental}")
6262
self.maven_nexus_collector = maven.MavenNexusCollector(last_incremental=last_incremental)
6363

64-
def mine_and_publish_alpine_packageurls(self):
64+
def mine_and_publish_maven_packageurls(self):
6565
_mine_and_publish_packageurls(
6666
packageurls=self.maven_nexus_collector.get_packages(),
6767
total_package_count=None,

minecode_pipelines/pipelines/mine_pypi.py

Lines changed: 58 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -20,46 +20,83 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
from scanpipe.pipelines import Pipeline
24-
from scanpipe.pipes import federatedcode
25-
26-
from minecode_pipelines import pipes
2723
from minecode_pipelines.pipes import pypi
24+
from minecode_pipelines.pipelines import MineCodeBasePipeline
25+
from minecode_pipelines.pipelines import _mine_and_publish_packageurls
2826

29-
30-
class MinePypi(Pipeline):
27+
class MinePypi(MineCodeBasePipeline):
3128
"""
3229
Mine all packageURLs from a pypi index and publish them to
3330
a FederatedCode repo.
3431
"""
3532

33+
package_batch_size = 100
34+
3635
@classmethod
3736
def steps(cls):
3837
return (
3938
cls.check_federatedcode_eligibility,
39+
cls.create_federatedcode_working_dir,
4040
cls.mine_pypi_packages,
41-
cls.mine_and_publish_pypi_packageurls,
42-
cls.delete_cloned_repos,
41+
cls.get_pypi_packages_to_sync,
42+
cls.fetch_federation_config,
43+
cls.mine_and_publish_packageurls,
44+
cls.update_state_and_checkpoints,
45+
cls.delete_working_dir,
4346
)
4447

45-
def check_federatedcode_eligibility(self):
46-
"""
47-
Check if the project fulfills the following criteria for
48-
pushing the project result to FederatedCode.
49-
"""
50-
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
51-
5248
def mine_pypi_packages(self):
5349
"""Mine pypi package names from pypi indexes or checkpoint."""
54-
self.pypi_packages, self.state = pypi.mine_pypi_packages(logger=self.log)
50+
self.pypi_packages, self.state, self.config_repo = pypi.mine_pypi_packages(logger=self.log)
5551

56-
def mine_and_publish_pypi_packageurls(self):
57-
"""Get pypi packageURLs for all mined pypi package names."""
58-
self.repos = pypi.mine_and_publish_pypi_packageurls(
52+
def get_pypi_packages_to_sync(self):
53+
"""Get pypi packages which needs to be synced using checkpoint."""
54+
self.packages, self.last_serial = pypi.get_pypi_packages_to_sync(
5955
packages_file=self.pypi_packages,
6056
state=self.state,
6157
logger=self.log,
6258
)
6359

64-
def delete_cloned_repos(self):
65-
pipes.delete_cloned_repos(repos=self.repos, logger=self.log)
60+
def packages_count(self):
61+
return len(self.packages)
62+
63+
def mine_packageurls(self):
64+
"""Yield pypi packageURLs for all mined pypi package names."""
65+
self.packages_mined = []
66+
yield from pypi.mine_and_publish_pypi_packageurls(
67+
packages_to_sync=self.packages,
68+
packages_mined=self.packages_mined,
69+
logger=self.log,
70+
)
71+
72+
def save_check_point(self):
73+
pypi.save_mined_packages_in_checkpoint(
74+
packages_mined=self.packages_mined,
75+
config_repo=self.config_repo,
76+
logger=self.log,
77+
)
78+
self.packages_mined = []
79+
80+
def mine_and_publish_packageurls(self):
81+
"""Mine and publish PackageURLs."""
82+
83+
_mine_and_publish_packageurls(
84+
packageurls=self.mine_packageurls(),
85+
total_package_count=self.packages_count(),
86+
data_cluster=self.data_cluster,
87+
checked_out_repos=self.checked_out_repos,
88+
working_path=self.working_path,
89+
append_purls=self.append_purls,
90+
commit_msg_func=self.commit_message,
91+
logger=self.log,
92+
checkpoint_func=self.save_check_point,
93+
checkpoint_on_commit=True,
94+
batch_size=self.package_batch_size,
95+
)
96+
97+
def update_state_and_checkpoints(self):
98+
pypi.update_state_and_checkpoints(
99+
config_repo=self.config_repo,
100+
last_serial=self.last_serial,
101+
logger=self.log,
102+
)

minecode_pipelines/pipes/__init__.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def get_checkpoint_from_file(cloned_repo, path):
4343
return checkpoint_data or {}
4444

4545

46-
def update_checkpoints_in_github(checkpoint, cloned_repo, path, logger):
46+
def update_checkpoints_in_github(checkpoint, cloned_repo, path, logger=None):
4747
from scanpipe.pipes.federatedcode import commit_and_push_changes
4848

4949
checkpoint_path = os.path.join(cloned_repo.working_dir, path)
@@ -65,16 +65,17 @@ def get_mined_packages_from_checkpoint(config_repo, checkpoint_path):
6565
return checkpoint.get("packages_mined", [])
6666

6767

68-
def update_mined_packages_in_checkpoint(packages, config_repo, cloned_repo, checkpoint_path):
68+
def update_mined_packages_in_checkpoint(packages, config_repo, cloned_repo, checkpoint_path, logger=None):
6969
mined_packages = get_mined_packages_from_checkpoint(
7070
config_repo=config_repo,
7171
checkpoint_path=checkpoint_path,
7272
)
73-
packages = {"packages_mined": packages + mined_packages}
73+
packages_to_update = {"packages_mined": packages + mined_packages}
7474
update_checkpoints_in_github(
75-
checkpoint=packages,
75+
checkpoint=packages_to_update,
7676
cloned_repo=cloned_repo,
7777
path=checkpoint_path,
78+
logger=logger,
7879
)
7980

8081

0 commit comments

Comments
 (0)