Skip to content

Commit 153ad2f

Browse files
committed
Refactor debian mining pipeline for git deployment
Signed-off-by: Keshav Priyadarshi <[email protected]>
1 parent 96e2c56 commit 153ad2f

File tree

4 files changed

+59
-121
lines changed

4 files changed

+59
-121
lines changed

minecode_pipelines/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@
88
#
99

1010

11-
VERSION = "0.0.1b42"
11+
VERSION = "0.0.1b53"

minecode_pipelines/pipelines/mine_debian.py

Lines changed: 54 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,36 +20,72 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
from scanpipe.pipelines import Pipeline
2423
from scanpipe.pipes import federatedcode
2524

2625
from minecode_pipelines import pipes
26+
from minecode_pipelines.pipelines import MineCodeBasePipeline
27+
from minecode_pipelines.pipelines import _mine_and_publish_packageurls
2728
from minecode_pipelines.pipes import debian
2829

2930

30-
class MineDebian(Pipeline):
31-
"""
32-
Mine all packageURLs from a Debian index and publish them to
33-
a FederatedCode repo.
34-
"""
31+
class MineDebian(MineCodeBasePipeline):
32+
"""Mine PackageURLs from Debian index and publish them to FederatedCode."""
33+
34+
pipeline_config_repo = "https://github.com/aboutcode-data/minecode-pipelines-config/"
35+
checkpoint_path = "debian/checkpoints.json"
36+
append_purls = True
3537

3638
@classmethod
3739
def steps(cls):
3840
return (
3941
cls.check_federatedcode_eligibility,
40-
cls.collect_packages_from_debian,
41-
cls.delete_cloned_repos,
42+
cls.create_federatedcode_working_dir,
43+
cls.fetch_federation_config,
44+
cls.fetch_checkpoint_and_debian_index,
45+
cls.mine_and_publish_alpine_packageurls,
46+
cls.save_check_point,
47+
cls.delete_working_dir,
4248
)
4349

44-
def check_federatedcode_eligibility(self):
45-
"""
46-
Check if the project fulfills the following criteria for
47-
pushing the project result to FederatedCode.
48-
"""
49-
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
50+
def fetch_checkpoint_and_debian_index(self):
51+
self.checkpoint_config_repo = federatedcode.clone_repository(
52+
repo_url=self.pipeline_config_repo,
53+
clone_path=self.working_path / "minecode-pipelines-config",
54+
logger=self.log,
55+
)
56+
checkpoint = pipes.get_checkpoint_from_file(
57+
cloned_repo=self.checkpoint_config_repo,
58+
path=self.checkpoint_path,
59+
)
5060

51-
def collect_packages_from_debian(self):
52-
self.repos = debian.collect_packages_from_debian(logger=self.log)
61+
self.last_checkpoint = checkpoint.get("previous_debian_index_last_modified_date")
62+
self.log(f"last_checkpoint: {self.last_checkpoint}")
63+
self.debian_collector = debian.DebianCollector(logger=self.log)
5364

54-
def delete_cloned_repos(self):
55-
pipes.delete_cloned_repos(repos=self.repos, logger=self.log)
65+
def mine_and_publish_alpine_packageurls(self):
    """
    Mine PackageURLs with the Debian collector and publish them.

    The packages come from ``self.debian_collector.get_packages``, called
    with the last saved checkpoint date, and are handed to
    ``_mine_and_publish_packageurls`` together with this pipeline's data
    cluster, checked-out repos, working path and commit message function.

    NOTE(review): the method name says "alpine" but this pipeline mines
    Debian; presumably a copy-paste leftover. Renaming requires updating
    the reference in ``steps()`` at the same time.
    """
    debian_packageurls = self.debian_collector.get_packages(
        previous_index_last_modified_date=self.last_checkpoint,
    )
    _mine_and_publish_packageurls(
        packageurls=debian_packageurls,
        # The total is passed as None, as in the original call.
        total_package_count=None,
        data_cluster=self.data_cluster,
        checked_out_repos=self.checked_out_repos,
        working_path=self.working_path,
        append_purls=self.append_purls,
        commit_msg_func=self.commit_message,
        logger=self.log,
    )
78+
79+
def save_check_point(self):
    """
    Save Debian checkpoint only after successful completion of PURL mining.

    Read the modification time of the Debian index file used by this run
    (``self.debian_collector.index_location``) and store it under the
    ``previous_debian_index_last_modified_date`` key, which is the key
    ``fetch_checkpoint_and_debian_index`` reads back on the next run.
    """
    from commoncode.date import get_file_mtime

    last_modified = get_file_mtime(self.debian_collector.index_location)
    checkpoint = {"previous_debian_index_last_modified_date": last_modified}
    self.log(f"Saving checkpoint: {checkpoint}")
    pipes.update_checkpoints_in_github(
        checkpoint=checkpoint,
        # Pass the repository cloned in fetch_checkpoint_and_debian_index;
        # ``self.pipeline_config_repo`` is only the remote URL string.
        cloned_repo=self.checkpoint_config_repo,
        path=self.checkpoint_path,
        logger=self.log,
    )

minecode_pipelines/pipes/debian.py

Lines changed: 3 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -21,32 +21,21 @@
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

2323
import gzip
24-
from datetime import datetime
2524
import logging
25+
from datetime import datetime
2626
from shutil import rmtree
27+
from traceback import format_exc as traceback_format_exc
2728

2829
import debian_inspector
29-
from aboutcode import hashid
3030
from commoncode import fileutils
31-
from commoncode.date import get_file_mtime
3231
from packagedcode.models import PackageData
3332
from packageurl import PackageURL
34-
from scanpipe.pipes import federatedcode
3533
from scanpipe.pipes.fetch import fetch_http
3634

37-
from minecode_pipelines import pipes
38-
from minecode_pipelines import VERSION
3935
from minecode_pipelines.pipes import ls
40-
from traceback import format_exc as traceback_format_exc
4136

42-
DEBIAN_CHECKPOINT_PATH = "debian/checkpoints.json"
4337
DEBIAN_LSLR_URL = "http://ftp.debian.org/debian/ls-lR.gz"
4438

45-
# We are testing and storing mined packageURLs in one single repo per ecosystem for now
46-
MINECODE_DATA_DEBIAN_REPO = "https://github.com/aboutcode-data/minecode-data-debian-test"
47-
48-
PACKAGE_BATCH_SIZE = 1000
49-
5039

5140
def is_collectible(file_name):
5241
"""Return True if a `file_name` is collectible."""
@@ -173,91 +162,4 @@ def get_packages(self, previous_index_last_modified_date=None, logger=None):
173162
size=entry.size,
174163
download_url=url_template.format(path=path),
175164
)
176-
yield versionless_purl, packaged_data
177-
178-
179-
def commit_message(commit_batch, total_commit_batch="many"):
180-
from django.conf import settings
181-
182-
author_name = settings.FEDERATEDCODE_GIT_SERVICE_NAME
183-
author_email = settings.FEDERATEDCODE_GIT_SERVICE_EMAIL
184-
tool_name = "pkg:github/aboutcode-org/scancode.io"
185-
186-
return f"""\
187-
Collect PackageURLs from Debian ({commit_batch}/{total_commit_batch})
188-
189-
Tool: {tool_name}@v{VERSION}
190-
Reference: https://{settings.ALLOWED_HOSTS[0]}
191-
192-
Signed-off-by: {author_name} <{author_email}>
193-
"""
194-
195-
196-
def collect_packages_from_debian(files_per_commit=PACKAGE_BATCH_SIZE, logger=None):
197-
# Clone data and config repo
198-
data_repo = federatedcode.clone_repository(
199-
repo_url=MINECODE_DATA_DEBIAN_REPO,
200-
logger=logger,
201-
)
202-
config_repo = federatedcode.clone_repository(
203-
repo_url=pipes.MINECODE_PIPELINES_CONFIG_REPO,
204-
logger=logger,
205-
)
206-
if logger:
207-
logger(f"{MINECODE_DATA_DEBIAN_REPO} repo cloned at: {data_repo.working_dir}")
208-
logger(f"{pipes.MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {config_repo.working_dir}")
209-
210-
# get last_modified to see if we can skip files
211-
checkpoint = pipes.get_checkpoint_from_file(
212-
cloned_repo=config_repo, path=DEBIAN_CHECKPOINT_PATH
213-
)
214-
last_modified = checkpoint.get("previous_debian_index_last_modified_date")
215-
if logger:
216-
logger(f"previous_debian_index_last_modified_date: {last_modified}")
217-
218-
# download and iterate through debian index
219-
debian_collector = DebianCollector()
220-
files_to_commit = []
221-
commit_batch = 1
222-
for current_purl, package in debian_collector.get_packages(
223-
previous_index_last_modified_date=last_modified
224-
):
225-
# write packageURL to file
226-
package_base_dir = hashid.get_package_base_dir(purl=current_purl)
227-
purl_file = pipes.write_packageurls_to_file(
228-
repo=data_repo,
229-
base_dir=package_base_dir,
230-
packageurls=[package.purl],
231-
append=True,
232-
)
233-
if purl_file not in files_to_commit:
234-
files_to_commit.append(purl_file)
235-
236-
if len(files_to_commit) == files_per_commit:
237-
federatedcode.commit_and_push_changes(
238-
commit_message=commit_message(commit_batch),
239-
repo=data_repo,
240-
files_to_commit=files_to_commit,
241-
logger=logger,
242-
)
243-
files_to_commit.clear()
244-
commit_batch += 1
245-
246-
if files_to_commit:
247-
federatedcode.commit_and_push_changes(
248-
commit_message=commit_message(commit_batch),
249-
repo=data_repo,
250-
files_to_commit=files_to_commit,
251-
logger=logger,
252-
)
253-
254-
last_modified = get_file_mtime(debian_collector.index_location)
255-
checkpoint = {"previous_debian_index_last_modified_date": last_modified}
256-
if logger:
257-
logger(f"checkpoint: {checkpoint}")
258-
pipes.update_checkpoints_in_github(
259-
checkpoint=checkpoint, cloned_repo=config_repo, path=DEBIAN_CHECKPOINT_PATH
260-
)
261-
262-
repos_to_clean = [data_repo, config_repo]
263-
return repos_to_clean
165+
yield versionless_purl, [packaged_data.purl]

pyproject-minecode_pipelines.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "flot.buildapi"
44

55
[project]
66
name = "minecode_pipelines"
7-
version = "0.0.1b42"
7+
version = "0.0.1b53"
88
description = "A library for mining packageURLs and package metadata from ecosystem repositories."
99
readme = "minecode_pipelines/README.rst"
1010
license = { text = "Apache-2.0" }

0 commit comments

Comments
 (0)