Skip to content

Commit 78bb653

Browse files
Migrate cpan pipeline to MineCodeBasePipeline
Reference: #775

Signed-off-by: Ayan Sinha Mahapatra <asmahapatra@aboutcode.org>
1 parent da9baa8 commit 78bb653

File tree

2 files changed

+18
-69
lines changed

2 files changed

+18
-69
lines changed

minecode_pipelines/pipelines/mine_cpan.py

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
from scanpipe.pipelines import Pipeline
24-
from scanpipe.pipes import federatedcode
2523

2624
from minecode_pipelines import pipes
2725
from minecode_pipelines.pipes import cpan
26+
from minecode_pipelines.pipelines import MineCodeBasePipeline
27+
from scanpipe.pipes import federatedcode
2828

2929

30-
class MineCpan(Pipeline):
30+
class MineCpan(MineCodeBasePipeline):
3131
"""
3232
Mine all packageURLs from a cpan index and publish them to
3333
a FederatedCode repo.
@@ -37,28 +37,24 @@ class MineCpan(Pipeline):
3737
def steps(cls):
3838
return (
3939
cls.check_federatedcode_eligibility,
40+
cls.create_federatedcode_working_dir,
4041
cls.mine_cpan_packages,
42+
cls.fetch_federation_config,
4143
cls.mine_and_publish_cpan_packageurls,
42-
cls.delete_cloned_repos,
44+
cls.mine_and_publish_packageurls,
45+
cls.delete_working_dir,
4346
)
4447

45-
def check_federatedcode_eligibility(self):
46-
"""
47-
Check if the project fulfills the following criteria for
48-
pushing the project result to FederatedCode.
49-
"""
50-
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
51-
5248
def mine_cpan_packages(self):
5349
"""Mine cpan package names from cpan indexes or checkpoint."""
5450
self.cpan_packages_path_by_name = cpan.mine_cpan_packages(logger=self.log)
5551

56-
def mine_and_publish_cpan_packageurls(self):
52+
def packages_count(self):
53+
return len(self.cpan_packages_path_by_name)
54+
55+
def mine_packageurls(self):
5756
"""Get cpan packageURLs for all mined cpan package names."""
58-
self.repos = cpan.mine_and_publish_cpan_packageurls(
57+
cpan.mine_and_publish_cpan_packageurls(
5958
package_path_by_name=self.cpan_packages_path_by_name,
6059
logger=self.log,
6160
)
62-
63-
def delete_cloned_repos(self):
64-
pipes.delete_cloned_repos(repos=self.repos, logger=self.log)

minecode_pipelines/pipes/cpan.py

Lines changed: 6 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,6 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
from minecode_pipelines import VERSION
24-
from minecode_pipelines.pipes import write_packageurls_to_file
25-
2623
from minecode_pipelines.miners.cpan import get_cpan_packages
2724
from minecode_pipelines.miners.cpan import get_cpan_packageurls
2825
from minecode_pipelines.miners.cpan import CPAN_REPO
@@ -32,11 +29,6 @@
3229

3330
from aboutcode.hashid import get_package_base_dir
3431
from packageurl import PackageURL
35-
from scanpipe.pipes.federatedcode import clone_repository
36-
37-
from scanpipe.pipes.federatedcode import commit_changes
38-
from scanpipe.pipes.federatedcode import push_changes
39-
4032

4133
# If True, show full details on fetching packageURL for
4234
# a package name present in the index
@@ -45,10 +37,6 @@
4537
PACKAGE_BATCH_SIZE = 500
4638

4739

48-
# We are testing and storing mined packageURLs in one single repo per ecosystem for now
49-
MINECODE_DATA_CPAN_REPO = "https://github.com/aboutcode-data/minecode-data-cpan-test"
50-
51-
5240
def mine_cpan_packages(logger=None):
5341
if logger:
5442
logger("Getting packages from cpan index")
@@ -66,21 +54,15 @@ def mine_and_publish_cpan_packageurls(package_path_by_name, logger=None):
6654
if not package_path_by_name:
6755
return
6856

69-
# clone repo
70-
cloned_data_repo = clone_repository(repo_url=MINECODE_DATA_CPAN_REPO)
71-
if logger:
72-
logger(f"{MINECODE_DATA_CPAN_REPO} repo cloned at: {cloned_data_repo.working_dir}")
73-
57+
packageurls_by_base_purl = {}
7458
for package_batch in grouper(n=PACKAGE_BATCH_SIZE, iterable=package_path_by_name.keys()):
7559
packages_mined = []
76-
purls = []
77-
purl_files = []
7860

7961
if logger and LOG_PACKAGEURL_DETAILS:
8062
logger("Starting package mining for a batch of packages")
8163

8264
for package_name in package_batch:
83-
if not package_name:
65+
if not package_name or package_name in packages_mined:
8466
continue
8567

8668
# fetch packageURLs for package
@@ -106,41 +88,12 @@ def mine_and_publish_cpan_packageurls(package_path_by_name, logger=None):
10688

10789
# get repo and path for package
10890
base_purl = PackageURL(type=CPAN_TYPE, name=package_name).to_string()
109-
package_base_dir = get_package_base_dir(purl=base_purl)
110-
11191
if logger and LOG_PACKAGEURL_DETAILS:
112-
logger(f"writing packageURLs for package: {base_purl} at: {package_base_dir}")
92+
logger(f"fetched packageURLs for package: {base_purl}")
11393
purls_string = " ".join(packageurls)
11494
logger(f"packageURLs: {purls_string}")
11595

116-
# write packageURLs to file
117-
purl_file = write_packageurls_to_file(
118-
repo=cloned_data_repo,
119-
base_dir=package_base_dir,
120-
packageurls=packageurls,
121-
)
122-
purl_files.append(purl_file)
123-
purls.append(base_purl)
124-
12596
packages_mined.append(package_name)
126-
127-
if logger:
128-
purls_string = " ".join(purls)
129-
logger("Committing and pushing changes for a batch of packages: ")
130-
logger(f"{purls_string}")
131-
132-
# commit changes
133-
commit_changes(
134-
repo=cloned_data_repo,
135-
files_to_commit=purl_files,
136-
purls=purls,
137-
mine_type="packageURL",
138-
tool_name="pkg:cpan/minecode-pipelines",
139-
tool_version=VERSION,
140-
)
141-
142-
# Push changes to remote repository
143-
push_changes(repo=cloned_data_repo)
144-
145-
repos_to_clean = [cloned_data_repo]
146-
return repos_to_clean
97+
packageurls_by_base_purl[base_purl] = packageurls
98+
99+
return packageurls_by_base_purl

0 commit comments

Comments
 (0)