Skip to content

Commit 785907d

Browse files
Merge pull request #726 from aboutcode-org/minecode-pipeline-npm
Add support to mine npm PackageURLs
2 parents de2cb7f + 91307b4 commit 785907d

File tree

11 files changed

+679
-22
lines changed

11 files changed

+679
-22
lines changed

minecode_pipelines/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@
88
#
99

1010

11-
VERSION = "0.0.1b60"
11+
VERSION = "0.1.0"

minecode_pipelines/miners/npm.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
11+
import json
12+
import requests
13+
14+
from packageurl import PackageURL
15+
16+
17+
"""
18+
Visitors for Npmjs and npmjs-like javascript package repositories.
19+
20+
We have this hierarchy in npm replicate and registry index:
21+
npm projects replicate.npmjs.com (paginated JSON) -> versions at registry.npmjs.org (JSON) -> download urls
22+
23+
See https://github.com/orgs/community/discussions/152515 for information on
24+
the latest replicate.npmjs.com API.
25+
26+
https://replicate.npmjs.com/_all_docs
27+
This NPMJS replicate API serves as an index to get all npm packages and their revision IDs
28+
in paginated queries.
29+
30+
https://replicate.npmjs.com/_changes
31+
This NPMJS replicate API serves as a CHANGELOG of npm packages with update sequences which
32+
can be fetched in paginated queries.
33+
34+
https://registry.npmjs.org/{namespace/name}
35+
For each npm package, a JSON containing details including the list of all releases
36+
and archives, their URLs, and some metadata for each release.
37+
38+
https://registry.npmjs.org/{namespace/name}/{version}
39+
For each release, a JSON contains details for the released version and all the
40+
downloads available for this release.
41+
"""
42+
43+
44+
# Base URL of the npm replicate API (serves the _all_docs and _changes endpoints).
NPM_REPLICATE_REPO = "https://replicate.npmjs.com/"
45+
# Base URL of the npm registry API (serves per-package and per-version JSON).
NPM_REGISTRY_REPO = "https://registry.npmjs.org/"
46+
# PackageURL type used for the purls built in this module.
NPM_TYPE = "npm"
47+
# Value of the "limit" query parameter sent with each paginated replicate request.
NPM_REPLICATE_BATCH_SIZE = 10000
48+
49+
50+
def get_package_names_last_key(package_data):
    """
    Return a two-tuple of (names, last_key) extracted from ``package_data``,
    a mapping holding one page of the replicate ``_all_docs`` listing.

    ``names`` is the list of "id" values of the entries under "rows".
    ``last_key`` is the "key" of the final row, or None when the page has
    no rows (or no "rows" entry at all).
    """
    rows = package_data.get("rows") or []
    names = [row.get("id") for row in rows]
    # An empty page has no final row: report None instead of raising IndexError.
    last_key = rows[-1].get("key") if rows else None
    return names, last_key
54+
55+
56+
def get_package_names_last_seq(package_data):
    """
    Return a two-tuple of (names, last_seq) extracted from ``package_data``,
    a mapping holding one page of the replicate ``_changes`` feed.

    ``names`` is the list of "id" values of the entries under "results"
    and ``last_seq`` is the value stored under "last_seq".
    """
    changes = package_data.get("results")
    names = []
    for change in changes:
        names.append(change.get("id"))
    return names, package_data.get("last_seq")
60+
61+
62+
def get_current_last_seq(replicate_url=NPM_REPLICATE_REPO):
    """
    Return the "last_seq" value reported by the ``_changes`` endpoint of
    ``replicate_url`` when queried in descending order.

    Return None when the HTTP request does not succeed.
    """
    latest_changes_url = replicate_url + "_changes?descending=True"
    response = requests.get(latest_changes_url)
    if response.ok:
        # Only the sequence is of interest here; the package names are discarded.
        _names, current_seq = get_package_names_last_seq(response.json())
        return current_seq
    return None
71+
72+
73+
def get_updated_npm_packages(last_seq, replicate_url=NPM_REPLICATE_REPO, logger=None):
    """
    Return a two-tuple of (packages, last_seq) for the changes recorded
    after ``last_seq`` in the ``_changes`` feed of ``replicate_url``.

    ``packages`` is a mapping of {"packages": [names]} and the returned
    ``last_seq`` is the sequence reported by the last page fetched
    successfully (the ``last_seq`` argument itself if no page was fetched).
    The same two-tuple shape is returned when a request fails, holding
    whatever was collected up to that point.

    ``logger`` is an optional callable that receives progress messages.
    """
    all_package_names = []
    i = 0

    while True:
        if logger:
            logger(f"Processing iteration: {i}: changes after seq: {last_seq}")

        npm_replicate_changes = (
            replicate_url + "_changes?" + f"limit={NPM_REPLICATE_BATCH_SIZE}" + f"&since={last_seq}"
        )
        response = requests.get(npm_replicate_changes)
        if not response.ok:
            # Stop paging but keep the return shape: ``last_seq`` still matches
            # the names collected so far, so callers can unpack and resume.
            if logger:
                logger(f"Failed to fetch changes: {npm_replicate_changes}")
            break

        package_data = response.json()
        package_names, last_seq = get_package_names_last_seq(package_data)
        all_package_names.extend(package_names)

        # A page shorter than the batch size is the last set of changes.
        if len(package_names) < NPM_REPLICATE_BATCH_SIZE:
            break

        i += 1

    return {"packages": all_package_names}, last_seq
99+
100+
101+
def get_npm_packages(replicate_url=NPM_REPLICATE_REPO, logger=None):
    """
    Return a mapping of {"packages": [names]} listing the package names
    fetched page by page from the ``_all_docs`` index of ``replicate_url``.

    The "packages" list is empty when the first request fails or returns
    no rows. Raise an Exception carrying the URL and the response text
    when a follow-up page request fails.

    ``logger`` is an optional callable that receives progress messages.
    """
    all_package_names = []

    npm_replicate_all = replicate_url + "_all_docs?" + f"limit={NPM_REPLICATE_BATCH_SIZE}"
    response = requests.get(npm_replicate_all)
    if not response.ok:
        # Same mapping shape as the success path, which is what
        # load_npm_packages() expects to read back.
        return {"packages": all_package_names}

    package_data = response.json()
    rows = package_data.get("rows")
    if not rows:
        return {"packages": all_package_names}

    package_names, last_key = get_package_names_last_key(package_data)
    all_package_names.extend(package_names)

    # A first page shorter than the batch size already holds every row.
    if len(rows) < NPM_REPLICATE_BATCH_SIZE:
        return {"packages": all_package_names}

    total_rows = package_data.get("total_rows") or 0
    # Upper bound on the number of follow-up pages to request.
    iterations = total_rows // NPM_REPLICATE_BATCH_SIZE + 1

    for i in range(iterations):
        npm_replicate_from_id = npm_replicate_all + f'&start_key="{last_key}"'
        if logger:
            logger(f"Processing iteration: {i}: {npm_replicate_from_id}")

        response = requests.get(npm_replicate_from_id)
        if not response.ok:
            raise Exception(npm_replicate_from_id, response.text)

        package_data = response.json()
        rows = package_data.get("rows")
        if not rows:
            break

        previous_last_key = last_key
        package_names, last_key = get_package_names_last_key(package_data)
        # NOTE(review): the page presumably starts at ``start_key`` itself
        # (inclusive) - confirm against the replicate API. The check below is
        # safe either way: the first name is dropped only when it repeats the
        # last row of the previous page.
        if package_names and package_names[0] == previous_last_key:
            package_names = package_names[1:]
        all_package_names.extend(package_names)

        # A short page is the last one: no need for further requests.
        if len(rows) < NPM_REPLICATE_BATCH_SIZE:
            break

    return {"packages": all_package_names}
130+
131+
132+
def get_npm_packageurls(name, npm_repo=NPM_REGISTRY_REPO):
    """
    Return a list of PackageURL strings, one per version listed for the
    npm package ``name`` in the registry at ``npm_repo``.

    Return an empty list when the HTTP request does not succeed or when
    the registry document lists no versions.
    """
    project_index_api_url = npm_repo + name
    response = requests.get(project_index_api_url)
    if not response.ok:
        return []

    project_data = response.json()
    # The "versions" entry may be absent or null (presumably for unpublished
    # packages - TODO confirm); treat that as "no releases" instead of raising
    # a TypeError when iterating None.
    versions = project_data.get("versions") or {}

    return [
        PackageURL(type=NPM_TYPE, name=name, version=version).to_string()
        for version in versions
    ]
150+
151+
152+
def load_npm_packages(packages_file):
    """
    Return the value stored under the "packages" key of the JSON document
    at ``packages_file``, or an empty list when that key is absent.
    """
    with open(packages_file) as packages_json:
        content = json.loads(packages_json.read())
    return content.get("packages", [])

minecode_pipelines/miners/pypi.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,6 @@
1313

1414
from packageurl import PackageURL
1515

16-
from minecode_pipelines.utils import get_temp_file
17-
from minecode_pipelines.pipes import write_data_to_json_file
18-
1916
"""
2017
Visitors for Pypi and Pypi-like Python package repositories.
2118
@@ -52,12 +49,6 @@ def get_pypi_packages(pypi_repo, logger=None):
5249
return response.json()
5350

5451

55-
def write_packages_json(packages, name):
56-
temp_file = get_temp_file(name)
57-
write_data_to_json_file(path=temp_file, data=packages)
58-
return temp_file
59-
60-
6152
def get_pypi_packageurls(name):
6253
packageurls = []
6354

minecode_pipelines/pipelines/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,8 +193,6 @@ def _mine_and_publish_packageurls(
193193
)
194194
checkout["file_to_commit"].add(purl_file)
195195
checkout["file_processed_count"] += 1
196-
if logger:
197-
logger(f"{checkout['repo'].working_dir}: {checkout['file_processed_count']} / {batch_size}")
198196

199197
if len(checkout["file_to_commit"]) > batch_size:
200198
if logger:
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
from minecode_pipelines.pipes import npm
24+
from minecode_pipelines.pipelines import MineCodeBasePipeline
25+
from minecode_pipelines.pipelines import _mine_and_publish_packageurls
26+
27+
28+
class MineNPM(MineCodeBasePipeline):
    """
    Mine all packageURLs from a npm index and publish them to
    a FederatedCode repo.
    """

    # Passed as ``batch_size`` to _mine_and_publish_packageurls().
    package_batch_size = 5

    @classmethod
    def steps(cls):
        """Return the pipeline steps, in the order they are listed here."""
        pipeline_steps = (
            cls.check_federatedcode_eligibility,
            cls.create_federatedcode_working_dir,
            cls.mine_npm_packages,
            cls.get_npm_packages_to_sync,
            cls.fetch_federation_config,
            cls.mine_and_publish_packageurls,
            cls.update_state_and_checkpoints,
            cls.delete_working_dir,
        )
        return pipeline_steps

    def mine_npm_packages(self):
        """Mine npm package names from npm indexes or checkpoint."""
        mined = npm.mine_npm_packages(logger=self.log)
        # The helper returns four values, kept on the pipeline for later steps.
        self.npm_packages, self.state, self.last_seq, self.config_repo = mined

    def get_npm_packages_to_sync(self):
        """Get npm packages which need to be synced using checkpoint."""
        to_sync = npm.get_npm_packages_to_sync(
            packages_file=self.npm_packages,
            state=self.state,
            logger=self.log,
        )
        self.packages, self.synced_packages = to_sync

    def packages_count(self):
        """Return the number of packages selected for syncing."""
        return len(self.packages)

    def mine_packageurls(self):
        """Yield npm packageURLs for all mined npm package names."""
        mined_so_far = []
        self.packages_mined = mined_so_far
        yield from npm.mine_and_publish_npm_packageurls(
            packages_to_sync=self.packages,
            packages_mined=mined_so_far,
            logger=self.log,
        )

    def save_check_point(self):
        """Save the packages mined so far in a checkpoint, then reset the list."""
        npm.save_mined_packages_in_checkpoint(
            packages_mined=self.packages_mined,
            synced_packages=self.synced_packages,
            config_repo=self.config_repo,
            logger=self.log,
        )
        # NOTE(review): this rebinds the attribute to a new list, while the
        # generator started in mine_packageurls() was handed the previous list
        # object - confirm that later mined packages still reach the checkpoint.
        self.packages_mined = []

    def mine_and_publish_packageurls(self):
        """Mine and publish PackageURLs."""
        options = {
            "packageurls": self.mine_packageurls(),
            "total_package_count": self.packages_count(),
            "data_cluster": self.data_cluster,
            "checked_out_repos": self.checked_out_repos,
            "working_path": self.working_path,
            "append_purls": self.append_purls,
            "commit_msg_func": self.commit_message,
            "logger": self.log,
            "checkpoint_func": self.save_check_point,
            "checkpoint_on_commit": True,
            "batch_size": self.package_batch_size,
        }
        _mine_and_publish_packageurls(**options)

    def update_state_and_checkpoints(self):
        """Update the state and checkpoints using the last mined sequence."""
        npm.update_state_and_checkpoints(
            state=self.state,
            last_seq=self.last_seq,
            config_repo=self.config_repo,
            logger=self.log,
        )

minecode_pipelines/pipelines/mine_pypi.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from minecode_pipelines.pipelines import MineCodeBasePipeline
2525
from minecode_pipelines.pipelines import _mine_and_publish_packageurls
2626

27+
2728
class MinePypi(MineCodeBasePipeline):
2829
"""
2930
Mine all packageURLs from a pypi index and publish them to

0 commit comments

Comments
 (0)