|
21 | 21 | # Visit https://github.com/aboutcode-org/scancode.io for support and download. |
22 | 22 |
|
23 | 23 | import gzip |
24 | | -from datetime import datetime |
25 | 24 | import logging |
| 25 | +from datetime import datetime |
26 | 26 | from shutil import rmtree |
| 27 | +from traceback import format_exc as traceback_format_exc |
27 | 28 |
|
28 | 29 | import debian_inspector |
29 | | -from aboutcode import hashid |
30 | 30 | from commoncode import fileutils |
31 | | -from commoncode.date import get_file_mtime |
32 | 31 | from packagedcode.models import PackageData |
33 | 32 | from packageurl import PackageURL |
34 | | -from scanpipe.pipes import federatedcode |
35 | 33 | from scanpipe.pipes.fetch import fetch_http |
36 | 34 |
|
37 | | -from minecode_pipelines import pipes |
38 | | -from minecode_pipelines import VERSION |
39 | 35 | from minecode_pipelines.pipes import ls |
40 | | -from traceback import format_exc as traceback_format_exc |
41 | 36 |
|
42 | | -DEBIAN_CHECKPOINT_PATH = "debian/checkpoints.json" |
43 | 37 | DEBIAN_LSLR_URL = "http://ftp.debian.org/debian/ls-lR.gz" |
44 | 38 |
|
45 | | -# We are testing and storing mined packageURLs in one single repo per ecosystem for now |
46 | | -MINECODE_DATA_DEBIAN_REPO = "https://github.com/aboutcode-data/minecode-data-debian-test" |
47 | | - |
48 | | -PACKAGE_BATCH_SIZE = 1000 |
49 | | - |
50 | 39 |
|
51 | 40 | def is_collectible(file_name): |
52 | 41 | """Return True if a `file_name` is collectible.""" |
@@ -173,91 +162,4 @@ def get_packages(self, previous_index_last_modified_date=None, logger=None): |
173 | 162 | size=entry.size, |
174 | 163 | download_url=url_template.format(path=path), |
175 | 164 | ) |
176 | | - yield versionless_purl, packaged_data |
177 | | - |
178 | | - |
179 | | -def commit_message(commit_batch, total_commit_batch="many"): |
180 | | - from django.conf import settings |
181 | | - |
182 | | - author_name = settings.FEDERATEDCODE_GIT_SERVICE_NAME |
183 | | - author_email = settings.FEDERATEDCODE_GIT_SERVICE_EMAIL |
184 | | - tool_name = "pkg:github/aboutcode-org/scancode.io" |
185 | | - |
186 | | - return f"""\ |
187 | | - Collect PackageURLs from Debian ({commit_batch}/{total_commit_batch}) |
188 | | -
|
189 | | - Tool: {tool_name}@v{VERSION} |
190 | | - Reference: https://{settings.ALLOWED_HOSTS[0]} |
191 | | -
|
192 | | - Signed-off-by: {author_name} <{author_email}> |
193 | | - """ |
194 | | - |
195 | | - |
196 | | -def collect_packages_from_debian(files_per_commit=PACKAGE_BATCH_SIZE, logger=None): |
197 | | - # Clone data and config repo |
198 | | - data_repo = federatedcode.clone_repository( |
199 | | - repo_url=MINECODE_DATA_DEBIAN_REPO, |
200 | | - logger=logger, |
201 | | - ) |
202 | | - config_repo = federatedcode.clone_repository( |
203 | | - repo_url=pipes.MINECODE_PIPELINES_CONFIG_REPO, |
204 | | - logger=logger, |
205 | | - ) |
206 | | - if logger: |
207 | | - logger(f"{MINECODE_DATA_DEBIAN_REPO} repo cloned at: {data_repo.working_dir}") |
208 | | - logger(f"{pipes.MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {config_repo.working_dir}") |
209 | | - |
210 | | - # get last_modified to see if we can skip files |
211 | | - checkpoint = pipes.get_checkpoint_from_file( |
212 | | - cloned_repo=config_repo, path=DEBIAN_CHECKPOINT_PATH |
213 | | - ) |
214 | | - last_modified = checkpoint.get("previous_debian_index_last_modified_date") |
215 | | - if logger: |
216 | | - logger(f"previous_debian_index_last_modified_date: {last_modified}") |
217 | | - |
218 | | - # download and iterate through debian index |
219 | | - debian_collector = DebianCollector() |
220 | | - files_to_commit = [] |
221 | | - commit_batch = 1 |
222 | | - for current_purl, package in debian_collector.get_packages( |
223 | | - previous_index_last_modified_date=last_modified |
224 | | - ): |
225 | | - # write packageURL to file |
226 | | - package_base_dir = hashid.get_package_base_dir(purl=current_purl) |
227 | | - purl_file = pipes.write_packageurls_to_file( |
228 | | - repo=data_repo, |
229 | | - base_dir=package_base_dir, |
230 | | - packageurls=[package.purl], |
231 | | - append=True, |
232 | | - ) |
233 | | - if purl_file not in files_to_commit: |
234 | | - files_to_commit.append(purl_file) |
235 | | - |
236 | | - if len(files_to_commit) == files_per_commit: |
237 | | - federatedcode.commit_and_push_changes( |
238 | | - commit_message=commit_message(commit_batch), |
239 | | - repo=data_repo, |
240 | | - files_to_commit=files_to_commit, |
241 | | - logger=logger, |
242 | | - ) |
243 | | - files_to_commit.clear() |
244 | | - commit_batch += 1 |
245 | | - |
246 | | - if files_to_commit: |
247 | | - federatedcode.commit_and_push_changes( |
248 | | - commit_message=commit_message(commit_batch), |
249 | | - repo=data_repo, |
250 | | - files_to_commit=files_to_commit, |
251 | | - logger=logger, |
252 | | - ) |
253 | | - |
254 | | - last_modified = get_file_mtime(debian_collector.index_location) |
255 | | - checkpoint = {"previous_debian_index_last_modified_date": last_modified} |
256 | | - if logger: |
257 | | - logger(f"checkpoint: {checkpoint}") |
258 | | - pipes.update_checkpoints_in_github( |
259 | | - checkpoint=checkpoint, cloned_repo=config_repo, path=DEBIAN_CHECKPOINT_PATH |
260 | | - ) |
261 | | - |
262 | | - repos_to_clean = [data_repo, config_repo] |
263 | | - return repos_to_clean |
| 165 | + yield versionless_purl, [packaged_data.purl] |
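
Not part of the commit itself, but for context: after this change, `get_packages` yields a two-tuple of the versionless PackageURL paired with a one-item list of versioned PackageURL strings, instead of the `PackageData` object. A minimal consumption sketch, assuming only the `DebianCollector` class touched by this diff; the variable names are illustrative:

```python
# Sketch only: shows the new yield shape of DebianCollector.get_packages()
# introduced by this commit. Names other than DebianCollector/get_packages
# are illustrative, not taken from the diff.
collector = DebianCollector()

for versionless_purl, purls in collector.get_packages():
    # versionless_purl: the package's PackageURL without a version
    # purls: a list of versioned PackageURL strings (previously a PackageData object)
    for purl in purls:
        print(versionless_purl, purl)
```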