|
20 | 20 | # ScanCode.io is a free software code scanning tool from nexB Inc. and others. |
21 | 21 | # Visit https://github.com/aboutcode-org/scancode.io for support and download. |
22 | 22 |
|
23 | | -from scanpipe.pipelines import Pipeline |
24 | | -from scanpipe.pipes import federatedcode |
25 | | - |
26 | | -from minecode_pipelines import pipes |
27 | 23 | from minecode_pipelines.pipes import pypi |
| 24 | +from minecode_pipelines.pipelines import MineCodeBasePipeline |
| 25 | +from minecode_pipelines.pipelines import _mine_and_publish_packageurls |
28 | 26 |
|
29 | | - |
30 | | -class MinePypi(Pipeline): |
| 27 | +class MinePypi(MineCodeBasePipeline): |
31 | 28 | """ |
32 | 29 | Mine all packageURLs from a pypi index and publish them to |
33 | 30 | a FederatedCode repo. |
34 | 31 | """ |
35 | 32 |
|
| 33 | + package_batch_size = 100 |
| 34 | + |
36 | 35 | @classmethod |
37 | 36 | def steps(cls): |
38 | 37 | return ( |
39 | 38 | cls.check_federatedcode_eligibility, |
| 39 | + cls.create_federatedcode_working_dir, |
40 | 40 | cls.mine_pypi_packages, |
41 | | - cls.mine_and_publish_pypi_packageurls, |
42 | | - cls.delete_cloned_repos, |
| 41 | + cls.get_pypi_packages_to_sync, |
| 42 | + cls.fetch_federation_config, |
| 43 | + cls.mine_and_publish_packageurls, |
| 44 | + cls.update_state_and_checkpoints, |
| 45 | + cls.delete_working_dir, |
43 | 46 | ) |
44 | 47 |
|
45 | | - def check_federatedcode_eligibility(self): |
46 | | - """ |
47 | | - Check if the project fulfills the following criteria for |
48 | | - pushing the project result to FederatedCode. |
49 | | - """ |
50 | | - federatedcode.check_federatedcode_configured_and_available(logger=self.log) |
51 | | - |
52 | 48 | def mine_pypi_packages(self): |
53 | 49 | """Mine pypi package names from pypi indexes or checkpoint.""" |
54 | | - self.pypi_packages, self.state = pypi.mine_pypi_packages(logger=self.log) |
| 50 | + self.pypi_packages, self.state, self.config_repo = pypi.mine_pypi_packages(logger=self.log) |
55 | 51 |
|
56 | | - def mine_and_publish_pypi_packageurls(self): |
57 | | - """Get pypi packageURLs for all mined pypi package names.""" |
58 | | - self.repos = pypi.mine_and_publish_pypi_packageurls( |
| 52 | + def get_pypi_packages_to_sync(self): |
| 53 | + """Get the pypi packages that need to be synced, based on the checkpoint.""" |
| 54 | + self.packages, self.last_serial = pypi.get_pypi_packages_to_sync( |
59 | 55 | packages_file=self.pypi_packages, |
60 | 56 | state=self.state, |
61 | 57 | logger=self.log, |
62 | 58 | ) |
63 | 59 |
|
64 | | - def delete_cloned_repos(self): |
65 | | - pipes.delete_cloned_repos(repos=self.repos, logger=self.log) |
| 60 | + def packages_count(self): |
| 61 | + return len(self.packages) |
| 62 | + |
| 63 | + def mine_packageurls(self): |
| 64 | + """Yield pypi packageURLs for all mined pypi package names.""" |
| 65 | + self.packages_mined = [] |
| 66 | + yield from pypi.mine_and_publish_pypi_packageurls( |
| 67 | + packages_to_sync=self.packages, |
| 68 | + packages_mined=self.packages_mined, |
| 69 | + logger=self.log, |
| 70 | + ) |
| 71 | + |
| 72 | + def save_check_point(self): |
| 73 | + pypi.save_mined_packages_in_checkpoint( |
| 74 | + packages_mined=self.packages_mined, |
| 75 | + config_repo=self.config_repo, |
| 76 | + logger=self.log, |
| 77 | + ) |
| 78 | + self.packages_mined = [] |
| 79 | + |
| 80 | + def mine_and_publish_packageurls(self): |
| 81 | + """Mine and publish PackageURLs.""" |
| 82 | + |
| 83 | + _mine_and_publish_packageurls( |
| 84 | + packageurls=self.mine_packageurls(), |
| 85 | + total_package_count=self.packages_count(), |
| 86 | + data_cluster=self.data_cluster, |
| 87 | + checked_out_repos=self.checked_out_repos, |
| 88 | + working_path=self.working_path, |
| 89 | + append_purls=self.append_purls, |
| 90 | + commit_msg_func=self.commit_message, |
| 91 | + logger=self.log, |
| 92 | + checkpoint_func=self.save_check_point, |
| 93 | + checkpoint_on_commit=True, |
| 94 | + batch_size=self.package_batch_size, |
| 95 | + ) |
| 96 | + |
| 97 | + def update_state_and_checkpoints(self): |
| 98 | + pypi.update_state_and_checkpoints( |
| 99 | + config_repo=self.config_repo, |
| 100 | + last_serial=self.last_serial, |
| 101 | + logger=self.log, |
| 102 | + ) |
0 commit comments