-
-
Notifications
You must be signed in to change notification settings - Fork 68
Expand file tree
/
Copy pathmine_maven.py
More file actions
118 lines (105 loc) · 4.8 KB
/
mine_maven.py
File metadata and controls
118 lines (105 loc) · 4.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
from scanpipe.pipes import federatedcode
from minecode_pipelines import pipes
from minecode_pipelines.pipelines import MineCodeBasePipeline
from minecode_pipelines.pipelines import _mine_and_publish_packageurls
from minecode_pipelines.pipes import maven
class MineMaven(MineCodeBasePipeline):
    """
    Mine PackageURLs from maven index and publish them to FederatedCode.

    Progress is persisted as a checkpoint file stored in the pipeline
    configuration repository. The checkpoint records the index
    "last_incremental" value, the index checksum, and the last processed
    purl, so that a later run against an unchanged index can be handed the
    purl where the previous run stopped.
    """

    # Repository cloned to read and update the checkpoint file.
    pipeline_config_repo = "https://github.com/aboutcode-data/minecode-pipelines-config/"
    # Location of the checkpoint file, passed as ``path`` to the checkpoint helpers.
    checkpoint_path = "maven/checkpoints.json"
    # Forwarded unchanged to ``_mine_and_publish_packageurls``.
    append_purls = True

    @classmethod
    def steps(cls):
        """Return the pipeline steps, in execution order."""
        return (
            cls.check_federatedcode_eligibility,
            cls.create_federatedcode_working_dir,
            cls.fetch_federation_config,
            cls.fetch_checkpoint_and_maven_index,
            cls.mine_and_publish_maven_packageurls,
            cls.save_check_point,
            cls.delete_working_dir,
        )

    def fetch_checkpoint_and_maven_index(self):
        """
        Clone the config repository, load the checkpoint, and create the
        Maven Nexus collector.

        Set ``self.last_processed_purl`` to the purl stored in the checkpoint
        when the stored index checksum equals the collector's current one;
        otherwise leave it as None.
        """
        config_clone_path = self.working_path / "minecode-pipelines-config"
        self.checkpoint_config_repo = federatedcode.clone_repository(
            repo_url=self.pipeline_config_repo,
            clone_path=config_clone_path,
            logger=self.log,
        )

        saved_state = pipes.get_checkpoint_from_file(
            cloned_repo=self.checkpoint_config_repo,
            path=self.checkpoint_path,
        )

        last_incremental = saved_state.get("last_incremental")
        self.log(f"last_incremental: {last_incremental}")

        collector = maven.MavenNexusCollector(
            last_incremental=last_incremental,
            logger=self.log,
        )
        self.maven_nexus_collector = collector

        # Resuming is only considered when a checksum was stored previously
        # and it still matches the collector's current index checksum.
        stored_checksum = saved_state.get("index_checksum")
        live_checksum = collector.index_checksum
        self.last_processed_purl = None

        if not stored_checksum:
            # No stored checksum to compare against: nothing to resume from.
            return

        if stored_checksum != live_checksum:
            self.log(
                "Index checksum changed. Starting from beginning."
            )
            return

        self.last_processed_purl = saved_state.get("last_processed_purl")
        if self.last_processed_purl:
            self.log(
                f"Index checksum matches. Resuming from: {self.last_processed_purl}"
            )

    def mine_and_publish_maven_packageurls(self):
        """
        Pass the collector's packages to ``_mine_and_publish_packageurls``.

        The collector is given ``self.last_processed_purl`` and
        ``self._save_checkpoint`` is supplied as the checkpoint callback.
        """
        packageurls = self.maven_nexus_collector.get_packages(
            last_processed_purl=self.last_processed_purl,
        )
        _mine_and_publish_packageurls(
            packageurls=packageurls,
            total_package_count=None,
            data_cluster=self.data_cluster,
            checked_out_repos=self.checked_out_repos,
            working_path=self.working_path,
            append_purls=self.append_purls,
            commit_msg_func=self.commit_message,
            logger=self.log,
            checkpoint_func=self._save_checkpoint,
        )

    def _save_checkpoint(self):
        """Save current progress as a checkpoint."""
        collector = self.maven_nexus_collector
        last_incremental = collector.index_properties.get(
            "nexus.index.last-incremental"
        )
        checkpoint = {
            "last_incremental": last_incremental,
            "index_checksum": collector.index_checksum,
            "last_processed_purl": collector.last_processed_purl,
        }
        self.log(f"Saving checkpoint: {checkpoint}")

        pipes.update_checkpoints_in_github(
            checkpoint=checkpoint,
            cloned_repo=self.checkpoint_config_repo,
            path=self.checkpoint_path,
            logger=self.log,
        )

    def save_check_point(self):
        """Pipeline step that saves the checkpoint via ``_save_checkpoint``."""
        self._save_checkpoint()