Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 64 additions & 14 deletions minecode/collectors/maven.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,9 +476,6 @@ def process_request(purl_str, **kwargs):


collect_links = re.compile(r'href="([^"]+)"').findall
collect_links_and_artifact_timestamps = re.compile(
r'<a href="([^"]+)".*</a>\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)'
).findall


def check_if_file_name_is_linked_on_page(file_name, links, **kwargs):
Expand Down Expand Up @@ -675,6 +672,62 @@ def filter_for_artifacts(timestamps_by_links):
return timestamps_by_links_filtered


# Directory-listing row formats, one per supported repository flavour.
# They are compiled once at import time instead of on every call.

# https://repo.maven.apache.org/maven2
# The timestamp group is either "YYYY-MM-DD HH:MM" or a bare "-".
_MAVEN_APACHE_LINK_RE = re.compile(
    r'<a href="([^"]+)"[^>]*>[^<]*</a>\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)'
)

# HTML-table listings, with both "UTC" and "Z" zone markers:
# https://repository.jboss.org/nexus/service/rest/repository/browse/releases/
# https://repository.jboss.org/nexus/service/rest/repository/browse/public/
# https://repository.apache.org/snapshots/
# The timestamp cell holds "&nbsp;" when no date is listed.
_NEXUS_TABLE_LINK_RE = re.compile(
    r'<a href="([^"]+)"[^>]*>[^<]*</a></td>\s*<td>\s*((?:[A-Z][a-z]{2}\s+[A-Z][a-z]{2}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+(?:UTC|Z)\s+\d{4})|&nbsp;)\s*</td>'
)

# https://repo.spring.io/milestone
_REPO_SPRING_LINK_RE = re.compile(
    r'<a href="([^"]+)"[^>]*>[^<]*</a>\s+(\d{2}-[A-Z][a-z]{2}-\d{4}\s+\d{2}:\d{2})'
)

# Bare links, each wrapped in its own <pre>, without any timestamp:
# https://plugins.gradle.org/m2/
_PLUGINS_GRADLE_LINK_RE = re.compile(r'<pre><a href="([^"]+)"[^>]*>[^<]*</a></pre>')


def collect_links_and_artifact_timestamps(text):
    """
    Return a list of (link, timestamp) tuples extracted from the directory
    listing HTML `text`.

    The known listing formats are tried in a fixed order and the results of
    the first format that matches anything are returned; formats are never
    mixed. The timestamp is returned as the string found on the page, except
    that it is an empty string when the format carries no timestamp or the
    timestamp cell is blank. Return an empty list if no format matches.
    """
    maven_apache_matches = _MAVEN_APACHE_LINK_RE.findall(text)
    if maven_apache_matches:
        return maven_apache_matches

    nexus_table_matches = _NEXUS_TABLE_LINK_RE.findall(text)
    if nexus_table_matches:
        # A blank table cell is rendered as "&nbsp;": report it as no timestamp.
        return [
            (link, "" if timestamp == "&nbsp;" else timestamp)
            for link, timestamp in nexus_table_matches
        ]

    repo_spring_matches = _REPO_SPRING_LINK_RE.findall(text)
    if repo_spring_matches:
        return repo_spring_matches

    # Drop parent directory links ("../"); this yields an empty list when
    # nothing matched or when only parent links were found.
    return [
        (link, "")
        for link in _PLUGINS_GRADLE_LINK_RE.findall(text)
        if not link.startswith("..")
    ]


def collect_links_from_text(text, filter):
"""
Return a mapping of link locations and their timestamps, given HTML `text`
Expand All @@ -700,7 +753,7 @@ def create_absolute_urls_for_links(text, url, filter):
url = url.rstrip("/")
timestamps_by_links = collect_links_from_text(text, filter)
for link, timestamp in timestamps_by_links.items():
if not link.startswith(url):
if not link.startswith("http:") and not link.startswith("https:"):
link = f"{url}/{link}"
timestamps_by_absolute_links[link] = timestamp
return timestamps_by_absolute_links
Expand Down Expand Up @@ -758,23 +811,20 @@ def get_artifact_sha1(artifact_url):
return sha1


def get_classifier_from_artifact_url(
artifact_url, package_version_page_url, package_name, package_version
):
def get_classifier_from_artifact_url(artifact_url, package_name, package_version):
"""
Return the classifier from a Maven artifact URL `artifact_url`, otherwise
return None if a classifier cannot be determined from `artifact_url`
"""
classifier = None
# https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0
package_version_page_url = package_version_page_url.rstrip("/")
# https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0
leading_url_portion = f"{package_version_page_url}/{package_name}-{package_version}"
package_name_version_portion = f"{package_name}-{package_version}"
artifact_url_filename = artifact_url.rsplit("/", 1)[-1]
remaining_url_portion = artifact_url_filename.replace(package_name_version_portion, "")
# artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar'
# ['', '-onejar.jar']
_, remaining_url_portion = artifact_url.split(leading_url_portion)
# ['-onejar', 'jar']
# artifact_url_filename = 'livereload-jvm-0.2.0-onejar.jar'
# remaining_url_portion = '-onejar.jar'
remaining_url_portions = remaining_url_portion.split(".")
# ['-onejar', 'jar']
if remaining_url_portions and remaining_url_portions[0]:
# '-onejar'
classifier = remaining_url_portions[0]
Expand Down
8 changes: 5 additions & 3 deletions minecode/management/commands/import_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,14 @@ def process_request(importable_uri):
timestamps_by_artifact_links = get_artifact_links(version_page_url)
for artifact_link, timestamp in timestamps_by_artifact_links.items():
sha1 = get_artifact_sha1(artifact_link)
classifier = get_classifier_from_artifact_url(
artifact_link, version_page_url, name, version
)
classifier = get_classifier_from_artifact_url(artifact_link, name, version)
qualifiers = None
if classifier:
qualifiers = f"classifier={classifier}"
if timestamp:
release_date = dateutil_parse(timestamp)
else:
release_date = None
release_date = dateutil_parse(timestamp)
package_data = PackageData(
type="maven",
Expand Down
14 changes: 12 additions & 2 deletions minecode/management/commands/maven_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,15 @@ class Command(VerboseCommand):
help = "Run a Package request queue."

def handle(self, *args, **options):
    """
    Call `crawl_maven_repo_from_root` once for each known Maven root URL,
    in the order they are listed below.
    """
    # Ref: https://github.com/aboutcode-org/purldb/issues/630#issuecomment-3599942716
    maven_root_urls = [
        "https://repo.maven.apache.org/maven2",
        "https://repo.spring.io/artifactory/milestone",
        "https://plugins.gradle.org/m2",
        "https://repository.apache.org/content/groups/snapshots",
        "https://repository.jboss.org/nexus/service/rest/repository/browse/releases",
        "https://repository.jboss.org/nexus/service/rest/repository/browse/public",
    ]
    # Maven Central is the first entry of the list, so it must not also be
    # crawled on its own before the loop.
    for maven_root_url in maven_root_urls:
        crawl_maven_repo_from_root(root_url=maven_root_url)
78 changes: 73 additions & 5 deletions minecode/tests/collectors/test_maven.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def test_get_merged_ancestor_package_from_maven_package(


class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase):
test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles")
test_data_dir = os.path.join(os.path.dirname(__file__), "../testfiles")

def test_check_if_file_name_is_linked_on_page(self):
links = ["foo/", "bar/", "baz/"]
Expand Down Expand Up @@ -500,12 +500,80 @@ def test_get_artifact_sha1(self, mock_request_get):

def test_get_classifier_from_artifact_url(self):
    # The classifier is computed from the artifact URL, the package name and
    # the package version only; the version page URL is no longer an argument.
    artifact_url = "https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar"
    package_name = "livereload-jvm"
    package_version = "0.2.0"
    classifier = maven.get_classifier_from_artifact_url(
        artifact_url, package_name, package_version
    )
    self.assertEqual("onejar", classifier)

def test_collect_links_and_artifact_timestamps_repo_maven_apache_org(self):
    # Listing format served by https://repo.maven.apache.org/maven2
    fixture_loc = self.get_test_loc("maven/html/maven.apache.org/abbot.html")
    with open(fixture_loc) as html_file:
        page_html = html_file.read()

    results = maven.collect_links_and_artifact_timestamps(page_html)

    # The parent directory link carries no timestamp and is not reported.
    self.assertEqual(
        [
            ("1.4.0/", "2015-09-22 16:03"),
            ("maven-metadata.xml", "2015-09-24 14:18"),
        ],
        results,
    )

def test_collect_links_and_artifact_timestamps_repository_jboss_org(self):
    # Table listing format served by:
    # https://repository.jboss.org/nexus/service/rest/repository/browse/public/
    # https://repository.jboss.org/nexus/service/rest/repository/browse/releases/
    fixture_loc = self.get_test_loc("maven/html/repository.jboss.org/commons-codec.html")
    with open(fixture_loc) as html_file:
        page_html = html_file.read()

    results = maven.collect_links_and_artifact_timestamps(page_html)

    # The "1.2/" entry is expected with an empty timestamp.
    self.assertEqual(
        [
            ("1.2/", ""),
            (
                "https://repository.jboss.org/nexus/repository/public/apache-codec/commons-codec/maven-metadata.xml",
                "Fri Sep 05 09:38:07 Z 2025",
            ),
        ],
        results,
    )

def test_collect_links_and_artifact_timestamps_repository_apache_org(self):
    # Table listing format served by https://repository.apache.org/snapshots/
    fixture_loc = self.get_test_loc("maven/html/repository.apache.org/common-chain.html")
    with open(fixture_loc) as html_file:
        page_html = html_file.read()

    results = maven.collect_links_and_artifact_timestamps(page_html)

    # Links on this page are absolute URLs that all share the same base.
    base_url = (
        "https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/"
    )
    self.assertEqual(
        [
            (base_url + "1.3-SNAPSHOT/", "Thu Jul 04 05:45:00 UTC 2013"),
            (base_url + "2.0-SNAPSHOT/", "Tue Aug 21 20:26:48 UTC 2018"),
            (base_url + "maven-metadata.xml.md5", "Tue Aug 21 20:26:47 UTC 2018"),
            (base_url + "maven-metadata.xml.sha1", "Tue Aug 21 20:26:47 UTC 2018"),
        ],
        results,
    )

def test_collect_links_and_artifact_timestamps_repo_spring_io(self):
    # Listing format served by https://repo.spring.io/milestone
    fixture_loc = self.get_test_loc("maven/html/repo.spring.io/scstest.html")
    with open(fixture_loc) as html_file:
        page_html = html_file.read()

    results = maven.collect_links_and_artifact_timestamps(page_html)

    # Timestamps are reported as found on the page ("DD-Mon-YYYY HH:MM").
    self.assertEqual(
        [
            ("0.0.11.M2/", "07-Aug-2019 08:40"),
            ("0.0.11.RC2/", "07-Aug-2019 08:36"),
            ("maven-metadata.xml", "07-Aug-2019 09:07"),
        ],
        results,
    )

def test_collect_links_and_artifact_timestamps_plugin_gradle_org(self):
    # Listing format served by https://plugins.gradle.org/m2/
    fixture_loc = self.get_test_loc("maven/html/plugins.gradle.org/test.html")
    with open(fixture_loc) as html_file:
        page_html = html_file.read()

    results = maven.collect_links_and_artifact_timestamps(page_html)

    # This listing has no timestamps, so every link has an empty one.
    self.assertEqual(
        [
            ("0.0.10/", ""),
            ("1.0.1/", ""),
            ("1.1.0/", ""),
            ("maven-metadata.xml", ""),
        ],
        results,
    )
29 changes: 29 additions & 0 deletions minecode/tests/testfiles/maven/html/maven.apache.org/abbot.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<!DOCTYPE html>
<html>

<head>
<title>Central Repository: abbot/abbot</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<style>
body {
background: #fff;
}
</style>
</head>

<body>
<header>
<h1>abbot/abbot</h1>
</header>
<hr/>
<main>
<pre id="contents">
<a href="../">../</a>
<a href="1.4.0/" title="1.4.0/">1.4.0/</a> 2015-09-22 16:03 -
<a href="maven-metadata.xml" title="maven-metadata.xml">maven-metadata.xml</a> 2015-09-24 14:18 402
</pre>
</main>
<hr/>
</body>

</html>
10 changes: 10 additions & 0 deletions minecode/tests/testfiles/maven/html/plugins.gradle.org/test.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<html>
<head><script type='text/javascript' src='https://plugins.gradle.org/dozHnTTSNrd5c_IjGSXETRWLhbS7W7Sl-H-qWMlAJ-nnJaXDMcobTtriNFDE_NxL6mMWRjdcd0usaDqblFsN2Apks_z6IZCWIaCbLqGNPxgytmr6wYHb8SFa8Vogcg7u9QgA3Me1ndlareEd1AF6UF-iHMCznbe9q8_RnrT36M8='></script>
</head>
<body>
<pre><a href="0.0.10/">0.0.10/</a></pre>
<pre><a href="1.0.1/">1.0.1/</a></pre>
<pre><a href="1.1.0/">1.1.0/</a></pre>
<pre><a href="maven-metadata.xml">maven-metadata.xml</a></pre>
</body>
</html>
14 changes: 14 additions & 0 deletions minecode/tests/testfiles/maven/html/repo.spring.io/scstest.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<head><meta name="robots" content="noindex" />
<title>Index of milestone/com/albertoimpl/test/scstest/releasetest</title>
</head>
<body>
<h1>Index of milestone/com/albertoimpl/test/scstest/releasetest</h1>
<pre>Name Last modified Size</pre><hr/>
<pre><a href="../">../</a>
<a href="0.0.11.M2/">0.0.11.M2/</a> 07-Aug-2019 08:40 -
<a href="0.0.11.RC2/">0.0.11.RC2/</a> 07-Aug-2019 08:36 -
<a href="maven-metadata.xml">maven-metadata.xml</a> 07-Aug-2019 09:07 449 bytes
</pre>
<hr/><address style="font-size:small;">Artifactory Online Server</address></body></html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
<html>
<head><script type='text/javascript' src='https://repository.apache.org/A0HnoQhqkwFlCnYdGIUSeC6QxiXycDJQit71DQvwBneOTWUYqDfnR_rRZFOArTJIFfR1XaDdweihXOZeZY0IVNNCr8eYelM995osm88CBpWrw07LyaggpNRPkoPQFO9dPSmatFhFWILy9VivvOWnrB5M2ymOQX0LCcQpRa7ItTQ='></script>
<title>Index of /groups/snapshots/commons-chain/commons-chain</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>

<link rel="icon" type="image/png" href="https://repository.apache.org/favicon.png">
<!--[if IE]>
<link rel="SHORTCUT ICON" href="https://repository.apache.org/favicon.ico"/>
<![endif]-->

<link rel="stylesheet" href="https://repository.apache.org/static/css/Sonatype-content.css?2.15.2-03" type="text/css" media="screen" title="no title" charset="utf-8">
</head>
<body>
<h1>Index of /groups/snapshots/commons-chain/commons-chain</h1>
<table cellspacing="10">
<tr>
<th align="left">Name</th>
<th>Last Modified</th>
<th>Size</th>
<th>Description</th>
</tr>
<tr>
<td><a href="../">Parent Directory</a></td>
</tr>
<tr>
<td><a href="https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/1.3-SNAPSHOT/">1.3-SNAPSHOT/</a></td>
<td>Thu Jul 04 05:45:00 UTC 2013</td>
<td align="right">
&nbsp;
</td>
<td></td>
</tr>
<tr>
<td><a href="https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/2.0-SNAPSHOT/">2.0-SNAPSHOT/</a></td>
<td>Tue Aug 21 20:26:48 UTC 2018</td>
<td align="right">
&nbsp;
</td>
<td></td>
</tr>
<tr>
<td><a href="https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/maven-metadata.xml.md5">maven-metadata.xml.md5</a></td>
<td>Tue Aug 21 20:26:47 UTC 2018</td>
<td align="right">
33
</td>
<td></td>
</tr>
<tr>
<td><a href="https://repository.apache.org/content/groups/snapshots/commons-chain/commons-chain/maven-metadata.xml.sha1">maven-metadata.xml.sha1</a></td>
<td>Tue Aug 21 20:26:47 UTC 2018</td>
<td align="right">
41
</td>
<td></td>
</tr>
</table>
</body>
</html>
Loading
Loading