Skip to content

Commit c4d9199

Browse files
authored
Update compare_mermeroo_mirror.py
Signed-off-by: саня <109840177+kort0881@users.noreply.github.com>
1 parent 5c39c85 commit c4d9199

File tree

1 file changed

+84
-18
lines changed

1 file changed

+84
-18
lines changed

compare_mermeroo_mirror.py

Lines changed: 84 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
#!/usr/bin/env python3
22
import os
33
import json
4+
from urllib.parse import urlparse
45

56
# Absolute path of the directory containing this script; the data file
# paths defined below are built relative to it.
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
67

7-
from mirror import URLS_BASE, CONFIG_SOURCES_FILE # импорт из mirror.py (нижний регистр)
8+
from mirror import URLS_BASE, CONFIG_SOURCES_FILE # mirror.py в корне репо
89

910
# Path to "mermeroo_sources.txt" next to this script.
# NOTE(review): presumably the input read by load_mermeroo_sources() — confirm.
MERMEROO_FILE = os.path.join(BASE_PATH, "mermeroo_sources.txt")
1011
# Output file next to this script; main() overwrites it with the new URLs,
# one per line.
OUT_FILE = os.path.join(BASE_PATH, "mermeroo_only_new_for_mirror.txt")
@@ -33,43 +34,108 @@ def load_config_sources():
3334

3435

3536
def extract_repo_key(url: str) -> str:
    """Return a key identifying the GitHub repository that *url* points into.

    Two URL shapes are recognised:

    * ``raw.githubusercontent.com/<owner>/<repo>/...``
    * ``github.com/<owner>/<repo>/.../raw/...``

    For those the key is ``"<owner>/<repo>"``. Every other URL, and any
    recognised URL that lacks an owner or repository segment, is returned
    unchanged so it can still serve as its own key.

    The host is taken from the parsed URL rather than searched for as a
    substring, so a host name that merely appears inside the path or query
    string of an unrelated URL is not mistaken for a GitHub link.

    Args:
        url: Source URL, with or without a scheme.

    Returns:
        ``"<owner>/<repo>"`` for recognised GitHub raw URLs, otherwise *url*.
    """
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # Scheme-less input such as "raw.githubusercontent.com/o/r/f" is
            # parsed as a bare path; re-parse it as a network-path reference
            # so the host is still recognised.
            parsed = urlparse(f"//{url}")
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): treat as opaque.
        return url

    is_raw_host = host == "raw.githubusercontent.com"
    is_github_raw = (
        host in ("github.com", "www.github.com") and "/raw/" in parsed.path
    )
    if not (is_raw_host or is_github_raw):
        return url

    # The first two non-empty path segments are the owner and the repository.
    segments = [segment for segment in parsed.path.split("/") if segment]
    if len(segments) < 2:
        return url

    owner, repo = segments[:2]
    return f"{owner}/{repo}"
59+
60+
61+
# ✅ оставляем только «нормальные» источники
62+
def _host_matches(host: str, domain: str) -> bool:
    """Return True if *host* is *domain* itself or one of its subdomains."""
    return host == domain or host.endswith("." + domain)


def is_good_source_url(url: str) -> bool:
    """Decide whether *url* looks like a directly fetchable plain-text source.

    A URL is accepted when it is an http(s) URL with a host and either

    * points at a known raw-file endpoint (GitHub raw, GitLab ``/-/raw/``,
      Bitbucket ``/raw/``, jsDelivr ``/gh/``), or
    * contains none of the rejected markers (subscription APIs, tokens,
      HTML pages, ...) and its path ends in ``.txt``, ``.yaml``, ``.yml``
      or ``.json``.

    Hosts are compared as whole domains (exact match or subdomain), so
    look-alike hosts such as ``notgithub.com`` do not pass as GitHub.

    Args:
        url: Candidate source URL.

    Returns:
        True if the URL should be kept, False otherwise. Malformed URLs
        yield False rather than raising.
    """
    try:
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. broken IPv6 hosts).
        return False

    # urlparse lower-cases the scheme; require exactly http or https.
    if parsed.scheme not in ("http", "https") or not host:
        return False

    path = parsed.path.lower()

    # Git raw / CDN raw endpoints: (domain, marker that must occur in path).
    # An empty marker means any path on that domain is accepted.
    raw_endpoints = (
        ("raw.githubusercontent.com", ""),
        ("github.com", "/raw/"),
        ("gitlab.com", "/-/raw/"),
        ("bitbucket.org", "/raw/"),
        ("jsdelivr.net", "/gh/"),
    )
    if any(
        _host_matches(host, domain) and marker in path
        for domain, marker in raw_endpoints
    ):
        return True

    # Reject anything that is clearly "clash/proxies", a subscription API,
    # or an ordinary web page.
    bad_substrings = (
        "/clash/proxies",
        "/api/v1/client/subscribe",
        "/subscribe?",
        "token=",
        "/wp-content/",
        ".html",
        ".htm",
        "/free-ss",
        "/free-ssr",
        "/v2ray/",
        "/free/",
    )
    lowered_url = url.lower()
    if any(marker in lowered_url for marker in bad_substrings):
        return False

    # Plain text / structured files elsewhere are kept as potential sources.
    good_exts = (".txt", ".yaml", ".yml", ".json")
    return path.endswith(good_exts)
46107

47108

48109
def main():
    """Compare the mermeroo source list with what mirror.py already knows.

    Prints summary counts and the list of new donor repositories, then
    writes the new, filtered URLs to OUT_FILE, one per line.
    """
    all_sources = load_mermeroo_sources()
    print(f"Всего в mermeroo_sources.txt: {len(all_sources)}")

    # 1) Keep only URLs that pass the source-type filter.
    candidates = [src for src in all_sources if is_good_source_url(src)]
    print(f"Из них после фильтра по типу URL: {len(candidates)}")

    # 2) Everything mirror.py already knows about.
    known_urls = set(URLS_BASE) | load_config_sources()

    # 3) Drop URLs that are already known.
    fresh_urls = [src for src in candidates if src not in known_urls]

    # 4) Repositories that appear in the candidates but not in known sources.
    known_repos = {extract_repo_key(src) for src in known_urls}
    candidate_repos = {extract_repo_key(src) for src in candidates}
    fresh_repos = sorted(candidate_repos - known_repos)

    print(f"Новых URL (нет в Mirror, после фильтра): {len(fresh_urls)}")
    print(f"Новых репозиториев-доноров: {len(fresh_repos)}")
    for repo in fresh_repos:
        print(" ", repo)

    # 5) Save only the new filtered URLs to the output file.
    with open(OUT_FILE, "w", encoding="utf-8") as out:
        out.write("\n".join(fresh_urls))

    print(f"\nСписок новых отфильтрованных URL сохранён в {OUT_FILE}")
73139

74140

75141
if __name__ == "__main__":

0 commit comments

Comments
 (0)