|
1 | 1 | #!/usr/bin/env python3 |
2 | 2 | import os |
3 | 3 | import json |
| 4 | +from urllib.parse import urlparse |
4 | 5 |
|
5 | 6 | BASE_PATH = os.path.dirname(os.path.abspath(__file__)) |
6 | 7 |
|
7 | | -from mirror import URLS_BASE, CONFIG_SOURCES_FILE # импорт из mirror.py (нижний регистр) |
| 8 | +from mirror import URLS_BASE, CONFIG_SOURCES_FILE # mirror.py в корне репо |
8 | 9 |
|
9 | 10 | MERMEROO_FILE = os.path.join(BASE_PATH, "mermeroo_sources.txt") |
10 | 11 | OUT_FILE = os.path.join(BASE_PATH, "mermeroo_only_new_for_mirror.txt") |
@@ -33,43 +34,108 @@ def load_config_sources(): |
33 | 34 |
|
34 | 35 |
|
def extract_repo_key(url: str) -> str:
    """Return an ``owner/repo`` key for GitHub raw-content URLs.

    Two URL shapes are recognised:

    * ``.../raw.githubusercontent.com/<owner>/<repo>/...``
    * ``.../github.com/<owner>/<repo>/...`` (or ``www.github.com``) when the
      URL also contains ``/raw/``

    The host is located by splitting the URL on ``/`` instead of parsing the
    netloc, so a URL that embeds a GitHub URL inside its own path is keyed by
    the embedded repository too. Host comparison is case-insensitive; the
    owner and repository segments are returned exactly as written.

    Args:
        url: Source URL to derive a key from.

    Returns:
        ``"<owner>/<repo>"`` when a recognised host is followed by two
        non-empty path segments, otherwise ``url`` unchanged.
    """
    hosts = ["raw.githubusercontent.com"]
    if "/raw/" in url:
        hosts += ["github.com", "www.github.com"]

    segments = url.split("/")
    lowered = [segment.lower() for segment in segments]

    # Try every candidate host: a failed match on the first one must not
    # prevent the remaining ones from being checked.
    for host in hosts:
        if host not in lowered:
            continue
        start = lowered.index(host) + 1
        owner_repo = segments[start:start + 2]
        # Both segments must exist and be non-empty, otherwise the key
        # would look like "owner/" and collide with nothing useful.
        if len(owner_repo) == 2 and all(owner_repo):
            return "/".join(owner_repo)

    return url
| 59 | + |
| 60 | + |
# URL fragments that mark subscription APIs, HTML pages and similar
# endpoints that are not plain source lists.
_BAD_URL_SUBSTRINGS = (
    "/clash/proxies",
    "/api/v1/client/subscribe",
    "/subscribe?",
    "token=",
    "/wp-content/",
    ".html",
    ".htm",
    "/free-ss",
    "/free-ssr",
    "/v2ray/",
    "/free/",
)

# Path suffixes of plain-text files that are kept as potential sources.
_GOOD_PATH_EXTENSIONS = (".txt", ".yaml", ".yml", ".json")

# (domain, path fragment that must be present) for raw-file hosting services.
_RAW_HOSTING_RULES = (
    ("raw.githubusercontent.com", ""),
    ("github.com", "/raw/"),
    ("gitlab.com", "/-/raw/"),
    ("bitbucket.org", "/raw/"),
    ("jsdelivr.net", "/gh/"),
)


def _host_in_domain(host: str, domain: str) -> bool:
    """Return True when *host* is *domain* itself or one of its subdomains."""
    return host == domain or host.endswith("." + domain)


def is_good_source_url(url: str) -> bool:
    """Decide whether *url* looks like a usable plain source list.

    A URL is accepted when it uses ``http``/``https`` and either

    * points at a raw file on a known hosting service (GitHub raw,
      GitLab ``/-/raw/``, Bitbucket ``/raw/``, jsDelivr ``/gh/``), or
    * contains none of the blacklisted fragments and its path ends with
      ``.txt``, ``.yaml``, ``.yml`` or ``.json``.

    The hosting-service check runs first, so such URLs are accepted even if
    they contain a blacklisted fragment.

    Args:
        url: Candidate source URL.

    Returns:
        True if the URL should be kept, False otherwise. Malformed URLs that
        ``urlparse`` rejects are reported as False instead of raising.
    """
    try:
        parsed = urlparse(url.strip())
        # ``hostname`` drops userinfo and port, unlike ``netloc``.
        host = (parsed.hostname or "").lower()
    except ValueError:
        # e.g. an unbalanced "[" in the netloc ("Invalid IPv6 URL").
        return False

    if parsed.scheme not in ("http", "https"):
        return False

    path = (parsed.path or "").lower()

    # Raw files served by Git hosting services / CDNs.
    for domain, fragment in _RAW_HOSTING_RULES:
        if _host_in_domain(host, domain) and fragment in path:
            return True

    # Subscription APIs, clash/proxies endpoints and ordinary web pages.
    lowered_url = url.lower()
    if any(bad in lowered_url for bad in _BAD_URL_SUBSTRINGS):
        return False

    # Remaining plain-text files are kept as potential sources.
    return path.endswith(_GOOD_PATH_EXTENSIONS)
46 | 107 |
|
47 | 108 |
|
def main():
    """Report mermeroo sources unknown to mirror.py and save the new URLs.

    Loads the mermeroo list, keeps only URLs accepted by
    ``is_good_source_url``, drops those already present in ``URLS_BASE`` or
    in the configured sources, prints the counts and the new repository
    keys, and writes the remaining URLs to ``OUT_FILE`` separated by
    newlines.
    """
    all_sources = load_mermeroo_sources()
    print(f"Всего в mermeroo_sources.txt: {len(all_sources)}")

    # 1) keep only mermeroo URLs of an acceptable type
    candidates = list(filter(is_good_source_url, all_sources))
    print(f"Из них после фильтра по типу URL: {len(candidates)}")

    # 2) every source mirror.py already knows about
    known_urls = set(URLS_BASE) | load_config_sources()

    # 3) drop URLs that are already known
    fresh_urls = [src for src in candidates if src not in known_urls]

    # 4) repository keys that do not occur among the known URLs
    known_repos = set(map(extract_repo_key, known_urls))
    candidate_repos = set(map(extract_repo_key, candidates))
    fresh_repos = sorted(candidate_repos - known_repos)

    print(f"Новых URL (нет в Mirror, после фильтра): {len(fresh_urls)}")
    print(f"Новых репозиториев-доноров: {len(fresh_repos)}")
    for repo in fresh_repos:
        print(" ", repo)

    # 5) save only the new, filtered URLs
    with open(OUT_FILE, "w", encoding="utf-8") as out:
        out.write("\n".join(fresh_urls))

    print(f"\nСписок новых отфильтрованных URL сохранён в {OUT_FILE}")
73 | 139 |
|
74 | 140 |
|
75 | 141 | if __name__ == "__main__": |
|
0 commit comments