diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ba098fb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.maigret.json
+
+venv
+__pycache__
diff --git a/MAIGRET.md b/MAIGRET.md
new file mode 100644
index 0000000..21892da
--- /dev/null
+++ b/MAIGRET.md
@@ -0,0 +1,14 @@
+## Maigret Exporter
+
+Run Marple with the following parameters to export new sites to Maigret:
+
+```
+python3 marple.py text --plugins maigret extract_username maigret_export random_username
+```
+
+### TODO
+
+- [ ] Add a direct integration with Maigret (`--submit`)
+- [ ] Implement a GitHub API call to create an issue with the parameters of a new site
+- [ ] Use AI to determine whether a link looks like an account page
+- [ ] Add AI-based generation of tags for a website
diff --git a/marple.py b/marple.py
index 63d7fda..456eb21 100755
--- a/marple.py
+++ b/marple.py
@@ -11,6 +11,9 @@
 import aiohttp
 import requests
+import random
+import string
+import difflib
 import tqdm
 from aiohttp_socks import ProxyConnector
 from bs4 import BeautifulSoup as bs
@@ -35,6 +38,118 @@
     '/search?q=',
 ]
 
+def ai_generate_username():
+    url = "http://localhost:1234/v1/chat/completions"
+    headers = {
+        "Content-Type": "application/json"
+    }
+    data = {
+        "model": "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
+        "messages": [
+            {"role": "system", "content": "Always answer with a message that contains only an answer, without any comments and explanations"},
+            {"role": "user", "content": "Give a random internet username"}
+        ],
+        "temperature": 0.7,
+        "max_tokens": -1,
+        "stream": False
+    }
+
+    try:
+        response = requests.post(url, headers=headers, json=data)
+        username = response.json()["choices"][0]["message"]["content"]
+        username = username.strip('"')
+        return username
+    except Exception:
+        raise Exception("The LLM AI endpoint is not available. Please edit the LLM API endpoint settings in the source code")
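+
+# Editor's sketch: ai_generate_username() above relies on the OpenAI-compatible
+# chat-completions schema that LM Studio emulates. A typical response body looks
+# roughly like this (an assumed shape for illustration; exact fields vary by
+# server version, and the username value "pixel_nomad" is hypothetical):
+#
+#   {
+#     "choices": [
+#       {"index": 0, "message": {"role": "assistant", "content": "pixel_nomad"}}
+#     ]
+#   }
+#
+# which is why the code reads response.json()["choices"][0]["message"]["content"].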
+
+def generate_random_username():
+    return ''.join(random.choices(string.ascii_lowercase, k=10))
+
+def maigret_exporter(link):
+    username = link.name
+    random_username = generate_random_username()
+    try:
+        first_html_response = requests.get(link.url).text
+        url_of_non_existing_account = link.url.lower().replace(username.lower(), random_username)
+        second_html_response = requests.get(url_of_non_existing_account).text
+    except Exception as e:
+        return None, None, str(e)
+
+    SEPARATORS = "\"'\n"
+    TOP_FEATURES = 5
+
+    # Tokenize both pages and diff the token sets: strings unique to the real
+    # profile page are presence markers, strings unique to the page of a
+    # non-existing account are absence markers
+    tokens_a = set(re.split(f'[{SEPARATORS}]', first_html_response))
+    tokens_b = set(re.split(f'[{SEPARATORS}]', second_html_response))
+
+    a_minus_b = tokens_a.difference(tokens_b)
+    b_minus_a = tokens_b.difference(tokens_a)
+
+    a_minus_b = list(map(lambda x: x.strip('\\'), a_minus_b))
+    b_minus_a = list(map(lambda x: x.strip('\\'), b_minus_a))
+
+    # Filter out strings containing usernames
+    a_minus_b = [s for s in a_minus_b if username not in s]
+    b_minus_a = [s for s in b_minus_a if random_username not in s]
+
+    if len(a_minus_b) == len(b_minus_a) == 0:
+        return None, None, "HTML responses are the same"
+
+    presence_strings = [
+        "username",
+        "not found",
+        "пользователь",
+        "profile",
+        "lastname",
+        "firstname",
+        "biography",
+        "birthday",
+        "репутация",
+        "информация",
+        "e-mail"
+    ]
+
+    def get_match_ratio(base_strs: list):
+        def get_match_inner(s: str):
+            return round(
+                max(
+                    [
+                        difflib.SequenceMatcher(a=s.lower(), b=s2.lower()).ratio()
+                        for s2 in base_strs
+                    ]
+                ),
+                2,
+            )
+        return get_match_inner
+
+    match_fun = get_match_ratio(presence_strings)
+
+    presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[:TOP_FEATURES]
+    absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[:TOP_FEATURES]
+
+    return presence_list, absence_list, "Found"
+
+def extract_username_from_url(site_url):
+    url = "http://localhost:1234/v1/chat/completions"
+    headers = {
+        "Content-Type": "application/json"
+    }
+    data = {
+        "model": "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
+        "messages": [
+            # {"role": "system", "content": "Always answer with a message that contains only an answer (one word), without any comments and explanations"},
+            {"role": "user", "content": f"Extract the username from the URL: {site_url}. The username is the part of the URL that comes immediately after the last '/' separator and may include '.', '-', and '_' as valid characters. The username should exclude any prefixes like 'http', 'https', 'www', or any trailing query parameters ('?') or fragments ('#'). Symbols '.', '-', and '_' must be treated as integral parts of the username and not removed or modified. Answer with a username only, which can be a combination of segments separated by '.', '-', and '_'."},
+        ],
+        "n_ctx": 2048,
+        "temperature": 0.8,
+        "max_tokens": -1,
+        "stream": False
+    }
+
+    try:
+        response = requests.post(url, headers=headers, json=data)
+        return response.json()["choices"][0]["message"]["content"]
+    except Exception:
+        raise Exception("The LLM AI endpoint is not available. Please edit the LLM API endpoint settings in the source code")
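+
+# Editor's sketch: a deterministic, non-LLM fallback for the same extraction,
+# assuming the username is simply the last path segment of the URL. The function
+# name extract_username_fallback is hypothetical and not part of Marple:
+#
+#   from urllib.parse import urlparse
+#
+#   def extract_username_fallback(site_url):
+#       path = urlparse(site_url).path  # .path already excludes the '?' query and '#' fragment
+#       return path.rstrip('/').rsplit('/', 1)[-1]
+#
+#   extract_username_fallback("https://example.com/user/john_doe?tab=repos")  # -> "john_doe"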
 
 class Link:
     url: str
@@ -547,7 +662,7 @@ def main():
         dest='plugins',
         nargs='+',
         default='',
-        choices={'maigret', 'socid_extractor', 'metadata'},
+        choices={'maigret', 'socid_extractor', 'metadata', 'random_username', 'extract_username', 'maigret_export'},
         help='Additional plugins to analyze links',
     )
     parser.add_argument(
@@ -585,6 +700,11 @@ def main():
     )
     args = parser.parse_args()
 
+    if args.plugins and 'random_username' in args.plugins:
+        new_username = ai_generate_username()
+        print(colored(f'[random_username] AI-generated username "{new_username}" will be used for search instead of "{args.name}"', 'green'))
+        args.name = new_username
+
     username = args.name
     if " " in username:
         print(colored('Warning, search by firstname+lastname '
@@ -640,6 +760,10 @@
     def is_likely_profile(r):
         return r.is_it_likely_username_profile() and r.junk_score <= args.threshold and not r.filtered
 
+    junk_scores = [r.junk_score for r in result.unique_links]
+    median_junk_score = sorted(junk_scores)[len(junk_scores)//2] if junk_scores else 0
+    average_junk_score = sum(junk_scores)/len(junk_scores) if junk_scores else 0
+
     # reliable links section
     for r in result.unique_links:
         if is_likely_profile(r):
@@ -651,12 +775,28 @@ def is_likely_profile(r):
             message = colored(f'[{r.junk_score}]', 'magenta') + ' ' + \
                       colored(f'[{r.source}]', 'green') + ' ' + message
 
+            maigret_found = False
             if 'maigret' in args.plugins and maigret.db:
+                main_url = r.url.replace(args.name, '')
+
+                if os.path.exists('.maigret.json'):
+                    urls = json.load(open('.maigret.json'))
+                else:
+                    urls = []
+
+                if main_url in urls:
+                    message += colored(' [v] Local findings', 'green')
+
                 if maigret.db.extract_ids_from_url(r.url):
                     message += colored(' [v] Maigret', 'green')
+                    maigret_found = True
                 else:
                     message += colored(' [ ] Maigret', 'yellow')
 
+                if main_url not in urls:  # avoid duplicate cache entries
+                    urls.append(main_url)
+                with open('.maigret.json', 'w') as f:
+                    json.dump(urls, f, indent=4)
+
             if 'socid_extractor' in args.plugins:
                 try:
                     req = requests.get(r.url)
@@ -666,7 +806,32 @@ def is_likely_profile(r):
                 except Exception as e:
                     print(colored(e, 'red'))
 
-            print(f'{message}\n{r.title}\n')
+            message += f'\n{colored("Title:", "cyan")} {r.title}'
+
+            if 'extract_username' in args.plugins:
+                guessed_username = extract_username_from_url(r.url)
+                # workaround for the case when an AI response contains a comment
+                guessed_username = guessed_username.split()[-1]
+
+                comment = ""
+                if guessed_username.lower() not in r.url.lower():
+                    comment = colored(" Invalid", "red")
+                message += colored("\n[extract_username] Username guessed by AI: ", 'cyan') + guessed_username + comment
+
+            if 'maigret_export' in args.plugins:
+                if maigret_found:
+                    message += colored("\n[maigret_exporter] The site was already found in Maigret, skipping...", 'yellow')
+                else:
+                    keywords = maigret_exporter(r)
+                    if keywords[2] != "Found":
+                        message += colored(f"\n[maigret_exporter] No keywords found: {keywords[2]}", 'yellow')
+                    else:
+                        presence_strings = keywords[0]
+                        absence_strings = keywords[1]
+                        message += colored("\n[maigret_exporter] Presence keywords for Maigret: ", 'yellow') + ', '.join(presence_strings)
+                        message += colored("\n[maigret_exporter] Absence keywords for Maigret: ", 'yellow') + ', '.join(absence_strings)
+
+            print(f'{colored("URL:", "cyan")} {message}\n')
 
     pdf_count = 0
@@ -724,6 +889,11 @@ def is_pdf_file(url):
 
     print(f"{colored(status_msg, 'cyan')}\n{colored(error_msg, 'yellow')}")
 
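+    # The hint below fires when nothing passed the -t threshold even though many
+    # unique links came back; the median/average junk scores give the user a
+    # concrete anchor for choosing a looser threshold.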
+    if displayed_count == 0 and uniq_count > 20:
+        print(colored('\nNo reliable links were filtered, although there are more than 20 unique links.', 'red'))
+        print(colored(f'Try decreasing the threshold with the -t option ({args.threshold} at the moment).', 'red'))
+        print(colored(f'Junk scores: median {median_junk_score:.1f} / average {average_junk_score:.1f}\n', 'red'))
+
     if args.csv:
         with open(args.csv, 'w', newline='', encoding='utf-8') as csvfile:
             writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
diff --git a/requirements.txt b/requirements.txt
index e18d71b..5cf4365 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,93 @@
-aiohttp>=3.8.0
-termcolor>=2.0.0
-beautifulsoup4>=4.9.0
-requests>=2.25.0
-yandex-search>=0.3.2
-PyPDF2>=2.0.0
-socid-extractor>=0.0.1
-aiohttp-socks>=0.7.0
-tqdm>=4.65.0
-google-search-results>=2.4.0
-mock>=4.0.0
-arabic-reshaper>=2.1.4
-maigret @ https://github.com/soxoj/maigret/archive/refs/heads/master.zip
-search-engines @ https://github.com/soxoj/Search-Engines-Scraper/archive/refs/heads/master.zip
\ No newline at end of file
+aiodns==3.2.0
+aiohappyeyeballs==2.4.3
+aiohttp==3.11.7
+aiohttp-socks==0.7.1
+aiosignal==1.3.1
+arabic-reshaper==3.0.0
+asn1crypto==1.5.1
+asttokens==2.4.1
+async-timeout==4.0.3
+attrs==22.2.0
+beautifulsoup4==4.12.3
+bs4==0.0.2
+certifi==2024.8.30
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudscraper==1.2.71
+colorama==0.4.6
+cryptography==43.0.3
+cssselect2==0.7.0
+decorator==5.1.1
+executing==2.1.0
+frozenlist==1.5.0
+future==1.0.0
+future-annotations==1.0.0
+google_search_results==2.4.2
+html5lib==1.1
+idna==3.10
+ipython==8.29.0
+jedi==0.19.2
+Jinja2==3.1.4
+jsonpickle==4.0.0
+lxml==5.3.0
+maigret @ https://github.com/soxoj/maigret/archive/refs/heads/master.zip
+MarkupSafe==2.1.5
+matplotlib-inline==0.1.7
+mock==4.0.3
+multidict==6.1.0
+networkx==2.8.8
+oscrypto==1.3.0
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.0.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycares==4.4.0
+pycountry==23.12.11
+pycparser==2.22
+Pygments==2.18.0
+pyHanko==0.25.3
+pyhanko-certvalidator==0.26.5
+pyparsing==3.2.0
+pypdf==5.1.0
+PyPDF2==3.0.1
+PySocks==1.7.1
+python-bidi==0.4.2
+python-dateutil==2.9.0.post0
+python-socks==2.5.3
+pyvis==0.2.1
+PyYAML==6.0.2
+qrcode==8.0
+reportlab==4.2.5
+requests==2.32.3
+requests-futures==1.0.2
+requests-toolbelt==1.0.0
+search_engines @ https://github.com/soxoj/Search-Engines-Scraper/archive/refs/heads/master.zip#sha256=329c8a1aff702ced584e5a9f75663d6759104628df8634a740435043cb199ec0
+setuptools==75.6.0
+six==1.16.0
+socid-extractor==0.0.26
+soupsieve==2.6
+stack-data==0.6.3
+stem==1.8.2
+svglib==1.5.1
+termcolor==2.5.0
+tinycss2==1.4.0
+tokenize_rt==6.1.0
+torrequest==0.1.0
+tqdm==4.67.0
+traitlets==5.14.3
+typing_extensions==4.12.2
+tzlocal==5.2
+uritools==4.0.3
+urllib3==2.2.3
+wcwidth==0.2.13
+webencodings==0.5.1
+wheel==0.45.1
+xhtml2pdf==0.2.16
+XMind==1.2.0
+yandex-search==0.3.2
+yarl==1.18.0