This repository was archived by the owner on Oct 18, 2025. It is now read-only.

Commit c52a675: CopyrightArmor release v1

1 parent: 359d7d7

File tree: 11 files changed, +202 additions, -33 deletions

README.md

Lines changed: 12 additions & 1 deletion

@@ -9,6 +9,7 @@
 - **Web Scraping Engine**: CopyrightArmor uses a robust web scraping engine to crawl and analyze websites for potentially infringing content.
 - **Recursive Scraping**: Recursively scans websites.
 - **Google Optimized**: by using `--google` and `--google-search` without `-url`, the web scraping engine will be optimized for Google Search.
+- **Google SERP Engine**: CopyrightArmor detects most pirate sites in Google Search results. ([always check for false positives](https://github.com/Copy05/CopyrightArmor/discussions/4))
 - **Flexible**: You can configure how and what type of links it should scrape:
   - Exclude Social Media Links
   - Exclude Query parameter links
@@ -48,6 +49,8 @@ pip install -r requirements.txt
 ```
 
 4. Open up `src/hashes.json`, add all content that you want to scan for, and use this syntax:
+
+   **`--google-search`** scans only for the content inside `"entertainment"`.
 ```json
 {
     "images": [
@@ -65,6 +68,14 @@ pip install -r requirements.txt
             "hash": "3b99f49776f433aeb000fa010e452879198fe4be7f6660552527b53304268343",
             "description": "Another Pink Cherry Tree Exe"
         }
+    ],
+    "entertainment": [
+        {
+            "title": "EntertainRewind 2024",
+            "original_url": "example.com",
+            "copyright_owner": "Entertainmasters",
+            "hash": "c8392dc67d913d39664e0fc400280a2de03107348f7432e226194f0a7e4eeebe"
+        }
     ]
 }
 ```
@@ -87,4 +98,4 @@ I do accept contributions to this project. create a pull request and describe yo
 | Manga | +195,188,170 |
 | Models | +167,086,838 |
 | XXX | +11,971,422 |
-| WGCZ (BangBros) | +11,355,801 |
+| WGCZ (BangBros) | +11,355,801 |
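
The `hash` values in the example above are 64-character hex digests, consistent with SHA-256 (`src/ContentMatching.py` imports `hashlib`). A minimal sketch for filling in an entry, assuming the digest is taken over the raw file bytes; the README does not state this explicitly, and the file name here is hypothetical:

```python
import hashlib
import json

def file_sha256(path: str) -> str:
    """Stream the file in chunks so large media files are not loaded into memory at once."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Build an "entertainment" entry using the field names from the README example.
entry = {
    "title": "EntertainRewind 2024",
    "original_url": "example.com",
    "copyright_owner": "Entertainmasters",
    "hash": file_sha256("EntertainRewind2024.mp4"),  # hypothetical local copy of the work
}
print(json.dumps(entry, indent=4))
```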

requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -2,4 +2,4 @@ beautifulsoup4==4.12.2
 selenium==4.14.0
 webdriver-manager==4.0.1
 requests==2.31.0
-colorama==0.4.6
+colorama==0.4.6

setup.py

Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@
 
 setup(
     name="CopyrightArmor",
-    version="0.2",
+    version="1.0",
     author="Copy05",
     description="A tool that scans the web for pirated content",
     url="https://github.com/Copy05/CopyrightArmor/",

src/ContentMatching.py

Lines changed: 108 additions & 1 deletion

@@ -21,12 +21,119 @@
 import requests
 import hashlib
 import json
+import re
 
 from bs4 import BeautifulSoup
 from colorama import Style, Fore
 from urllib.parse import urljoin
 from IO import extract_domain, LoadIgnoreFileExts
 
+def ScanTitle(title, my_content):
+    with open('patterns.json', 'r') as file:
+        patterns = json.load(file)['patterns']
+
+    anime_pattern = re.compile(patterns['anime_pattern'], re.IGNORECASE)
+    turkish_pattern = re.compile(patterns['turkish_pattern'], re.IGNORECASE)  # Some titles may be in Turkish.
+    pirated_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in patterns['pirated_patterns']]
+    resolution_pattern = re.compile(patterns['resolution_pattern'], re.IGNORECASE)
+    episode_pattern = re.compile(patterns['episode_pattern'], re.IGNORECASE)
+    staffel_pattern = re.compile(patterns['staffel_pattern'], re.IGNORECASE)
+    legal_sites_pattern = re.compile(patterns['legal_sites_pattern'], re.IGNORECASE)
+    anime_deep_pattern = re.compile(patterns['anime_deep_pattern'], re.IGNORECASE)
+    manga_chapter_pattern = re.compile(patterns['manga_chapter_pattern'], re.IGNORECASE)
+    manga_online_pattern = re.compile(patterns['manga_online_pattern'], re.IGNORECASE)
+
+    if my_content not in title:
+        return False
+
+    # Check for the presence of "Watch Anime" in the title
+    if "Watch Anime" in title:
+        return True
+
+    if anime_pattern.match(title) or turkish_pattern.match(title):
+        return True
+    for pattern in pirated_patterns:
+        if re.match(pattern, title):
+            return True
+    if episode_pattern.match(title) or staffel_pattern.match(title):
+        return True
+    if manga_chapter_pattern.match(title) or manga_online_pattern.match(title):
+        return True
+    if legal_sites_pattern.search(title):
+        return False
+    if anime_deep_pattern.match(title):
+        return True
+    if resolution_pattern.match(title):
+        return True
+    return False
+
+def ScanGoogleLink(url, title, DebugInformation=False, verbose=False):
+    from GoogleScrape import SearchQuery, infriding_data, UniqueFiles, infringing_urls
+
+    titles_list = []
+    contentFlagged = False
+
+    Query = SearchQuery.replace("+", " ").replace('%2C', ',').replace('%20', ' ')
+
+    if verbose:
+        print(f"{Fore.YELLOW}URL: {url}\nTitle: {title}{Style.RESET_ALL}")
+
+    with open("hashes.json") as file:
+        data = json.load(file)
+
+    entertainment_data = data.get("entertainment", [])
+
+    # For every entry inside the entertainment data, append the show name to the list.
+    for entry in entertainment_data:
+        m_title = entry.get("title", "")
+        titles_list.append(m_title)
+
+    for content in titles_list:
+
+        # To differentiate content, check whether the show title appears inside the search
+        # result title (e.g. "Watch SHOW1" contains "SHOW1") so the result gets flagged as "SHOW1".
+        if ScanTitle(title, content) and content.lower() in title.lower():
+            if verbose:
+                print(f"{Fore.RED}COPYRIGHTED MATERIAL FOUND{Style.RESET_ALL}")
+            contentFlagged = True
+        else:
+            if verbose:
+                print(f"{Fore.GREEN}LEGAL{Style.RESET_ALL}")
+            contentFlagged = False
+
+        if contentFlagged and content.lower() in title.lower():
+            for entry in data['entertainment']:
+
+                if entry['title'].lower() in content.lower():
+
+                    original_owner = entry["copyright_owner"]
+                    original_source = entry["original_url"]
+
+                    # If the URL is the original source, skip it.
+                    if url == original_source:
+                        continue
+
+                    infringing_urls.add(url)
+                    UniqueFiles.add(url)
+
+                    infriding_data.append({
+                        "url": url,
+                        "type": "Copyrighted Show",
+                        "original_url": original_source,
+                        "copyright_owner": original_owner,
+                        "description": entry['title'],
+                    })
+
+                    if verbose:
+                        print(Fore.RED, f"\nCopyright Infringing Show has been found on {url}.\nSearch Result Title: {title}\nCopyrighted Work: {entry['title']}\nCopyright Owner: {original_owner}\nOriginal Source: {original_source}\n")
+                    else:
+                        print(Fore.RED, f"\nCopyright Infringing Show has been found on {url}.\nCopyrighted Work: {entry['title']}\nCopyright Owner: {original_owner}\nOriginal Source: {original_source}\n")
+                    print(Style.RESET_ALL)
+
+                    break
+
 def ScanImage(soup : BeautifulSoup, url, DebugInformation : bool):
     from Scrape import ScannedImages, infriding_data, infringing_urls, TheBaseURL, UniqueFiles
 
@@ -151,4 +258,4 @@ def ScanFiles(soup: BeautifulSoup, url, DebugInformation: bool):
                     print(Fore.RED, f"\nCopyright Infringing File (\"{link_hash}\") has been found on {url}.\nCopyright Owner: {original_owner}\nOriginal Source: {original_source}\n")
                     print(Style.RESET_ALL)
 
-                    break
+                    break
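
`ScanTitle` assumes a `patterns.json` next to the script with a top-level `"patterns"` object holding the ten keys compiled above; `pirated_patterns` is a list of regexes and the rest are single regex strings. That file is not part of this commit, so the expressions below are placeholders for illustration, not the project's real patterns:

```json
{
    "patterns": {
        "anime_pattern": "^watch\\s+.+\\s+(sub|dub)",
        "turkish_pattern": "^.+\\s+izle",
        "pirated_patterns": ["^.*\\bfree download\\b", "^.*\\btorrent\\b"],
        "resolution_pattern": "^.*\\b(480p|720p|1080p|2160p)\\b",
        "episode_pattern": "^.*\\bepisode\\s*\\d+",
        "staffel_pattern": "^.*\\bstaffel\\s*\\d+",
        "legal_sites_pattern": "netflix|crunchyroll|hulu",
        "anime_deep_pattern": "^.*\\banime\\b",
        "manga_chapter_pattern": "^.*\\bchapter\\s*\\d+",
        "manga_online_pattern": "^.*\\bread\\s+.+\\s+online"
    }
}
```

Every check except `legal_sites_pattern` goes through `match`, which anchors at the start of the string, hence the `^.*` prefixes in these placeholders; `legal_sites_pattern` uses `search` and can hit anywhere in the title. Note also that the legal-sites early exit only runs after the episode and manga checks have already had a chance to return `True`.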

src/CopyrightArmor.py

Lines changed: 0 additions & 6 deletions

@@ -88,14 +88,8 @@ def PrintVersion():
     # To Avoid Long Execution Time when not using the scraping engine.
     from GoogleScrape import GoogleScrape
 
-    if args.detailed_report and args.report_file is False:
-        print(Fore.RED, "Error: Invalid Argument: \"--detailed-report\" because \"--report_file\" is false")
-        print(Style.RESET_ALL)
-        exit(1)
-
     GoogleScrape(Query=args.google_search, RateLimmit=args.rate_limit, verbose=args.verbose, ReportFile=args.report_file)
 
-
 if not any(vars(args).values()):
     print(Fore.RED, "Error: No arguments provided. Use -h or --help for usage information.")
     print(Style.RESET_ALL)

src/GoogleScrape.py

Lines changed: 32 additions & 23 deletions

@@ -35,34 +35,49 @@
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 
-from IO import SaveReport
+from IO import SaveReport, LoadWhitelist, extract_domain
 from verbose_print import PrintFoundLinks
+from ContentMatching import ScanGoogleLink
+from ScrapingEngine import FilterLinks
+from utils import GetSettings
 
 chrome_options = Options()
 chrome_options.add_argument('--headless')
 chrome_options.add_argument('--log-level=3')
 chrome_options.add_argument('--disable-logging')
 chrome_options.add_argument('--disable-dev-shm-usage')
+chrome_options.add_argument("--disable-gpu")
+chrome_options.add_argument("--disable-extensions")
+chrome_options.add_argument("--remote-debugging-pipe")
 
 driver = webdriver.Chrome(options=chrome_options)
 
 Found_Links = set()
+ScannedImages = set()
+UniqueFiles = set()
+infringing_urls = set()
+infriding_data = []
 Index = 1
 CookieBannerClicked = False
+SearchQuery = None
 
-MORE_RESULTS_BUTTON_XPATHS = ["//*[@id='botstuff']/div/div[3]/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[2]/h3/span"]
+MORE_RESULTS_BUTTON_XPATHS = ["//*[@id='botstuff']/div/div[3]/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[2]/h3/span", "//*[@id='kp-wp-tab-cont-overview']/div/div[3]/div/div/div[4]/a[1]/h3/div", "//*[@id='botstuff']/div/div[4]/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-TvmWatch']/div/div[3]/div/div/div[4]/a[1]/h3/div"]
 
 def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateLimmitTime=2):
 
     global CookieBannerClicked
+    global SearchQuery
 
     URL = f"https://google.com/search?q={Query}&cs=0&filter=0&safe=off&nfpr=1"
 
     warnings.filterwarnings("ignore", category=InsecureRequestWarning)
 
     soup = None
-    MAXIMAL_RETRIES = 5
+    MAXIMAL_RETRIES = 3
     Retries = 0
+    SearchQuery = Query
+    whitelist = LoadWhitelist()
 
     if RateLimmit:
         time.sleep(RateLimmitTime)
@@ -91,21 +106,13 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
         for link in soup.find_all('a', href=True):
             next_url = urljoin(URL, link['href'])
 
-            if next_url.startswith("mailto:"):
-                if verbose:
-                    print(Fore.YELLOW, f"Skipping {next_url} because 'mailto' links arent allowed")
-                    print(Style.RESET_ALL)
-                continue
-            if next_url.startswith("javascript:"):
-                if verbose:
-                    print(Fore.YELLOW, f"Skipping {next_url} because 'javascript' links arent allowed")
-                    print(Style.RESET_ALL)
+            if FilterLinks(next_url, verbose, True):
                 continue
 
             if next_url not in Found_Links and "google.com" not in next_url:
                 Found_Links.add(next_url)
                 FoundLinkCount += 1
-
+
                 if verbose:
                     print(Fore.YELLOW, f"{FoundLinkCount} Links has been added to the List. | {len(Found_Links)} Links in the List")
                     print(Style.RESET_ALL)
@@ -123,7 +130,10 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
                 Found_Links.add(next_url)
                 FoundLinkCount += 1
                 FoundAnyLinks = True
-
+
+                if extract_domain(next_url) not in whitelist:
+                    ScanGoogleLink(url=next_url, title=link.text.strip(), verbose=verbose, DebugInformation=False)
+
         if CookieBannerClicked is False:
             try:
                 cookie_banner = driver.find_element(By.XPATH, "//*[@id='CXQnmb']")
@@ -136,8 +146,7 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
                 pass
 
         try:
-            print(Fore.GREEN, f"Searching [Links Found: {len(Found_Links)}]")
-            print(Style.RESET_ALL)
+            print(f"{Fore.GREEN}Searching [Links Found: {len(Found_Links)}] {Fore.WHITE}| {Fore.RED}Infriding Links Found: {len(infriding_data)}{Style.RESET_ALL}")
 
             if verbose:
                 print(Fore.YELLOW, f"{FoundLinkCount} Links has been added to the List. | {len(Found_Links)} Links in the List")
@@ -168,22 +177,22 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
                 driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)
                 pass
 
-        print(f"Query: {Query}\nFound Links: {len(Found_Links)}")
+        print(f"Query: {Query}\nFound Links: {len(Found_Links)}\n{Fore.RED}Infriding Search Results: {len(infriding_data)}{Style.RESET_ALL}")
         if ReportFile:
-            SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, detailed=False, found_links=Found_Links)
-        exit()
+            SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, settings_string=GetSettings(RateLimmit, False, False, False, False), infriding_data=infriding_data, infriding_urls=infringing_urls, scanned_images=ScannedImages)
+        exit()
 
     except requests.exceptions.TooManyRedirects:
         print(Fore.RED, "Overloaded.")
         print(Style.RESET_ALL)
 
         if ReportFile:
-            SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, detailed=False, found_links=Found_Links)
-        exit()
+            SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, settings_string=GetSettings(RateLimmit, False, False, False, False), infriding_data=infriding_data, infriding_urls=infringing_urls, scanned_images=ScannedImages)
+        exit()
 
     except KeyboardInterrupt:
         print("Exiting Scrape Mode.")
 
        if ReportFile:
-            SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, detailed=False, found_links=Found_Links)
-        exit()
+            SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, settings_string=GetSettings(RateLimmit, False, False, False, False), infriding_data=infriding_data, infriding_urls=infringing_urls, scanned_images=ScannedImages)
+        exit()
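
With the shared report pipeline in place, a Google scan is a single call. A minimal usage sketch, assuming it runs from `src/` so that `hashes.json`, `filters.json`, and `patterns.json` resolve relative to the working directory (the query string here is made up):

```python
from GoogleScrape import GoogleScrape

# Scan Google results for a show listed under "entertainment" in hashes.json.
# Parameter spellings (RateLimmit, RateLimmitTime) follow the signature above.
GoogleScrape(
    Query="watch Test123 online free",
    verbose=True,      # print each URL/title and the LEGAL / COPYRIGHTED verdict
    ReportFile=True,   # write a report via IO.SaveReport before exiting
    RateLimmit=True,   # sleep RateLimmitTime seconds before querying Google
    RateLimmitTime=2,
)
```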

src/IO.py

Lines changed: 7 additions & 0 deletions

@@ -40,6 +40,13 @@ def LoadIgnoreFileExts() -> list[str]:
     exts = data["ignore_exts"]
     return exts
 
+def LoadWhitelist() -> list[str]:
+    with open('filters.json', 'r') as file:
+        data = json.load(file)
+
+    wl = data["whitelist"]
+    return wl
+
 def extract_domain(url):
     parsed_url = urlparse(url)
     return parsed_url.netloc

src/Scrape.py

Lines changed: 3 additions & 0 deletions

@@ -46,6 +46,9 @@
 chrome_options.add_argument('--log-level=3')
 chrome_options.add_argument('--disable-logging')
 chrome_options.add_argument('--disable-dev-shm-usage')
+chrome_options.add_argument("--disable-gpu")
+chrome_options.add_argument("--disable-extensions")
+chrome_options.add_argument("--remote-debugging-pipe")
 
 driver = webdriver.Chrome(options=chrome_options)
 
src/filters.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,5 +38,8 @@
3838
".editorconfig",
3939
".npmrc",
4040
".dockerignore"
41+
],
42+
"whitelist": [
43+
"https://www.youtube.com/"
4144
]
4245
}
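
One caveat worth noting: `GoogleScrape` checks `extract_domain(next_url) not in whitelist`, and `extract_domain` returns only the netloc, so a full-URL entry like the one added here will never match. A minimal sketch of that comparison, suggesting entries likely need to be bare domains:

```python
from urllib.parse import urlparse

def extract_domain(url):
    # Mirrors src/IO.py: returns only the netloc part of the URL.
    return urlparse(url).netloc

link = "https://www.youtube.com/watch?v=abc123"
print(extract_domain(link) in ["https://www.youtube.com/"])  # False: netloc vs. full URL
print(extract_domain(link) in ["www.youtube.com"])           # True: bare-domain entry matches
```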

src/hashes.json

Lines changed: 8 additions & 0 deletions

@@ -14,5 +14,13 @@
             "hash": "81ecf8a6c049c784bd48dc40bcbd3840c7a95d31bd4a82ed40db9610cb639de2",
             "description": "Mediakit"
         }
+    ],
+    "entertainment": [
+        {
+            "title": "Test123",
+            "original_url": "https://imdb.com/zuez9zw7ez79z79q",
+            "copyright_owner": "Example Inc",
+            "hash": "c8392dc67d913d39664e0fc400280a2de03107348f7432e226194f0a7e43f2be"
+        }
     ]
 }
