Skip to content
This repository was archived by the owner on Feb 22, 2025. It is now read-only.

Commit b6be404

Browse files
author
rix
committed
Enable RegEx search for all content types on BY
1 parent f81d74a commit b6be404

File tree

2 files changed

+18
-6
lines changed

2 files changed

+18
-6
lines changed

feedcrawler/external_sites/feed_search/sites/content_all_by.py

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -30,11 +30,15 @@ def __init__(self, filename):
3030
self.url = self.hostnames.get('by')
3131
self.password = self.url.split('.')[0]
3232

33-
if "List_ContentAll_Seasons" not in filename:
33+
if "_Regex" in filename:
34+
self.URL = 'https://' + self.url + "/?cat="
35+
self.FEED_URLS = [self.URL + "1", self.URL + "2"]
36+
elif "List_ContentAll_Seasons" not in filename:
3437
self.URL = 'https://' + self.url + "/?cat=1"
38+
self.FEED_URLS = [self.URL]
3539
else:
3640
self.URL = 'https://' + self.url + "/?cat=2"
37-
self.FEED_URLS = [self.URL]
41+
self.FEED_URLS = [self.URL]
3842

3943
self.config = CrawlerConfig("ContentAll")
4044
self.feedcrawler = CrawlerConfig("FeedCrawler")
@@ -49,9 +53,17 @@ def __init__(self, filename):
4953
search = int(CrawlerConfig("ContentAll").get("search"))
5054
i = 2
5155
while i <= search:
52-
page_url = self.URL + "&start=" + str(i)
53-
if page_url not in self.FEED_URLS:
54-
self.FEED_URLS.append(page_url)
56+
if "_Regex" in filename:
57+
page_url_1 = self.URL + "1&start=" + str(i)
58+
page_url_2 = self.URL + "2&start=" + str(i)
59+
if page_url_1 not in self.FEED_URLS:
60+
self.FEED_URLS.append(page_url_1)
61+
if page_url_2 not in self.FEED_URLS:
62+
self.FEED_URLS.append(page_url_2)
63+
else:
64+
page_url = self.URL + "&start=" + str(i)
65+
if page_url not in self.FEED_URLS:
66+
self.FEED_URLS.append(page_url)
5567
i += 1
5668
self.cdc = FeedDb('cdc')
5769

feedcrawler/providers/common_functions.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -620,7 +620,7 @@ def keep_alphanumeric_with_special_characters(string):
620620

621621
def keep_alphanumeric_with_regex_characters(string):
622622
string = replace_with_stripped_ascii(string)
623-
return re.sub(r'[^0-9a-zA-Z\s\-.*+()|\[\]?!]', '', string)
623+
return re.sub(r'[^0-9a-zA-Z\s\-.*+()|\[\]\\{},?!]', '', string)
624624

625625

626626
def keep_numbers(string):

0 commit comments

Comments
 (0)