Commit 513bdcd

simplify directory structure and crawl command

1 parent 8669bc0

9 files changed: +179, −181 lines

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.2.0] - 2021-02-12
+### Added
+- Shell script that executes the crawl command
+### Changed
+- Directory structure
+- Simplified crawl command
+
 ## [1.1.0] - 2021-02-11
 ### Added
 - Three new config variables (`MONGO_OUTSIDE_PORT`, `MAX_WORKERS`, `KEEP_DAYS`). See [readme](README.md) for details.

README.md

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ Development environment is ready to be used with [VSCode](https://code.visualstu
 3. Run `docker-compose up --build` to create the crawler- and database-container (`-d` to detach the docker process).
 
 ## Usage
-To start the crawling process simply run `docker-compose run --rm crawler python -m app crawl` (`-d` to detach the docker process). Ideally execute this command as a cron job.
+To start the crawling process simply run `docker-compose run --rm crawler ./crawl.sh` (`-d` after `--rm` to detach the docker process). Ideally execute this command as a cron job.
 
 ## Configuration
 Environment variables in `.env`:
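
The README recommends running the crawl as a cron job. A minimal crontab sketch for that setup could look like the line below; the schedule, project path, and log file are assumptions for illustration, not part of this commit:

    # run the crawler every night at 03:00 (example schedule, adjust paths as needed)
    0 3 * * * cd /path/to/project && docker-compose run --rm crawler ./crawl.sh >> /var/log/crawler-cron.log 2>&1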

crawler/Dockerfile

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@ WORKDIR /code
 # Copy the source code of the project into the container.
 COPY . .
 
+RUN chmod +x crawl.sh
+
 # Persist bash history (see docker-compose file)
 RUN SNIPPET="export PROMPT_COMMAND='history -a' && export HISTFILE=/commandhistory/.bash_history" \
 && echo $SNIPPET >> "/root/.bashrc"

crawler/app/__main__.py

Lines changed: 164 additions & 13 deletions
@@ -1,19 +1,170 @@
-import sys
-from pathlib import Path
+import json
+from time import time
+import logging
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor
+from dateutil.parser import parse as parseDate
+from dateutil.parser import ParserError
+import feedparser
+import newspaper
+from newspaper import Article, Config
+from pymongo.errors import BulkWriteError
 
+from app.logger import create_rotating_log
+from app import config
+from app.database.create_client import create_client
+from app.random_user_agent import random_user_agent
 
-if len(sys.argv) <= 1:
-    print('No arguments provided')
-    sys.exit()
 
+create_rotating_log(config.LOGS_PATH.joinpath("crawlerlog"))
+logger = logging.getLogger("rotating_log")
 
-# add current diretory path to python path to execute "app" from any location
-sys.path.append(Path.cwd().as_posix())
+MIN_TEXT_LENGTH = 350
 
-# parse args
-cmd = sys.argv[1]
 
-# process args
-if cmd == 'crawl':
-    from app import crawler
-    crawler.init()
+def init():
+    ts = time()
+    logger.info('Downloading new articles')
+    logger.info(f'Ignoring articles older than {config.KEEP_DAYS} days')
+
+    if config.MAX_WORKERS:
+        logger.info(f'Maximum crawler workers: {config.MAX_WORKERS}')
+
+    # load sources
+    with open(config.SOURCES_PATH, encoding='utf-8') as f:
+        sources = json.load(f)
+
+    with ThreadPoolExecutor(max_workers=config.MAX_WORKERS) as executor:
+        executor.map(scrape_source, sources)
+
+    logger.info(f"Downloading done in {time() - ts}")
+
+
+def create_newspaper_config():
+    newspaper_config = Config()
+    newspaper_config.browser_user_agent = random_user_agent()
+    newspaper_config.language = 'de'
+
+    return newspaper_config
+
+
+def process_feed_item(feed_item, source, articles_in_memory, db):
+    # check for duplicate in db
+    if db.articles.find_one({'url': feed_item.link}) is not None:
+        logger.debug('Skip: article already exists')
+        return False
+
+    # check if link exists already in memory
+    if any(a['url'] == feed_item.link for a in articles_in_memory):
+        logger.debug('Skip: article already exists')
+        return False
+
+    # parse article
+    try:
+        article = Article(feed_item.link, config=create_newspaper_config())
+        article.download()
+        article.parse()
+    except newspaper.article.ArticleException as exc:
+        logger.debug(f'Newspaper error: {exc}')
+        # logger.exception(exc)
+        return False
+
+    # check title
+    article_title = article.title.strip()
+    if not article_title:
+        logger.debug('Skip: no title or text')
+        return False
+
+    # check text
+    article_text = article.text.strip()
+    if len(article_text) < MIN_TEXT_LENGTH:
+        logger.debug('Skip: text too short')
+        return False
+
+    # must have date
+    published_at_val = None
+    if article.publish_date:
+        # get from parsed article
+        published_at_val = article.publish_date
+    elif hasattr(feed_item, 'published'):
+        # get from feed item
+        published_at_val = feed_item.published
+
+    if not published_at_val:
+        logger.debug('Skip: missing date')
+        return False
+
+    # normalize date, create datetime object, remove time zone
+    if isinstance(published_at_val, datetime):
+        published_at = published_at_val.replace(tzinfo=None)
+    elif isinstance(published_at_val, str):
+        try:
+            published_at = parseDate(published_at_val, ignoretz=True)
+        except ParserError as exc:
+            logger.debug(f'Dateutil parse error: {exc}')
+            return False
+    else:
+        logger.debug('No valid date found')
+        return False
+
+    # date must be within last n days
+    difference = datetime.now() - published_at
+    if difference.days > config.KEEP_DAYS:
+        logger.debug(
+            f'Skip: Article older than {config.KEEP_DAYS} days ({published_at})'
+        )
+        return False
+
+    # create new item
+    return {
+        'title': article_title,
+        'published_at': published_at,
+        'created_at': datetime.now(),
+        'url': feed_item.link,
+        'src': source['id'],
+        'text': article_text
+    }
+
+
+def scrape_source(source):
+    [client, db] = create_client() # pyMongo not thread safe
+    articles = []
+
+    # loop feed
+    feed = feedparser.parse(source['url'])
+    logger.info(
+        f'Parsing: {source["name"]} (id: {source["id"]}), count: {len(feed.entries)}'
+    )
+
+    # process feed items
+    # download article & create database document
+    for feed_item in feed.entries:
+        new_article = process_feed_item(feed_item, source, articles, db)
+
+        if not new_article:
+            continue
+
+        articles.append(new_article)
+
+    # save articles to db
+    if articles:
+        try:
+            response = db.articles.insert_many(articles)
+            logger.info(
+                f'Saved {len(response.inserted_ids)} article(s) from {source["name"]} to database'
+            )
+        except BulkWriteError as bwe:
+            logger.exception(bwe)
+            logger.error(bwe.details)
+            # you can also take this component and do more analysis
+            # werrors = bwe.details['writeErrors']
+            # raise
+    else:
+        logger.info(f'No new articles found in {source["name"]}')
+
+    # close connection to db client
+    client.close()
+
+
+if __name__ == "__main__":
+    init()
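
`create_client` is imported from `app.database.create_client` but is not touched by this commit, so it does not appear in the diff. Going by the `[client, db]` unpacking and the `client.close()` call in `scrape_source`, it presumably returns a MongoDB client plus a database handle; a purely hypothetical sketch, in which the connection string and database name are assumptions for illustration only:

    # hypothetical sketch of app/database/create_client.py (not part of this diff)
    import os
    from pymongo import MongoClient

    def create_client():
        # one client per worker thread, matching the comment in scrape_source()
        client = MongoClient(os.environ.get("MONGO_URI", "mongodb://mongo:27017"))
        db = client["crawler"]  # assumed database name
        return [client, db]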

crawler/app/config.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 ROOT_PATH = Path.cwd()
 ROOT_PATH = ROOT_PATH.joinpath('app')
 
-SOURCES_PATH = ROOT_PATH.joinpath('assets', 'sources.json')
+SOURCES_PATH = ROOT_PATH.joinpath('sources.json')
 LOGS_PATH = ROOT_PATH.joinpath('logs')
 
 # general
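
`SOURCES_PATH` now points at `app/sources.json`. That file is not part of this diff, but `__main__.py` loads it with `json.load` and reads `id`, `name`, and `url` from each entry, so it is presumably a list of feed descriptors along these lines (field names taken from the code, values purely hypothetical):

    [
      {
        "id": "example-source",
        "name": "Example News",
        "url": "https://example.com/rss/feed.xml"
      }
    ]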

crawler/app/crawler/__init__.py

Lines changed: 0 additions & 166 deletions
This file was deleted.

Two other files were renamed without changes.

crawler/crawl.sh

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+#!/bin/bash
+set -e
+
+python -m app
