1 | | -import sys |
2 | | -from pathlib import Path |
| 1 | +import json |
| 2 | +from time import time |
| 3 | +import logging |
| 4 | +from datetime import datetime |
| 5 | +from concurrent.futures import ThreadPoolExecutor |
| 6 | +from dateutil.parser import parse as parseDate |
| 7 | +from dateutil.parser import ParserError |
| 8 | +import feedparser |
| 9 | +import newspaper |
| 10 | +from newspaper import Article, Config |
| 11 | +from pymongo.errors import BulkWriteError |
3 | 12 |
| 13 | +from app.logger import create_rotating_log |
| 14 | +from app import config |
| 15 | +from app.database.create_client import create_client |
| 16 | +from app.random_user_agent import random_user_agent |
4 | 17 |
5 | | -if len(sys.argv) <= 1: |
6 | | - print('No arguments provided') |
7 | | - sys.exit() |
8 | 18 |
| 19 | +create_rotating_log(config.LOGS_PATH.joinpath("crawlerlog")) |
| 20 | +logger = logging.getLogger("rotating_log") |
9 | 21 |
10 | | -# add current diretory path to python path to execute "app" from any location |
11 | | -sys.path.append(Path.cwd().as_posix()) |
| 22 | +MIN_TEXT_LENGTH = 350  # skip articles whose body text is shorter than this many characters |
12 | 23 |
13 | | -# parse args |
14 | | -cmd = sys.argv[1] |
15 | 24 |
16 | | -# process args |
17 | | -if cmd == 'crawl': |
18 | | - from app import crawler |
19 | | - crawler.init() |
| 25 | +def init(): |
| 26 | + ts = time() |
| 27 | + logger.info('Downloading new articles') |
| 28 | + logger.info(f'Ignoring articles older than {config.KEEP_DAYS} days') |
| 29 | + |
| 30 | + if config.MAX_WORKERS: |
| 31 | + logger.info(f'Maximum crawler workers: {config.MAX_WORKERS}') |
| 32 | + |
| 33 | + # load sources |
| 34 | + with open(config.SOURCES_PATH, encoding='utf-8') as f: |
| 35 | + sources = json.load(f) |
| 36 | + |
| 37 | + with ThreadPoolExecutor(max_workers=config.MAX_WORKERS) as executor: |
| 38 | + executor.map(scrape_source, sources) |
| 39 | + |
| 40 | + logger.info(f"Downloading done in {time() - ts}") |
| 41 | + |
| 42 | + |
| 43 | +def create_newspaper_config(): |
| 44 | + newspaper_config = Config() |
| 45 | + newspaper_config.browser_user_agent = random_user_agent() |
| 46 | + newspaper_config.language = 'de' |
| 47 | + |
| 48 | + return newspaper_config |
| 49 | + |
| 50 | + |
| 51 | +def process_feed_item(feed_item, source, articles_in_memory, db): |
| 52 | + # check for duplicate in db |
| 53 | + if db.articles.find_one({'url': feed_item.link}) is not None: |
| 54 | +        logger.debug('Skip: article already exists in database') |
| 55 | + return False |
| 56 | + |
| 57 | + # check if link exists already in memory |
| 58 | + if any(a['url'] == feed_item.link for a in articles_in_memory): |
| 59 | +        logger.debug('Skip: article already collected in this run') |
| 60 | + return False |
| 61 | + |
| 62 | + # parse article |
| 63 | + try: |
| 64 | + article = Article(feed_item.link, config=create_newspaper_config()) |
| 65 | + article.download() |
| 66 | + article.parse() |
| 67 | + except newspaper.article.ArticleException as exc: |
| 68 | + logger.debug(f'Newspaper error: {exc}') |
| 69 | + # logger.exception(exc) |
| 70 | + return False |
| 71 | + |
| 72 | + # check title |
| 73 | + article_title = article.title.strip() |
| 74 | + if not article_title: |
| 75 | +        logger.debug('Skip: no title') |
| 76 | + return False |
| 77 | + |
| 78 | + # check text |
| 79 | + article_text = article.text.strip() |
| 80 | + if len(article_text) < MIN_TEXT_LENGTH: |
| 81 | + logger.debug('Skip: text too short') |
| 82 | + return False |
| 83 | + |
| 84 | + # must have date |
| 85 | + published_at_val = None |
| 86 | + if article.publish_date: |
| 87 | + # get from parsed article |
| 88 | + published_at_val = article.publish_date |
| 89 | + elif hasattr(feed_item, 'published'): |
| 90 | + # get from feed item |
| 91 | + published_at_val = feed_item.published |
| 92 | + |
| 93 | + if not published_at_val: |
| 94 | + logger.debug('Skip: missing date') |
| 95 | + return False |
| 96 | + |
| 97 | + # normalize date, create datetime object, remove time zone |
| 98 | + if isinstance(published_at_val, datetime): |
| 99 | + published_at = published_at_val.replace(tzinfo=None) |
| 100 | + elif isinstance(published_at_val, str): |
| 101 | + try: |
| 102 | + published_at = parseDate(published_at_val, ignoretz=True) |
| 103 | + except ParserError as exc: |
| 104 | + logger.debug(f'Dateutil parse error: {exc}') |
| 105 | + return False |
| 106 | + else: |
| 107 | + logger.debug('No valid date found') |
| 108 | + return False |
| 109 | + |
| 110 | +    # date must be within the last config.KEEP_DAYS days |
| 111 | + difference = datetime.now() - published_at |
| 112 | + if difference.days > config.KEEP_DAYS: |
| 113 | + logger.debug( |
| 114 | + f'Skip: Article older than {config.KEEP_DAYS} days ({published_at})' |
| 115 | + ) |
| 116 | + return False |
| 117 | + |
| 118 | +    # create new article document |
| 119 | + return { |
| 120 | + 'title': article_title, |
| 121 | + 'published_at': published_at, |
| 122 | + 'created_at': datetime.now(), |
| 123 | + 'url': feed_item.link, |
| 124 | + 'src': source['id'], |
| 125 | + 'text': article_text |
| 126 | + } |
| 127 | + |
| 128 | + |
| 129 | +def scrape_source(source): |
| 130 | +    client, db = create_client()  # create a dedicated PyMongo client per worker thread |
| 131 | + articles = [] |
| 132 | + |
| 133 | + # loop feed |
| 134 | + feed = feedparser.parse(source['url']) |
| 135 | + logger.info( |
| 136 | + f'Parsing: {source["name"]} (id: {source["id"]}), count: {len(feed.entries)}' |
| 137 | + ) |
| 138 | + |
| 139 | + # process feed items |
| 140 | + # download article & create database document |
| 141 | + for feed_item in feed.entries: |
| 142 | + new_article = process_feed_item(feed_item, source, articles, db) |
| 143 | + |
| 144 | + if not new_article: |
| 145 | + continue |
| 146 | + |
| 147 | + articles.append(new_article) |
| 148 | + |
| 149 | + # save articles to db |
| 150 | + if articles: |
| 151 | + try: |
| 152 | + response = db.articles.insert_many(articles) |
| 153 | + logger.info( |
| 154 | + f'Saved {len(response.inserted_ids)} article(s) from {source["name"]} to database' |
| 155 | + ) |
| 156 | + except BulkWriteError as bwe: |
| 157 | + logger.exception(bwe) |
| 158 | + logger.error(bwe.details) |
| 159 | + # you can also take this component and do more analysis |
| 160 | + # werrors = bwe.details['writeErrors'] |
| 161 | + # raise |
| 162 | + else: |
| 163 | + logger.info(f'No new articles found in {source["name"]}') |
| 164 | + |
| 165 | + # close connection to db client |
| 166 | + client.close() |
| 167 | + |
| 168 | + |
| 169 | +if __name__ == "__main__": |
| 170 | + init() |
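
For context, init() expects the file at config.SOURCES_PATH to hold a JSON array of feed descriptors, and scrape_source() only reads the url, name and id fields of each entry. A minimal sketch of one such entry and a manual invocation follows; the field values are placeholders inferred from this diff, not taken from the repository's actual sources file:

# Hypothetical feed descriptor; field names inferred from scrape_source(),
# values are placeholders rather than the project's real data.
example_source = {
    'id': 'example-feed',                  # stored as 'src' on each saved article
    'name': 'Example Newspaper',           # used in log messages
    'url': 'https://example.com/rss.xml'   # RSS/Atom feed URL passed to feedparser.parse()
}

# Crawl a single source outside the thread pool, e.g. while debugging:
# scrape_source(example_source)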