Commit 513bdcd

simplify directory structure and crawl command

1 parent 8669bc0

9 files changed: +179, −181 lines

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.2.0] - 2021-02-12
+### Added
+- Shell script that executes the crawl command
+### Changed
+- Directory structure
+- Simplified crawl command
+
 ## [1.1.0] - 2021-02-11
 ### Added
 - Three new config variables (`MONGO_OUTSIDE_PORT`, `MAX_WORKERS`, `KEEP_DAYS`). See [readme](README.md) for details.

README.md

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ Development environment is ready to be used with [VSCode](https://code.visualstu
 3. Run `docker-compose up --build` to create the crawler- and database-container (`-d` to detach the docker process).
 
 ## Usage
-To start the crawling process simply run `docker-compose run --rm crawler python -m app crawl` (`-d` to detach the docker process). Ideally execute this command as a cron job.
+To start the crawling process simply run `docker-compose run --rm crawler ./crawl.sh` (`-d` after `--rm` to detach the docker process). Ideally execute this command as a cron job.
 
 ## Configuration
 Environment variables in `.env`:
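
The README recommends running the crawl as a cron job. A minimal crontab sketch for that setup could look like the line below; the schedule, project path, and log file are assumptions for illustration, not part of this commit:

    # run the crawler every night at 03:00 (example schedule, adjust paths as needed)
    0 3 * * * cd /path/to/project && docker-compose run --rm crawler ./crawl.sh >> /var/log/crawler-cron.log 2>&1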

crawler/Dockerfile

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@ WORKDIR /code
 # Copy the source code of the project into the container.
 COPY . .
 
+RUN chmod +x crawl.sh
+
 # Persist bash history (see docker-compose file)
 RUN SNIPPET="export PROMPT_COMMAND='history -a' && export HISTFILE=/commandhistory/.bash_history" \
 && echo $SNIPPET >> "/root/.bashrc"

crawler/app/__main__.py

Lines changed: 164 additions & 13 deletions
@@ -1,19 +1,170 @@
-import sys
-from pathlib import Path
+import json
+from time import time
+import logging
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor
+from dateutil.parser import parse as parseDate
+from dateutil.parser import ParserError
+import feedparser
+import newspaper
+from newspaper import Article, Config
+from pymongo.errors import BulkWriteError
 
+from app.logger import create_rotating_log
+from app import config
+from app.database.create_client import create_client
+from app.random_user_agent import random_user_agent
 
-if len(sys.argv) <= 1:
-    print('No arguments provided')
-    sys.exit()
 
+create_rotating_log(config.LOGS_PATH.joinpath("crawlerlog"))
+logger = logging.getLogger("rotating_log")
 
-# add current diretory path to python path to execute "app" from any location
-sys.path.append(Path.cwd().as_posix())
+MIN_TEXT_LENGTH = 350
 
-# parse args
-cmd = sys.argv[1]
 
-# process args
-if cmd == 'crawl':
-    from app import crawler
-    crawler.init()
+def init():
+    ts = time()
+    logger.info('Downloading new articles')
+    logger.info(f'Ignoring articles older than {config.KEEP_DAYS} days')
+
+    if config.MAX_WORKERS:
+        logger.info(f'Maximum crawler workers: {config.MAX_WORKERS}')
+
+    # load sources
+    with open(config.SOURCES_PATH, encoding='utf-8') as f:
+        sources = json.load(f)
+
+    with ThreadPoolExecutor(max_workers=config.MAX_WORKERS) as executor:
+        executor.map(scrape_source, sources)
+
+    logger.info(f"Downloading done in {time() - ts}")
+
+
+def create_newspaper_config():
+    newspaper_config = Config()
+    newspaper_config.browser_user_agent = random_user_agent()
+    newspaper_config.language = 'de'
+
+    return newspaper_config
+
+
+def process_feed_item(feed_item, source, articles_in_memory, db):
+    # check for duplicate in db
+    if db.articles.find_one({'url': feed_item.link}) is not None:
+        logger.debug('Skip: article already exists')
+        return False
+
+    # check if link exists already in memory
+    if any(a['url'] == feed_item.link for a in articles_in_memory):
+        logger.debug('Skip: article already exists')
+        return False
+
+    # parse article
+    try:
+        article = Article(feed_item.link, config=create_newspaper_config())
+        article.download()
+        article.parse()
+    except newspaper.article.ArticleException as exc:
+        logger.debug(f'Newspaper error: {exc}')
+        # logger.exception(exc)
+        return False
+
+    # check title
+    article_title = article.title.strip()
+    if not article_title:
+        logger.debug('Skip: no title or text')
+        return False
+
+    # check text
+    article_text = article.text.strip()
+    if len(article_text) < MIN_TEXT_LENGTH:
+        logger.debug('Skip: text too short')
+        return False
+
+    # must have date
+    published_at_val = None
+    if article.publish_date:
+        # get from parsed article
+        published_at_val = article.publish_date
+    elif hasattr(feed_item, 'published'):
+        # get from feed item
+        published_at_val = feed_item.published
+
+    if not published_at_val:
+        logger.debug('Skip: missing date')
+        return False
+
+    # normalize date, create datetime object, remove time zone
+    if isinstance(published_at_val, datetime):
+        published_at = published_at_val.replace(tzinfo=None)
+    elif isinstance(published_at_val, str):
+        try:
+            published_at = parseDate(published_at_val, ignoretz=True)
+        except ParserError as exc:
+            logger.debug(f'Dateutil parse error: {exc}')
+            return False
+    else:
+        logger.debug('No valid date found')
+        return False
+
+    # date must be within last n days
+    difference = datetime.now() - published_at
+    if difference.days > config.KEEP_DAYS:
+        logger.debug(
+            f'Skip: Article older than {config.KEEP_DAYS} days ({published_at})'
+        )
+        return False
+
+    # create new item
+    return {
+        'title': article_title,
+        'published_at': published_at,
+        'created_at': datetime.now(),
+        'url': feed_item.link,
+        'src': source['id'],
+        'text': article_text
+    }
+
+
+def scrape_source(source):
+    [client, db] = create_client() # pyMongo not thread safe
+    articles = []
+
+    # loop feed
+    feed = feedparser.parse(source['url'])
+    logger.info(
+        f'Parsing: {source["name"]} (id: {source["id"]}), count: {len(feed.entries)}'
+    )
+
+    # process feed items
+    # download article & create database document
+    for feed_item in feed.entries:
+        new_article = process_feed_item(feed_item, source, articles, db)
+
+        if not new_article:
+            continue
+
+        articles.append(new_article)
+
+    # save articles to db
+    if articles:
+        try:
+            response = db.articles.insert_many(articles)
+            logger.info(
+                f'Saved {len(response.inserted_ids)} article(s) from {source["name"]} to database'
+            )
+        except BulkWriteError as bwe:
+            logger.exception(bwe)
+            logger.error(bwe.details)
+            # you can also take this component and do more analysis
+            # werrors = bwe.details['writeErrors']
+            # raise
+    else:
+        logger.info(f'No new articles found in {source["name"]}')
+
+    # close connection to db client
+    client.close()
+
+
+if __name__ == "__main__":
+    init()
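
`create_client` is imported from `app.database.create_client` but is not touched by this commit, so it does not appear in the diff. Going by the `[client, db]` unpacking and the `client.close()` call in `scrape_source`, it presumably returns a MongoDB client plus a database handle; a purely hypothetical sketch, in which the connection string and database name are assumptions for illustration only:

    # hypothetical sketch of app/database/create_client.py (not part of this diff)
    import os
    from pymongo import MongoClient

    def create_client():
        # one client per worker thread, matching the comment in scrape_source()
        client = MongoClient(os.environ.get("MONGO_URI", "mongodb://mongo:27017"))
        db = client["crawler"]  # assumed database name
        return [client, db]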

crawler/app/config.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 ROOT_PATH = Path.cwd()
 ROOT_PATH = ROOT_PATH.joinpath('app')
 
-SOURCES_PATH = ROOT_PATH.joinpath('assets', 'sources.json')
+SOURCES_PATH = ROOT_PATH.joinpath('sources.json')
 LOGS_PATH = ROOT_PATH.joinpath('logs')
 
 # general
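
`SOURCES_PATH` now points at `app/sources.json`. That file is not part of this diff, but `__main__.py` loads it with `json.load` and reads `id`, `name`, and `url` from each entry, so it is presumably a list of feed descriptors along these lines (field names taken from the code, values purely hypothetical):

    [
      {
        "id": "example-source",
        "name": "Example News",
        "url": "https://example.com/rss/feed.xml"
      }
    ]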

crawler/app/crawler/__init__.py

Lines changed: 0 additions & 166 deletions
This file was deleted.

Two other files were renamed without changes.

crawler/crawl.sh

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+#!/bin/bash
+set -e
+
+python -m app
