21 changes: 16 additions & 5 deletions README.md
@@ -1,10 +1,21 @@
# NYTdiff
# NYTdiff+

Code for the twitter bot [@nyt_diff](https://twitter.com/nyt_diff).
Based on @j-e-d's code for the twitter bot [@nyt_diff](https://twitter.com/nyt_diff).
RSS feed fetching added for @xuv's twitter bot [@lesoir_diff](https://twitter.com/lesoir_diff).

The [phantomjs](http://phantomjs.org/) binary needs to be installed and the path updated in the run_diff.sh file.
[Twitter keys](https://dev.twitter.com/) are needed.
A [NYT API](http://developers.nytimes.com/) key for the "Top Stories V2" service is needed for The New York Times.
An RSS URL is needed for [Le Soir](http://lesoir.be) or any other news website.

[Twitter keys](https://dev.twitter.com/) and the [NYT API](http://developers.nytimes.com/) key for the "Top Stories V2" service are needed; the values of these keys need to be entered in the run_diff.sh file.
Installation
------------
+ The [phantomjs](http://phantomjs.org/) binary needs to be installed and the path updated in the run_diff.sh file.
+ `pip install -r requirements.txt`

Font: [Merriweather](https://fonts.google.com/specimen/Merriweather). Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/).

Credits
-------
+ Original script and idea: @j-e-d Juan E.D. http://unahormiga.com/
+ RSS fetching: @xuv Julien Deswaef http://xuv.be
+ Font: [Merriweather](https://fonts.google.com/specimen/Merriweather)
+ Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/).
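
For reference, here is a minimal sketch of how the values exported by run_diff.sh (shown at the bottom of this diff) might be read from Python, mirroring the tweepy setup in main() below. The consumer-key variable names are assumptions; only the access-token, NYT_API_KEY and RSS_URL names are visible in this diff.

```python
import os

import tweepy

# Assumed names for the consumer credentials; run_diff.sh only shows the
# access-token, NYT_API_KEY and RSS_URL exports in this diff.
consumer_key = os.environ['NYT_TWITTER_CONSUMER_KEY']          # assumption
consumer_secret = os.environ['NYT_TWITTER_CONSUMER_SECRET']    # assumption
access_token = os.environ['NYT_TWITTER_ACCESS_TOKEN']
access_token_secret = os.environ['NYT_TWITTER_ACCESS_TOKEN_SECRET']
rss_url = os.environ['RSS_URL']

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
twitter_api = tweepy.API(auth)
```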
13 changes: 7 additions & 6 deletions css/styles.css
@@ -1,12 +1,12 @@
@font-face {
font-family: Merriweather;
@font-face {
font-family: Merriweather;
font-style: normal;
font-weight: normal;
src: url('../fonts/Merriweather-Regular.ttf') format("truetype");
}
src: url('../fonts/Merriweather-Regular.ttf') format("truetype");
}

body {
background: lightgray url('../img/paper_fibers.png') repeat;
body {
background: lightgray url('../img/paper_fibers.png') repeat;
font-family: Merriweather;
font-size: 16px;
}
@@ -17,6 +17,7 @@ p {
margin-top: 1em;
margin-bottom: 1em;
font-weight: normal;
word-wrap: break-word;
}

del {
146 changes: 133 additions & 13 deletions nytdiff.py
@@ -18,7 +18,9 @@
from simplediff import html_diff
from selenium import webdriver

TIMEZONE = 'America/Buenos_Aires'
import feedparser

TIMEZONE = 'Europe/Brussels'
LOCAL_TZ = timezone(TIMEZONE)
MAX_RETRIES = 10
RETRY_DELAY = 3
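
The only constant that changes in this hunk is the bot's timezone, switched from Buenos Aires to Brussels for @lesoir_diff. LOCAL_TZ = timezone(TIMEZONE) suggests pytz (the import sits above the visible hunk); under that assumption, a quick sketch of how the localized date_time values stored with each article version would be produced:

```python
from datetime import datetime

from pytz import timezone  # assumed: the timezone import is outside this hunk

TIMEZONE = 'Europe/Brussels'
LOCAL_TZ = timezone(TIMEZONE)

# Each article version is stamped in the bot's local timezone,
# e.g. 2016-11-05T14:30:00+01:00 (or +02:00 during DST).
print(datetime.now(LOCAL_TZ).isoformat())
```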
@@ -124,6 +126,7 @@ def tweet_with_media(self, text, images, reply_to=None):
def tweet_text(self, text):
if TESTING:
print (text)
return True
try:
tweet_id = self.api.update_status(status=text)
except:
@@ -143,11 +146,17 @@ def tweet(self, text, article_id, url, column='id'):
if reply_to is None:
logging.info('Tweeting url: %s', url)
tweet = self.tweet_text(url)
reply_to = tweet.id
# if TESTING, give a random id based on time
reply_to = tweet.id if not TESTING else time.time()
logging.info('Replying to: %s', reply_to)
tweet = self.tweet_with_media(text, images, reply_to)
logging.info('Id to store: %s', tweet.id)
self.update_tweet_db(article_id, tweet.id, column)
if TESTING:
# if TESTING, give a random id based on time
tweet_id = time.time()
else:
tweet_id = tweet.id
logging.info('Id to store: %s', tweet_id)
self.update_tweet_db(article_id, tweet_id, column)
return

def get_page(self, url, header=None, payload=None):
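
The edits above let the tweet() flow run end-to-end in TESTING mode: nothing is posted, and a time-based placeholder stands in for the missing tweet id so update_tweet_db() still gets a value to store. A condensed sketch of that pattern, detached from the tweepy objects used here:

```python
import time

TESTING = True  # mirrors the module-level flag in nytdiff.py

def id_to_store(tweet, testing=TESTING):
    """Return the real tweet id, or a time-based placeholder for dry runs.

    In TESTING mode nothing is posted (tweet_text() returns early), so a
    timestamp keeps the ids handed to update_tweet_db() unique.
    """
    return time.time() if testing else tweet.id
```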
@@ -302,7 +311,7 @@ def store_data(self, data):
ORDER BY version DESC \
LIMIT 1' % (data['article_id']))
for row in result:
data['version'] = row['version'] + 1
data['version'] = row['version']
self.versions_table.insert(data)
url = data['url']
if row['url'] != data['url']:
@@ -363,6 +372,115 @@ def parse_pages(self):
if loop:
self.remove_old('article_id')

class RSSParser(BaseParser):
def __init__(self, api, rss_url):
BaseParser.__init__(self, api)
self.urls = [rss_url]
self.articles_table = self.db['rss_ids']
self.versions_table = self.db['rss_versions']

def entry_to_dict(self, article):
article_dict = dict()
article_dict['article_id'] = article.id.split(' ')[0]
article_dict['url'] = article.link
article_dict['title'] = article.title
article_dict['abstract'] = self.strip_html(article.description)
article_dict['author'] = article.author
# article_dict['illustration'] = article.media_content[0]['url']
# article_dict['illustartion_size'] = article.media_content[0]['filesize']
od = collections.OrderedDict(sorted(article_dict.items()))
article_dict['hash'] = hashlib.sha224(
repr(od.items()).encode('utf-8')).hexdigest()
article_dict['date_time'] = datetime.now(LOCAL_TZ)
return article_dict

def store_data(self, data):
if self.articles_table.find_one(
article_id=data['article_id']) is None: # New
article = {
'article_id': data['article_id'],
'add_dt': data['date_time'],
'status': 'home',
'tweet_id': None
}
self.articles_table.insert(article)
logging.info('New article tracked: %s', data['url'])
data['version'] = 1
self.versions_table.insert(data)
else:
# re-insert: the article was seen before but is currently marked as removed
if self.articles_table.find_one(article_id=data['article_id'],
status='removed') is not None:
article = {
'article_id': data['article_id'],
'add_dt': data['date_time'],
}

count = self.versions_table.count(
self.versions_table.table.columns.article_id == data[
'article_id'],
hash=data['hash'])
if count == 1: # Existing
pass
else: # Changed
result = self.db.query('SELECT * \
FROM rss_versions\
WHERE article_id = "%s" \
ORDER BY version DESC \
LIMIT 1' % (data['article_id']))
for row in result:
data['version'] = row['version'] + 1
self.versions_table.insert(data)
url = data['url']
if row['title'] != data['title']:
if self.show_diff(row['title'], data['title']):
tweet_text = "Modification du Titre"
self.tweet(tweet_text, data['article_id'], url,
'article_id')
if row['abstract'] != data['abstract']:
if self.show_diff(row['abstract'], data['abstract']):
tweet_text = "Modification de la Description"
self.tweet(tweet_text, data['article_id'], url,
'article_id')
if row['author'] != data['author']:
if self.show_diff(row['author'], data['author']):
tweet_text = "Modification de l'auteur"
self.tweet(tweet_text, data['article_id'], url,
'article_id')
if row['url'] != data['url']:
if self.show_diff(row['url'], data['url']):
tweet_text = "Modification d'URL"
self.tweet(tweet_text, data['article_id'], url,
'article_id')

def loop_entries(self, entries):
if len(entries) == 0:
return False
for article in entries:
try:
article_dict = self.entry_to_dict(article)
if article_dict is not None:
self.store_data(article_dict)
self.current_ids.add(article_dict['article_id'])
except BaseException as e:
logging.exception('Problem looping RSS: %s', article)
print('Exception: {}'.format(str(e)))
print('***************')
print(article)
print('***************')
return False
return True

def parse_rss(self):
r = feedparser.parse(self.urls[0])
if r is None:
logging.warning('Empty response RSS')
return
else:
logging.info('Parsing %s', r.feed.title)
loop = self.loop_entries(r.entries)
if loop:
self.remove_old('article_id')

def main():
# logging
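
Taken together, entry_to_dict() and store_data() fingerprint every feed item by hashing the fields the bot tracks, then tweet a diff whenever that fingerprint changes between two polls of the feed. A standalone sketch of the idea, using a hypothetical feed URL and assuming the feed exposes the same id/link/title/description/author fields this parser relies on:

```python
import collections
import hashlib

import feedparser

def fingerprint(entry):
    """Hash the tracked fields so that any edit changes the digest."""
    fields = {
        'article_id': entry.id.split(' ')[0],
        'url': entry.link,
        'title': entry.title,
        'abstract': entry.description,  # nytdiff.py strips HTML tags first
        'author': entry.author,         # assumes the feed provides an author
    }
    ordered = collections.OrderedDict(sorted(fields.items()))
    return hashlib.sha224(repr(ordered.items()).encode('utf-8')).hexdigest()

feed = feedparser.parse('https://example.org/rss.xml')  # hypothetical URL
seen = {}  # article_id -> last digest; stands in for the dataset tables
for entry in feed.entries:
    key = entry.id.split(' ')[0]
    digest = fingerprint(entry)
    if key in seen and seen[key] != digest:
        print('Changed:', entry.link)  # the bot tweets a rendered diff here
    seen[key] = digest
```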
@@ -380,17 +498,19 @@ def main():
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.secure = True
auth.set_access_token(access_token, access_token_secret)
nyt_api = tweepy.API(auth)
logging.debug('NYT Twitter API configured')
twitter_api = tweepy.API(auth)
logging.debug('Twitter API configured')

try:
logging.debug('Starting NYT')
nyt_api_key = os.environ['NYT_API_KEY']
nyt = NYTParser(nyt_api, nyt_api_key)
nyt.parse_pages()
logging.debug('Finished NYT')
logging.debug('Starting RSS')
#nyt_api_key = os.environ['NYT_API_KEY']
#nyt = NYTParser(nyt_api, nyt_api_key)
rss_url = os.environ['RSS_URL']
rss = RSSParser(twitter_api, rss_url)
rss.parse_rss()
logging.debug('Finished RSS')
except:
logging.exception('NYT')
logging.exception('RSS')

logging.info('Finished script')

1 change: 1 addition & 0 deletions requirements.txt
@@ -1,6 +1,7 @@
alembic==0.8.7
bleach==1.4.3
dataset==0.6.4
feedparser==5.2.1
html5lib==0.9999999
Mako==1.0.4
MarkupSafe==0.23
1 change: 1 addition & 0 deletions run_diff.sh
@@ -7,6 +7,7 @@ export NYT_TWITTER_ACCESS_TOKEN=""
export NYT_TWITTER_ACCESS_TOKEN_SECRET=""

export NYT_API_KEY=""
export RSS_URL=""

export PHANTOMJS_PATH="./"
