21 changes: 16 additions & 5 deletions README.md
@@ -1,10 +1,21 @@
# NYTdiff
# NYTdiff+

Code for the twitter bot [@nyt_diff](https://twitter.com/nyt_diff).
Based on @j-e-d's code for the twitter bot [@nyt_diff](https://twitter.com/nyt_diff).
RSS feed fetching added for @xuv's twitter bot [@lesoir_diff](https://twitter.com/lesoir_diff).

The [phantomjs](http://phantomjs.org/) binary needs to be installed and the path updated in the run_diff.sh file.
[Twitter keys](https://dev.twitter.com/) are needed.
A [NYT API](http://developers.nytimes.com/) key for the "Top Stories V2" service is needed for The New York Times.
An RSS URL is needed for [Le Soir](http://lesoir.be) or any other news website.

[Twitter keys](https://dev.twitter.com/) and the [NYT API](http://developers.nytimes.com/) key for the "Top Stories V2" service are needed; the values of these keys need to be entered in the run_diff.sh file.
Installation
------------
+ The [phantomjs](http://phantomjs.org/) binary needs to be installed and the path updated in the run_diff.sh file.
+ `pip install -r requirements.txt`

Font: [Merriweather](https://fonts.google.com/specimen/Merriweather). Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/).

Credits
-------
+ Original script and idea: @j-e-d Juan E.D. http://unahormiga.com/
+ RSS fetching: @xuv Julien Deswaef http://xuv.be
+ Font: [Merriweather](https://fonts.google.com/specimen/Merriweather)
+ Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/).
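
For reference, here is a minimal sketch of how the values exported by run_diff.sh (shown at the bottom of this diff) might be read from Python, mirroring the tweepy setup in main() below. The consumer-key variable names are assumptions; only the access-token, NYT_API_KEY and RSS_URL names are visible in this diff.

```python
import os

import tweepy

# Assumed names for the consumer credentials; run_diff.sh only shows the
# access-token, NYT_API_KEY and RSS_URL exports in this diff.
consumer_key = os.environ['NYT_TWITTER_CONSUMER_KEY']          # assumption
consumer_secret = os.environ['NYT_TWITTER_CONSUMER_SECRET']    # assumption
access_token = os.environ['NYT_TWITTER_ACCESS_TOKEN']
access_token_secret = os.environ['NYT_TWITTER_ACCESS_TOKEN_SECRET']
rss_url = os.environ['RSS_URL']

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
twitter_api = tweepy.API(auth)
```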
13 changes: 7 additions & 6 deletions css/styles.css
@@ -1,12 +1,12 @@
@font-face {
font-family: Merriweather;
@font-face {
font-family: Merriweather;
font-style: normal;
font-weight: normal;
src: url('../fonts/Merriweather-Regular.ttf') format("truetype");
}
src: url('../fonts/Merriweather-Regular.ttf') format("truetype");
}

body {
background: lightgray url('../img/paper_fibers.png') repeat;
body {
background: lightgray url('../img/paper_fibers.png') repeat;
font-family: Merriweather;
font-size: 16px;
}
@@ -17,6 +17,7 @@ p {
margin-top: 1em;
margin-bottom: 1em;
font-weight: normal;
word-wrap: break-word;
}

del {
146 changes: 133 additions & 13 deletions nytdiff.py
@@ -18,7 +18,9 @@
from simplediff import html_diff
from selenium import webdriver

TIMEZONE = 'America/Buenos_Aires'
import feedparser

TIMEZONE = 'Europe/Brussels'
LOCAL_TZ = timezone(TIMEZONE)
MAX_RETRIES = 10
RETRY_DELAY = 3
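
The only constant that changes in this hunk is the bot's timezone, switched from Buenos Aires to Brussels for @lesoir_diff. LOCAL_TZ = timezone(TIMEZONE) suggests pytz (the import sits above the visible hunk); under that assumption, a quick sketch of how the localized date_time values stored with each article version would be produced:

```python
from datetime import datetime

from pytz import timezone  # assumed: the timezone import is outside this hunk

TIMEZONE = 'Europe/Brussels'
LOCAL_TZ = timezone(TIMEZONE)

# Each article version is stamped in the bot's local timezone,
# e.g. 2016-11-05T14:30:00+01:00 (or +02:00 during DST).
print(datetime.now(LOCAL_TZ).isoformat())
```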
@@ -124,6 +126,7 @@ def tweet_with_media(self, text, images, reply_to=None):
def tweet_text(self, text):
if TESTING:
print (text)
return True
try:
tweet_id = self.api.update_status(status=text)
except:
@@ -143,11 +146,17 @@ def tweet(self, text, article_id, url, column='id'):
if reply_to is None:
logging.info('Tweeting url: %s', url)
tweet = self.tweet_text(url)
reply_to = tweet.id
# if TESTING, give a random id based on time
reply_to = tweet.id if not TESTING else time.time()
logging.info('Replying to: %s', reply_to)
tweet = self.tweet_with_media(text, images, reply_to)
logging.info('Id to store: %s', tweet.id)
self.update_tweet_db(article_id, tweet.id, column)
if TESTING:
# if TESTING, give a random id based on time
tweet_id = time.time()
else:
tweet_id = tweet.id
logging.info('Id to store: %s', tweet_id)
self.update_tweet_db(article_id, tweet_id, column)
return

def get_page(self, url, header=None, payload=None):
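
The edits above let the tweet() flow run end-to-end in TESTING mode: nothing is posted, and a time-based placeholder stands in for the missing tweet id so update_tweet_db() still gets a value to store. A condensed sketch of that pattern, detached from the tweepy objects used here:

```python
import time

TESTING = True  # mirrors the module-level flag in nytdiff.py

def id_to_store(tweet, testing=TESTING):
    """Return the real tweet id, or a time-based placeholder for dry runs.

    In TESTING mode nothing is posted (tweet_text() returns early), so a
    timestamp keeps the ids handed to update_tweet_db() unique.
    """
    return time.time() if testing else tweet.id
```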
@@ -302,7 +311,7 @@ def store_data(self, data):
ORDER BY version DESC \
LIMIT 1' % (data['article_id']))
for row in result:
data['version'] = row['version'] + 1
data['version'] = row['version']
self.versions_table.insert(data)
url = data['url']
if row['url'] != data['url']:
@@ -363,6 +372,115 @@ def parse_pages(self):
if loop:
self.remove_old('article_id')

class RSSParser(BaseParser):
def __init__(self, api, rss_url):
BaseParser.__init__(self, api)
self.urls = [rss_url]
self.articles_table = self.db['rss_ids']
self.versions_table = self.db['rss_versions']

def entry_to_dict(self, article):
article_dict = dict()
article_dict['article_id'] = article.id.split(' ')[0]
article_dict['url'] = article.link
article_dict['title'] = article.title
article_dict['abstract'] = self.strip_html(article.description)
article_dict['author'] = article.author
# article_dict['illustration'] = article.media_content[0]['url']
# article_dict['illustartion_size'] = article.media_content[0]['filesize']
od = collections.OrderedDict(sorted(article_dict.items()))
article_dict['hash'] = hashlib.sha224(
repr(od.items()).encode('utf-8')).hexdigest()
article_dict['date_time'] = datetime.now(LOCAL_TZ)
return article_dict

def store_data(self, data):
if self.articles_table.find_one(
article_id=data['article_id']) is None: # New
article = {
'article_id': data['article_id'],
'add_dt': data['date_time'],
'status': 'home',
'tweet_id': None
}
self.articles_table.insert(article)
logging.info('New article tracked: %s', data['url'])
data['version'] = 1
self.versions_table.insert(data)
else:
# re-insert: the article was seen before but is currently marked as removed
if self.articles_table.find_one(article_id=data['article_id'],
status='removed') is not None:
article = {
'article_id': data['article_id'],
'add_dt': data['date_time'],
}

count = self.versions_table.count(
self.versions_table.table.columns.article_id == data[
'article_id'],
hash=data['hash'])
if count == 1: # Existing
pass
else: # Changed
result = self.db.query('SELECT * \
FROM rss_versions\
WHERE article_id = "%s" \
ORDER BY version DESC \
LIMIT 1' % (data['article_id']))
for row in result:
data['version'] = row['version'] + 1
self.versions_table.insert(data)
url = data['url']
if row['title'] != data['title']:
if self.show_diff(row['title'], data['title']):
tweet_text = "Modification du Titre"
self.tweet(tweet_text, data['article_id'], url,
'article_id')
if row['abstract'] != data['abstract']:
if self.show_diff(row['abstract'], data['abstract']):
tweet_text = "Modification de la Description"
self.tweet(tweet_text, data['article_id'], url,
'article_id')
if row['author'] != data['author']:
if self.show_diff(row['author'], data['author']):
tweet_text = "Modification de l'auteur"
self.tweet(tweet_text, data['article_id'], url,
'article_id')
if row['url'] != data['url']:
if self.show_diff(row['url'], data['url']):
tweet_text = "Modification d'URL"
self.tweet(tweet_text, data['article_id'], url,
'article_id')

def loop_entries(self, entries):
if len(entries) == 0:
return False
for article in entries:
try:
article_dict = self.entry_to_dict(article)
if article_dict is not None:
self.store_data(article_dict)
self.current_ids.add(article_dict['article_id'])
except BaseException as e:
logging.exception('Problem looping RSS: %s', article)
print('Exception: {}'.format(str(e)))
print('***************')
print(article)
print('***************')
return False
return True

def parse_rss(self):
r = feedparser.parse(self.urls[0])
if r is None:
logging.warning('Empty response RSS')
return
else:
logging.info('Parsing %s', r.feed.title)
loop = self.loop_entries(r.entries)
if loop:
self.remove_old('article_id')

def main():
# logging
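
Taken together, entry_to_dict() and store_data() fingerprint every feed item by hashing the fields the bot tracks, then tweet a diff whenever that fingerprint changes between two polls of the feed. A standalone sketch of the idea, using a hypothetical feed URL and assuming the feed exposes the same id/link/title/description/author fields this parser relies on:

```python
import collections
import hashlib

import feedparser

def fingerprint(entry):
    """Hash the tracked fields so that any edit changes the digest."""
    fields = {
        'article_id': entry.id.split(' ')[0],
        'url': entry.link,
        'title': entry.title,
        'abstract': entry.description,  # nytdiff.py strips HTML tags first
        'author': entry.author,         # assumes the feed provides an author
    }
    ordered = collections.OrderedDict(sorted(fields.items()))
    return hashlib.sha224(repr(ordered.items()).encode('utf-8')).hexdigest()

feed = feedparser.parse('https://example.org/rss.xml')  # hypothetical URL
seen = {}  # article_id -> last digest; stands in for the dataset tables
for entry in feed.entries:
    key = entry.id.split(' ')[0]
    digest = fingerprint(entry)
    if key in seen and seen[key] != digest:
        print('Changed:', entry.link)  # the bot tweets a rendered diff here
    seen[key] = digest
```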
@@ -380,17 +498,19 @@ def main():
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.secure = True
auth.set_access_token(access_token, access_token_secret)
nyt_api = tweepy.API(auth)
logging.debug('NYT Twitter API configured')
twitter_api = tweepy.API(auth)
logging.debug('Twitter API configured')

try:
logging.debug('Starting NYT')
nyt_api_key = os.environ['NYT_API_KEY']
nyt = NYTParser(nyt_api, nyt_api_key)
nyt.parse_pages()
logging.debug('Finished NYT')
logging.debug('Starting RSS')
#nyt_api_key = os.environ['NYT_API_KEY']
#nyt = NYTParser(nyt_api, nyt_api_key)
rss_url = os.environ['RSS_URL']
rss = RSSParser(twitter_api, rss_url)
rss.parse_rss()
logging.debug('Finished RSS')
except:
logging.exception('NYT')
logging.exception('RSS')

logging.info('Finished script')

1 change: 1 addition & 0 deletions requirements.txt
@@ -1,6 +1,7 @@
alembic==0.8.7
bleach==1.4.3
dataset==0.6.4
feedparser==5.2.1
html5lib==0.9999999
Mako==1.0.4
MarkupSafe==0.23
1 change: 1 addition & 0 deletions run_diff.sh
@@ -7,6 +7,7 @@ export NYT_TWITTER_ACCESS_TOKEN=""
export NYT_TWITTER_ACCESS_TOKEN_SECRET=""

export NYT_API_KEY=""
export RSS_URL=""

export PHANTOMJS_PATH="./"
