radiofrance-podcasts/radiofrance.py at master · nilshamerlinck/radiofrance-podcasts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import codecs, locale, os, sys
# Replace stdout with a stream writer that encodes everything written to it as UTF-8.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

import httplib
import urllib2
from cookielib import CookieJar
import urlparse

import re

# When true, get_podcast() prints the intermediate values it computes;
# main() sets this from the --verbose command-line option.
VERBOSE = False
# Shared URL opener; each fetching function creates it with get_opener()
# the first time it finds this still unset, then reuses it.
OPENER = None

def get_opener():
    """Build a URL opener that keeps cookies and sends a fixed set of headers.

    Returns:
        The opener produced by ``urllib2.build_opener`` with a cookie
        processor bound to a fresh ``CookieJar``.
    """
    cookie_handler = urllib2.HTTPCookieProcessor(CookieJar())
    browser = urllib2.build_opener(cookie_handler)

    # Replace the opener's default header list with these three headers.
    browser.addheaders = [
        ('Content-type', 'application/x-www-form-urlencoded'),
        ('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'),
        ('Accept', 'text/plain, text/html'),
    ]
    return browser

def get_podcast(page):
    """Resolve the audio URL behind a programme page.

    The page is fetched to find its numeric content id, then the
    ``/player/export-reecouter`` document for that id on the same host is
    fetched and parsed for the audio link.

    Args:
        page: URL of the programme page.

    Returns:
        The audio URL found in the player document.

    Raises:
        ValueError: if no content id can be found in ``page``.
    """
    host = urlparse.urlsplit(page).netloc  # e.g. www.franceculture.fr

    content_id = get_content_id_from_page(page)
    if VERBOSE:
        print(content_id)

    # Without an id the player URL would be built with the literal text
    # "None"; fail here with an explicit message instead.
    if content_id is None:
        raise ValueError('no content id found in page: %s' % page)

    iframe = 'http://%s/player/export-reecouter?content=%s' % (host, content_id)
    if VERBOSE:
        print(iframe)

    data = get_data_from_iframe(iframe)
    podcast = data['url']

    if VERBOSE:
        # Show a download command using the suggested output file name.
        print('wget -O %s "%s"' % (data['output_file'], podcast))

    return podcast

def get_content_id_from_page(page):
    """Fetch a programme page and extract the id of its replayable content.

    The page body is decoded as UTF-8 and searched for links of the form
    ``href="/player/reecouter?play=<digits>"``.

    Args:
        page: URL of the programme page to download.

    Returns:
        The content id as an ``int``, or ``None`` when the page contains no
        matching player link.
    """
    global OPENER
    if not OPENER:
        OPENER = get_opener()

    resp = OPENER.open(page)
    try:
        html = resp.read().decode('utf-8')
    finally:
        # Release the connection even when reading or decoding fails.
        resp.close()

    player_link = re.compile(r'href="/player/reecouter\?play=(?P<content_id>\d+)"', re.UNICODE)
    ids = player_link.findall(html)
    if not ids:
        return None

    # With several player links, take the second one to skip the news
    # bulletin.
    # e.g. http://www.franceinter.fr/emission-a-ton-age-23-ans
    chosen = ids[0] if len(ids) == 1 else ids[1]

    return int(chosen)

def get_data_from_iframe(iframe):
    """Fetch a player document and extract the audio URL and a file name.

    Two page layouts are recognised: one with an ``<a id="player-link">``
    anchor, and a fallback one with an ``<a id="player">`` anchor.

    Args:
        iframe: URL of the player document to download.

    Returns:
        A dict with two keys: ``'url'`` (the audio link) and
        ``'output_file'`` (a path taken from the page with ``.mp3`` appended).

    Raises:
        ValueError: if the audio link or the path used for the output file
            name cannot be found in the document.
    """
    global OPENER
    if not OPENER:
        OPENER = get_opener()

    resp = OPENER.open(iframe)
    try:
        html = resp.read().decode('utf-8')
    finally:
        # Release the connection even when reading or decoding fails.
        resp.close()

    # First layout.
    # e.g. http://www.franceculture.fr/emission-sur-les-docks-polyandrie-23-%C2%AB-les-na-de-chine-le-fantasme-de-la-femme-liberee-%C2%BB-2014-03-05
    url_match = re.compile(r'<a id="player-link" href="(?P<url>[^"]+)"').search(html)
    if url_match:
        more_re = re.compile(r'<a href="/(?P<more>[^"]+)" class="more" target="_blank">', re.UNICODE)
    else:
        # Other case: a segment inside a programme.
        # e.g. http://www.franceinter.fr/emission-a-ton-age-23-ans
        url_match = re.compile(r'<a id="player" data-diffusion-id="" href="(?P<url>[^"]+)"').search(html)
        more_re = re.compile(r'<span class="path-diffusion">/(?P<more>.+)</span>')

    # Report unrecognised documents explicitly rather than failing on a
    # ``None`` match object.
    if not url_match:
        raise ValueError('no audio link found in player document: %s' % iframe)

    more_match = more_re.search(html)
    if not more_match:
        raise ValueError('no output file name found in player document: %s' % iframe)

    return {
        'url': url_match.group('url'),
        'output_file': '%s.mp3' % more_match.group('more'),
    }

def main():
    """Command-line entry point: print the audio URL for each page given.

    Every positional argument is passed to ``get_podcast`` and the result is
    printed on its own line. ``--verbose`` sets the module-level ``VERBOSE``
    flag. When no positional argument is given, the usage message is printed.

    Returns:
        None.
    """
    global VERBOSE

    from optparse import OptionParser

    # The slice drops the leading newline and the trailing newline plus
    # two-space indent that surround the text inside the literal.
    usage = """
%prog [ options ]

Exemple:

$ python radiofrance.py http://www.franceculture.fr/emission-le-tete-a-tete-sophie-calle-rediffusion-de-l-emission-du-30-septembre-2012-2013-08-18
  """[1:-3]

    parser = OptionParser(usage=usage)
    parser.add_option('--verbose',
                      help='verbose',
                      default=VERBOSE,
                      action='store_true',
                      dest='verbose')

    options, args = parser.parse_args()
    VERBOSE = options.verbose

    if not args:
        # print_usage() expands the %prog placeholder; printing the raw
        # usage string would show it literally.
        parser.print_usage()

    for arg in args:
        print(get_podcast(arg))

# Run main() only when executed as a script, and hand its return value to
# sys.exit() as the exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)