diff --git a/risscraper/scraperallris.py b/risscraper/scraperallris.py index 6f2eb04..f1f5abf 100644 --- a/risscraper/scraperallris.py +++ b/risscraper/scraperallris.py @@ -32,6 +32,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ +import collections import datetime import HTMLParser import logging @@ -144,12 +145,8 @@ def guess_system(self): logging.info("Nothing to guess until now.") def find_person(self): - find_person_url = (self.config['scraper']['base_url'] + - 'kp041.asp?template=xyz&selfaction=ws&showAll=true&' - 'PALFDNRM=1&kpdatfil=&filtdatum=filter&kpname=&' - 'kpsonst=&kpampa=99999999&kpfr=99999999&' - 'kpamfr=99999999&kpau=99999999&kpamau=99999999&' - 'searchForm=true&search=Suchen') + # example: https://ksd.rostock.de/bi/kp041.asp?selfaction=ws + find_person_url = self.config['scraper']['base_url'] + 'kp041.asp?selfaction=ws' logging.info("Getting person overview from %s", find_person_url) """parse an XML file and return the tree""" @@ -288,6 +285,27 @@ def get_person_organization(self, person_id=None, organization_url=None): % (self.config['scraper']['base_url'], person_id)) logging.info("Getting person organization from %s", url) + # maps name of type to form name and membership type + membership = collections.namedtuple('Membership', ('mtype', 'field')) + membership_map = { + u'Rat der Stadt' : membership('parliament', 'PALFDNR'), + u'Parlament' : membership('parliament', 'PALFDNR'), + u'Bürgerschaft' : membership('parliament', 'PALFDNR'), + u'Fraktion' : membership('organisation', 'FRLFDNR'), + u'Fraktionen': membership('parliament', 'FRLFDNR'), + u'Ausschüsse' : membership('organization', 'AULFDNR'), + u'Stadtbezirk': membership('parliament', 'PALFDNR'), + u'BVV': membership('parliament', 'PALFDNR'), + u'Bezirksparlament': membership('parliament', 'PALFDNR'), + u'Bezirksverordnetenversammlung': membership('parliament', + 'PALFDNR'), + u'Ortsbeiräte': membership('organization', 'AULFDNR'), + u'Aufsichtsräte': membership('organization', 'AULFDNR'), + u'sonstige Gremien': membership('organization', 'AULFDNR'), + # At least in Rostock there can be an empty organization type. + # see: https://ksd.rostock.de/bi/kp020.asp?KPLFDNR=300&history=true + u'': membership('organization', 'AULFDNR'), + } # Stupid re-try concept because AllRis sometimes misses start < at # tags at first request. try_counter = 0 @@ -296,55 +314,48 @@ def get_person_organization(self, person_id=None, organization_url=None): response = self.get_url(url) if not url: return - tree = html.fromstring(response.text) + text = response.text.encode('ascii', 'xmlcharrefreplace') + tree = html.fromstring(text) memberships = [] person = Person(originalId=person_id) - # maps name of type to form name and membership type - type_map = { - u'Rat der Stadt' : {'mtype' : 'parliament', - 'field' : 'PALFDNR'}, - u'Parlament' : {'mtype' : 'parliament', - 'field' : 'PALFDNR'}, - u'Fraktion' : {'mtype' : 'organisation', - 'field' : 'FRLFDNR'}, - 'Fraktionen': {'mtype' : 'parliament', 'field' : 'FRLFDNR'}, - u'Ausschüsse' : {'mtype' : 'organization', - 'field' : 'AULFDNR'}, - 'Stadtbezirk': {'mtype' : 'parliament', - 'field' : 'PALFDNR'}, - 'BVV': {'mtype' : 'parliament', 'field' : 'PALFDNR'}, - 'Bezirksparlament': {'mtype' : 'parliament', - 'field' : 'PALFDNR'}, - 'Bezirksverordnetenversammlung': {'mtype' : 'parliament', - 'field' : 'PALFDNR'} - } + # Different versions contain different "main" divs: + # Rostock (ALLRIS net Version 3.8.8): "rismain" + # others: "rismain_raw" + for key in ("rismain_raw", "rismain"): + # There are three tables on this page: + # Anschrift, Sonstiges, Mitarbeit + # We are interested in "Mitarbeit". + table = tree.xpath('//*[@id="%s"]/table[2]' % key) + if table: + break # obtain the table with the membership list via a simple state machine - mtype = "parliament" - field = 'PALFDNR' - # for checking if it changes - old_group_id = None - # for checking if it changes - old_group_name = None - # might break otherwise - group_id = None - table = tree.xpath('//*[@id="rismain_raw"]/table[2]') - if len(table): + if table: table = table[0] + mtype = None + field = None + # for checking if it changes + old_group_id = None + # for checking if it changes + old_group_name = None + # might break otherwise + group_id = None for line in table.findall("tr"): if line[0].tag == "th": - what = line[0].text.strip() + # This is a subtitle for the following memberships. + # Carefully look inside - maybe it is empty. + what = (line[0].text or "").strip() field = None field_list = None - if what in type_map: - mtype = type_map[what]['mtype'] - field = type_map[what]['field'] + if what in membership_map: + mtype = membership_map[what].mtype + field = membership_map[what].field elif 'Wahlperiode' in what: mtype = 'parliament' # 'FRLFDNR' field_list = ['KPLFDNR', 'AULFDNR'] - elif "Auskünfte gemäß BVV" in what: + elif u"Auskünfte gemäß BVV" in what: break else: logging.error("Unknown organization type %s " @@ -352,8 +363,23 @@ def get_person_organization(self, person_id=None, organization_url=None): what, person_id) continue else: + """ + This is a membership description consisting of + organization icon, organization name, role and + timespan. + Example: +
+ + +
+ + Ausschuss für Stadt- und Regionalentwicklung, Umwelt und Ordnung + Mitglied  + 14.07.1999 - 13.12.1999  + """ if "Keine Information" in line.text_content(): # skip because no content is available + # Typically "Fraktion" is undefined. continue # Empty line = strange stuff comes after this @@ -407,25 +433,28 @@ def get_person_organization(self, person_id=None, organization_url=None): membership.originalId = (unicode(person_id) + '-' + unicode(group_id)) - # TODO: create a list of functions so we can + # TODO: create a list of roles so we can # index them somehow - function = line[2].text_content() + role = line[2].text_content() raw_date = line[3].text_content() # parse the date information if "seit" in raw_date: + # Example: "seit 02.07.2014" dparts = raw_date.split() membership.endDate = dparts[-1] elif "Keine" in raw_date or not raw_date.strip(): # no date information available start_date = end_date = None else: + # Example: "14.07.1999 - 13.12.1999" dparts = raw_date.split() membership.startDate = dparts[0] membership.endDate = dparts[-1] if organization.originalId is not None: memberships.append(membership) else: - logging.warn("Bad organization at %s", url) + logging.warn("Bad organization (%s): %s", + url, line.text_content()) person.membership = memberships oid = self.db.save_person(person) @@ -603,7 +632,7 @@ def get_paper(self, paper_url=None, paper_id=None): logging.warn("Paper %s in %s seems to private", paper_id, paper_url) return - text = response.text + text = response.text.encode('ascii', 'xmlcharrefreplace') doc = html.fromstring(text) data = {}