diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..67169d0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +ARG PYTHON_VERSION=3.7 +FROM python:${PYTHON_VERSION}-alpine + +WORKDIR /usr/src/app + +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +CMD [ "python", "./gedcom/__init__.py" ] diff --git a/README.md b/README.md index e715033..55b3d15 100644 --- a/README.md +++ b/README.md @@ -33,10 +33,94 @@ from gedcom import Gedcom file_path = '' # Path to your `.ged` file gedcom = Gedcom(file_path) +``` + +### GEDCOM Quirks + +Large sites like Ancestry and MyHeritage (among others) don't always produce perfectly formatted GEDCOM files. If you encounter errors in parsing, you might consider disabling strict parsing, which will make a best effort to parse the file: + + +```python +from gedcom import Gedcom + +file_path = '' # Path to your `.ged` file +gedcom = Gedcom(file_path, False) # Disable strict parsing +``` + +Disabling strict parsing will allow the parser to gracefully handle the following quirks: + +- Multi-line fields that don't use CONC or CONT +- A last line that does not end in a CRLF -# Then run methods on `gedcom` ... :) +### Iterate through all records, search last names and print matches + +```python +all_records = gedcom.get_root_child_elements() +for record in all_records: + if record.is_individual(): + if record.surname_match('Brodie'): + (first, last) = record.get_name() + print(first + " " + last) ``` +## Reference + +The Element class contains all the information for a single record in the GEDCOM file, for example an individual. 
+ +### Single Record Methods + +Method | Parameters | Returns | Description +-----------------------|------------|---------|------------ +get_child_elements | none | List of Element | Returns all the child elements of this record +get_parent_element | none | Element | Returns parent Element +new_child_element | String tag, String pointer, String value | Element | Create a new Element +add_child_element | Element child | Element | Adds the child record +set_parent_element | Element parent| none | Not normally required to be called (add_child_element calls this automatically) +is_individual | none | Boolean | Is this record of a person +is_family | none | Boolean | +is_file | none | Boolean | +is_object | none | Boolean | +is_private | none | Boolean | Returns True if the record is marked Private +is_deceased | none | Boolean | Returns True if the individual is marked deceased +criteria_match | colon separated string "surname=[name]:name=[name]:birth=[year]:birth_range=[year-to-year]:death=[year]:death_range=[year-to-year]"| Boolean | Returns True if the criteria match +surname_match | String | Boolean | Returns True if substring matches +given_match | String | Boolean | Returns True if substring matches +death_range_match | Int from, Int to | Boolean | Returns True if Death Year is in the supplied range +death_year_match | Int | Boolean | Returns True if Death Year equals parameter +birth_range_match | Int from, Int to | Boolean | Returns True if Birth Year is in the supplied range +birth_year_match | Int | Boolean | Returns True if Birth Year equals parameter +get_name | none | (String given, String surname) | Returns the Given name(s) and Surname in a tuple +get_gender | none | String | Returns individual's gender +get_birth_data | none | (String date, String place, Array sources) | Returns a tuple of the birth data +get_birth_year | none | Int | Returns the Birth Year +get_death_data | none | (String date, String place, Array sources) | Returns a tuple of the 
death data +get_death_year | none | Int | Returns the Death Year +get_burial | none | (String date, String place, Array sources) | Returns a tuple of the burial data +get_census | none | List [String date, String place, Array sources] | Returns a List of tuples of the census data +get_last_change_date | none | String | Returns the date of the last update to this individual +get_occupation | none | String | Returns the individual's occupation +get_individual | none | Individual | Returns the individual + +### Gedcom operations + +Method | Parameters | Returns | Description +------------------------|------------|---------|------------ +get_root_element | none | Element root | Returns the virtual "root" individual +get_root_child_elements | none | List of Element | Returns a List of all Elements +get_element_dictionary | none | Dict of Element | Returns a Dict of all Elements +get_element_list | none | List of Element | Returns a List of all Elements +get_marriages | Element individual | List of Marriage ("Date", "Place") | Returns List of Tuples of Marriage data (Date and Place) +find_path_to_ancestors | Element descendant, Element ancestor|| +get_family_members | Element individual, optional String members_type - one of "ALL" (default), "PARENTS", "HUSB", "WIFE", "CHIL" | List of Element individuals|| +get_parents | Element individual, optional String parent_type - one of "ALL" (default) or "NAT" | List of Element individuals| +get_ancestors | Element individual, optional String ancestor_type - one of "ALL" (default) or "NAT" || +get_families | Element individual, optional String family_type - one of "FAMS" (default), "FAMC"|| +marriage_range_match | Element individual, Int from, Int to| Boolean | Check if individual is married within the specified range +marriage_year_match | Element individual, Int year| Boolean | Check if individual is married in the year specified +get_marriage_years | Element individual |List of Int| Returns Marriage event years +print_gedcom | 
none | none | Prints the gedcom to STDOUT +save_gedcom | String filename | none | Writes gedcom to specified filename + ## History This module was originally based on a GEDCOM parser written by @@ -44,8 +128,16 @@ Daniel Zappala at Brigham Young University (Copyright (C) 2005) which was licensed under the GPL v2 and then continued by [Mad Price Ball](https://github.com/madprime) in 2012. +Further updates by [Nicklas Reincke](https://github.com/nickreynke) and [Damon Brodie](https://github.com/nomadyow) in 2018. + ## Changelog +**v0.2.2dev** + +- Support BOM control characters +- Support the last line not having a CR and/or LF +- Support incorrect line splitting generated by Ancestry. Insert CONT/CONC tag as necessary + **v0.2.1dev** - Changed broken links to GEDCOM format specification ([#2](https://github.com/nickreynke/python-gedcom/issues/2)) @@ -70,6 +162,7 @@ was licensed under the GPL v2 and then continued by Licensed under the [GNU General Public License v2](http://www.gnu.org/licenses/gpl-2.0.html) **Python GEDCOM Parser** +
Copyright (C) 2018 Damon Brodie (damon.brodie at gmail.com)
Copyright (C) 2018 Nicklas Reincke (contact at reynke.com)
Copyright (C) 2016 Andreas Oberritter
Copyright (C) 2012 Madeleine Price Ball diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..4c35a33 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,8 @@ +version: '2' + +services: + + python: + build: ./ + volumes: + - ./:/usr/src/app:z diff --git a/gedcom/__init__.py b/gedcom/__init__.py index f1d78d1..a775f6e 100644 --- a/gedcom/__init__.py +++ b/gedcom/__init__.py @@ -1,5 +1,6 @@ # Python GEDCOM Parser # +# Copyright (C) 2018 Damon Brodie (damon.brodie at gmail.com) # Copyright (C) 2018 Nicklas Reincke (contact at reynke.com) # Copyright (C) 2016 Andreas Oberritter # Copyright (C) 2012 Madeleine Price Ball @@ -137,7 +138,7 @@ class Gedcom: - a dict (only elements with pointers, which are the keys) """ - def __init__(self, file_path): + def __init__(self, file_path, use_strict = True): """Initialize a GEDCOM data object. You must supply a GEDCOM file :type file_path: str """ @@ -145,7 +146,8 @@ def __init__(self, file_path): self.__element_dictionary = {} self.invalidate_cache() self.__root_element = Element(-1, "", "ROOT", "") - self.__parse(file_path) + self.__parse(file_path, use_strict) + self.__use_strict = use_strict def invalidate_cache(self): """Cause get_element_list() and get_element_dictionary() to return updated data @@ -213,7 +215,7 @@ def get_root_child_elements(self): # Private methods - def __parse(self, file_path): + def __parse(self, file_path, use_strict = True): """Open and parse file path as GEDCOM 5.5 formatted data :type file_path: str """ @@ -221,11 +223,11 @@ def __parse(self, file_path): line_number = 1 last_element = self.__root_element for line in gedcom_file: - last_element = self.__parse_line(line_number, line.decode('utf-8'), last_element) + last_element = self.__parse_line(line_number, line.decode('utf-8-sig'), last_element, use_strict ) line_number += 1 @staticmethod - def __parse_line(line_number, line, last_element): + def __parse_line(line_number, line, last_element, use_strict = True): 
"""Parse a line from a GEDCOM 5.5 formatted document Each line should have the following (bracketed items optional): @@ -257,17 +259,47 @@ def __parse_line(line_number, line, last_element): regex_match = regex.match(gedcom_line_regex, line) if regex_match is None: - error_message = ("Line `%d` of document violates GEDCOM format 5.5" % line_number + - "\nSee: https://chronoplexsoftware.com/gedcomvalidator/gedcom/gedcom-5.5.pdf") - raise SyntaxError(error_message) - - line_parts = regex_match.groups() - - level = int(line_parts[0]) - pointer = line_parts[1].rstrip(' ') - tag = line_parts[2] - value = line_parts[3][1:] - crlf = line_parts[4] + if use_strict: + error_message = ("Line %d of document violates GEDCOM format 5.5" % line_number + + "\nSee: https://chronoplexsoftware.com/gedcomvalidator/gedcom/gedcom-5.5.pdf") + raise SyntaxError(error_message) + else: + # Quirk check - see if this is a line without a CRLF (which could be the last line) + last_line_regex = level_regex + pointer_regex + tag_regex + value_regex + regex_match = regex.match(last_line_regex, line) + if regex_match is not None: + line_parts = regex_match.groups() + + level = int(line_parts[0]) + pointer = line_parts[1].rstrip(' ') + tag = line_parts[2] + value = line_parts[3][1:] + crlf = '\n' + else: + # Quirk check - Sometimes a gedcom has a text field with a CR. + # This creates a line without the standard level and pointer. 
If this is detected + # then turn it into a CONC or CONT + line_regex = '([^\n\r]*|)' + cont_line_regex = line_regex + end_of_line_regex + regex_match = regex.match(cont_line_regex, line) + line_parts = regex_match.groups() + level = last_element.get_level() + tag = last_element.get_tag() + pointer = None + value = line_parts[0][1:] + crlf = line_parts[1] + if tag != GEDCOM_TAG_CONTINUED and tag != GEDCOM_TAG_CONCATENATION: + # Increment level and change this line to a CONC + level += 1 + tag = GEDCOM_TAG_CONCATENATION + else: + line_parts = regex_match.groups() + + level = int(line_parts[0]) + pointer = line_parts[1].rstrip(' ') + tag = line_parts[2] + value = line_parts[3][1:] + crlf = line_parts[4] # Check level: should never be more than one higher than previous line. if level > last_element.get_level() + 1: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/setup.py b/setup.py index d84dcd9..5266523 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='python-gedcom', - version='0.2.0dev', + version='0.2.2dev', packages=['gedcom', ], license='GPLv2', package_dir={'': '.'},