diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..67169d0
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,21 @@
+# Container image for the Python GEDCOM parser.
+# PYTHON_VERSION picks the tag of the official python:<version>-alpine base image.
+ARG PYTHON_VERSION=3.7
+FROM python:${PYTHON_VERSION}-alpine
+
+WORKDIR /usr/src/app
+
+# Unprivileged account for the runtime process (busybox addgroup/adduser syntax).
+RUN addgroup -S app && adduser -S -G app app
+
+# Install dependencies before copying the sources so this layer stays cached
+# until requirements.txt changes.
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+# Everything that needed root is done; run the default command unprivileged.
+USER app
+
+CMD [ "python", "./gedcom/__init__.py" ]
diff --git a/README.md b/README.md
index e715033..55b3d15 100644
--- a/README.md
+++ b/README.md
@@ -33,10 +33,94 @@ from gedcom import Gedcom
file_path = '' # Path to your `.ged` file
gedcom = Gedcom(file_path)
+```
+
+### GEDCOM Quirks
+
+Large sites like Ancestry and MyHeritage (among others) don't always produce perfectly formatted GEDCOM files. If you encounter errors in parsing, you might consider disabling strict parsing, which will make a best effort to parse the file:
+
+
+```python
+from gedcom import Gedcom
+
+file_path = '' # Path to your `.ged` file
+gedcom = Gedcom(file_path, False) # Disable strict parsing
+```
+
+Disabling strict parsing will allow the parser to gracefully handle the following quirks:
+
+- Multi-line fields that don't use CONC or CONT
+- A last line that does not end in a CRLF
-# Then run methods on `gedcom` ... :)
+### Iterate through all records, search last names and print matches
+
+```python
+all_records = gedcom.get_root_child_elements()
+for record in all_records:
+    if record.is_individual():
+        if record.surname_match('Brodie'):
+            (first, last) = record.get_name()
+            print(first + " " + last)
```
+## Reference
+
+The Element class contains all the information for a single record in the GEDCOM file, for example an individual.
+
+### Single Record Methods
+
+Method | Parameters | Returns | Description
+-----------------------|------------|---------|------------
+get_child_elements | none | List of Element | Returns all the child elements of this record
+get_parent_element | none | Element | Returns parent Element
+new_child_element | String tag, String pointer, String value | Element | Create a new Element
+add_child_element | Element child | Element | Adds the child record
+set_parent_element | Element parent| none | Not normally required to be called (add_child_element calls this automatically)
+is_individual | none | Boolean | Is this record of a person
+is_family | none | Boolean |
+is_file | none | Boolean |
+is_object | none | Boolean |
+is_private | none | Boolean | Returns True if the record is marked Private
+is_deceased | none | Boolean | Returns True if the individual is marked deceased
+criteria_match | colon separated string "surname=[name]:name=[name]:birth=[year]:birth_range=[year-to-year]:death=[year]:death_range=[year-to-year]"| Boolean | Returns True if the criteria match
+surname_match | String | Boolean | Returns True if substring matches
+given_match | String | Boolean | Returns True if substring matches
+death_range_match | Int from, Int to | Boolean | Returns True if Death Year is in the supplied range
+death_year_match | Int | Boolean | Returns True if Death Year equals parameter
+birth_range_match | Int from, Int to | Boolean | Returns True if Birth Year is in the supplied range
+birth_year_match | Int | Boolean | Returns True if Birth Year equals parameter
+get_name | none | (String given, String surname) | Returns the Given name(s) and Surname in a tuple
+get_gender | none | String | Returns individual's gender
+get_birth_data | none | (String date, String place, Array sources) | Returns a tuple of the birth data
+get_birth_year | none | Int | Returns the Birth Year
+get_death_data | none | (String date, String place, Array sources) | Returns a tuple of the death data
+get_death_year | none | Int | Returns the Death Year
+get_burial | none | (String date, String place, Array sources) | Returns a tuple of the burial data
+get_census | none | List [String date, String place, Array sources] | Returns a List of tuple of the census data
+get_last_change_date | none | String | Returns the date of the last update to this individual
+get_occupation | none | String | Returns the individual's occupation
+get_individual | none | Individual | Returns the individual
+
+### Gedcom operations
+
+Method | Parameters | Returns | Description
+------------------------|------------|---------|------------
+get_root_element | none | Element root | Returns the virtual "root" element
+get_root_child_elements | none | List of Element | Returns a List of all Elements
+get_element_dictionary | none | Dict of Element | Returns a Dict of all Elements
+get_element_list | none | List of Element | Returns a List of all Elements
+get_marriages | Element individual | List of Marriage ("Date", "Place") | Returns List of Tuples of Marriage data (Date and Place)
+find_path_to_ancestors | Element descendant, Element ancestor||
+get_family_members | Element individual, optional String members_type - one of "ALL" (default), "PARENTS", "HUSB", "WIFE", "CHIL" | List of Element individuals |
+get_parents | Element individual, optional String parent_type - one of "ALL" (default) or "NAT" | List of Element individuals|
+get_ancestors | Element individual, optional String ancestor_type - one of "All" (default) or "NAT" ||
+get_families | Element individual, optional String family_type - one of "FAMS" (default), "FAMC"||
+marriage_range_match | Element individual, Int from, Int to| Boolean | Check if individual is married within the specified range
+marriage_year_match | Element individual, Int year| Boolean | Check if individual is married in the year specified
+get_marriage_years | Element individual |List of Int| Returns Marriage event years
+print_gedcom | none | none | Prints the gedcom to STDOUT
+save_gedcom | String filename | none | Writes gedcom to specified filename
+
## History
This module was originally based on a GEDCOM parser written by
@@ -44,8 +128,16 @@ Daniel Zappala at Brigham Young University (Copyright (C) 2005) which
was licensed under the GPL v2 and then continued by
[Mad Price Ball](https://github.com/madprime) in 2012.
+Further updates by [Nicklas Reincke](https://github.com/nickreynke) and [Damon Brodie](https://github.com/nomadyow) in 2018.
+
## Changelog
+**v0.2.2dev**
+
+- Support BOM control characters
+- Support the last line not having a CR and/or LF
+- Support incorrect line splitting generated by Ancestry. Insert CONT/CONC tag as necessary
+
**v0.2.1dev**
- Changed broken links to GEDCOM format specification ([#2](https://github.com/nickreynke/python-gedcom/issues/2))
@@ -70,6 +162,7 @@ was licensed under the GPL v2 and then continued by
Licensed under the [GNU General Public License v2](http://www.gnu.org/licenses/gpl-2.0.html)
**Python GEDCOM Parser**
+
Copyright (C) 2018 Damon Brodie (damon.brodie at gmail.com)
Copyright (C) 2018 Nicklas Reincke (contact at reynke.com)
Copyright (C) 2016 Andreas Oberritter
Copyright (C) 2012 Madeleine Price Ball
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..4c35a33
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,8 @@
+# Compose definition with a single service built from the Dockerfile in this directory.
+version: '2'
+services:
+  # Bind-mount the project directory at /usr/src/app; ":z" relabels it as shared content for SELinux.
+  python:
+    build: ./
+    volumes:
+      - ./:/usr/src/app:z
diff --git a/gedcom/__init__.py b/gedcom/__init__.py
index f1d78d1..a775f6e 100644
--- a/gedcom/__init__.py
+++ b/gedcom/__init__.py
@@ -1,5 +1,6 @@
# Python GEDCOM Parser
#
+# Copyright (C) 2018 Damon Brodie (damon.brodie at gmail.com)
# Copyright (C) 2018 Nicklas Reincke (contact at reynke.com)
# Copyright (C) 2016 Andreas Oberritter
# Copyright (C) 2012 Madeleine Price Ball
@@ -137,7 +138,7 @@ class Gedcom:
- a dict (only elements with pointers, which are the keys)
"""
- def __init__(self, file_path):
+ def __init__(self, file_path, use_strict = True):
"""Initialize a GEDCOM data object. You must supply a GEDCOM file
:type file_path: str
"""
@@ -145,7 +146,8 @@ def __init__(self, file_path):
self.__element_dictionary = {}
self.invalidate_cache()
self.__root_element = Element(-1, "", "ROOT", "")
- self.__parse(file_path)
+ self.__parse(file_path, use_strict)
+ self.__use_strict = use_strict
def invalidate_cache(self):
"""Cause get_element_list() and get_element_dictionary() to return updated data
@@ -213,7 +215,7 @@ def get_root_child_elements(self):
# Private methods
- def __parse(self, file_path):
+ def __parse(self, file_path, use_strict = True):
"""Open and parse file path as GEDCOM 5.5 formatted data
:type file_path: str
"""
@@ -221,11 +223,11 @@ def __parse(self, file_path):
line_number = 1
last_element = self.__root_element
for line in gedcom_file:
- last_element = self.__parse_line(line_number, line.decode('utf-8'), last_element)
+ last_element = self.__parse_line(line_number, line.decode('utf-8-sig'), last_element, use_strict )
line_number += 1
@staticmethod
- def __parse_line(line_number, line, last_element):
+ def __parse_line(line_number, line, last_element, use_strict = True):
"""Parse a line from a GEDCOM 5.5 formatted document
Each line should have the following (bracketed items optional):
@@ -257,17 +259,47 @@ def __parse_line(line_number, line, last_element):
regex_match = regex.match(gedcom_line_regex, line)
if regex_match is None:
- error_message = ("Line `%d` of document violates GEDCOM format 5.5" % line_number +
- "\nSee: https://chronoplexsoftware.com/gedcomvalidator/gedcom/gedcom-5.5.pdf")
- raise SyntaxError(error_message)
-
- line_parts = regex_match.groups()
-
- level = int(line_parts[0])
- pointer = line_parts[1].rstrip(' ')
- tag = line_parts[2]
- value = line_parts[3][1:]
- crlf = line_parts[4]
+ if use_strict:
+ error_message = ("Line %d of document violates GEDCOM format 5.5" % line_number +
+ "\nSee: https://chronoplexsoftware.com/gedcomvalidator/gedcom/gedcom-5.5.pdf")
+ raise SyntaxError(error_message)
+ else:
+ # Quirk check - see if this is a line without a CRLF (which could be the last line)
+ last_line_regex = level_regex + pointer_regex + tag_regex + value_regex
+ regex_match = regex.match(last_line_regex, line)
+ if regex_match is not None:
+ line_parts = regex_match.groups()
+
+ level = int(line_parts[0])
+ pointer = line_parts[1].rstrip(' ')
+ tag = line_parts[2]
+ value = line_parts[3][1:]
+ crlf = '\n'
+ else:
+ # Quirk check - Sometimes a gedcom has a text field with a CR.
+ # This creates a line without the standard level and pointer. If this is detected
+ # then turn it into a CONC or CONT
+ line_regex = '([^\n\r]*|)'
+ cont_line_regex = line_regex + end_of_line_regex
+ regex_match = regex.match(cont_line_regex, line)
+ line_parts = regex_match.groups()
+ level = last_element.get_level()
+ tag = last_element.get_tag()
+ pointer = None
+ value = line_parts[0][1:]
+ crlf = line_parts[1]
+ if tag != GEDCOM_TAG_CONTINUED and tag != GEDCOM_TAG_CONCATENATION:
+ # Increment level and change this line to a CONC
+ level += 1
+ tag = GEDCOM_TAG_CONCATENATION
+ else:
+ line_parts = regex_match.groups()
+
+ level = int(line_parts[0])
+ pointer = line_parts[1].rstrip(' ')
+ tag = line_parts[2]
+ value = line_parts[3][1:]
+ crlf = line_parts[4]
# Check level: should never be more than one higher than previous line.
if level > last_element.get_level() + 1:
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e69de29
diff --git a/setup.py b/setup.py
index d84dcd9..5266523 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
setup(
name='python-gedcom',
- version='0.2.0dev',
+ version='0.2.2dev',
packages=['gedcom', ],
license='GPLv2',
package_dir={'': '.'},