meldonization
diff --git a/‎.gitignore
Lines changed: 2 additions & 1 deletion b/‎.gitignore
Lines changed: 2 additions & 1 deletion
diff --git a/‎README.md
Lines changed: 116 additions & 3 deletions b/‎README.md
Lines changed: 116 additions & 3 deletions
diff --git a/‎annotations.jpg
116 KB b/‎annotations.jpg
116 KB
diff --git a/‎depdf/api.py
Lines changed: 1 addition & 1 deletion b/‎depdf/api.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎depdf/base.py
Lines changed: 4 additions & 0 deletions b/‎depdf/base.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎depdf/components/image.py
Lines changed: 1 addition & 1 deletion b/‎depdf/components/image.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎depdf/components/paragraph.py
Lines changed: 10 additions & 4 deletions b/‎depdf/components/paragraph.py
Lines changed: 10 additions & 4 deletions
diff --git a/‎depdf/components/span.py
Lines changed: 3 additions & 3 deletions b/‎depdf/components/span.py
Lines changed: 3 additions & 3 deletions
diff --git a/‎depdf/components/table.py
Lines changed: 4 additions & 0 deletions b/‎depdf/components/table.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎depdf/config.py
Lines changed: 8 additions & 0 deletions b/‎depdf/config.py
Lines changed: 8 additions & 0 deletions
@@ -1,4 +1,5 @@
 .DS_Store
 .idea
 __pycache__
-*.py[cod]
+*.py[cod]
+temp_depdf/
@@ -1,5 +1,118 @@
-# depdf
+# DePDF
 
-An ultimate pdf file disintegration tool. Yet able to extract pages embedded with tables and paragraphs into structured markup language.
+An ultimate PDF file disintegration tool. DePDF is designed to extract the tables and paragraphs embedded in PDF pages into a structured markup language (e.g. HTML). You can also use it to convert a single page or an entire PDF to HTML.
 
-Built on [`pdfplumber`](https://github.com/jsvine/pdfplumber)
+Built on top of [`pdfplumber`](https://github.com/jsvine/pdfplumber)
+
+# Table of Contents
+[toc]
+
+
+# Installation
+`pip install depdf`
+
+# Example
+```python
+from depdf import DePDF
+from depdf import DePage
+
+# general
+with DePDF.load('test/test_general.pdf') as pdf:
+    pdf_html = pdf.to_html
+    print(pdf_html)
+
+# with dedicated configurations
+c = Config(
+    debug_flag=True,
+    verbose_flag=True,
+    add_line_flag=True
+)
+pdf = DePDF.load('test/test_general.pdf', config=c)
+page_index = 23  # start from zero
+page = pdf.pages[page_index]
+page_soup = page.soup
+print(page_soup.text)
+```
+
+
+# APIs
+| **functions** | usage |
+|:---:|---|
+| `extract_page_paragraphs` | extract paragraphs from specific page |
+| `extract_page_tables` | extract tables from specific page |
+| `convert_pdf_to_html` | convert the entire pdf to html | 
+| `convert_page_to_html` | convert specific page to html | 
+
+
+# In-Depth
+
+## In-page elements
+* Paragraph
+    + Text
+    + Span
+* Table
+    + Cell
+* Image
+
+## Common properties
+| **property & method** | explanation |
+|:---:|---|
+| `html` | converted html string |
+| `soup` | converted BeautifulSoup object |
+| `bbox` | bounding box region | 
+| `save_html` | write html tag to local file| 
+
+## DePDF HTML structure
+```html
+<div class="{pdf_class}">
+    %for <!--page-{pid}-->
+        <div id="page-{}" class="{}">
+            %for {html_elements} endfor%
+        </div>
+    endfor%
+</div>
+```
+
+## DePage HTML element structure
+
+### Paragraph
+```html
+<p>
+    {paragraph-content}
+    <span> {span-content} </span>
+    ... 
+</p>
+```
+
+### Table
+```html
+<table>
+    <tr>
+        <td> {cell_0_0} </td>
+        <td> {cell_0_1} </td>
+        ...
+    </tr>
+    <tr colspan=2>
+        <td> {cell_1_0} </td>
+        ...
+    </tr>
+    ...
+</table>
+```
+
+### Image
+```html
+<img src="temp_depdf/$prefix.png"></img>
+```
+# Appendix
+
+## DePage element denotations
+> Useful element properties within page
+
+![page element](annotations.jpg)
+
+## todo
+
+* [ ] add support for multi-column PDF pages
+* [ ] better table structure recognition
+* [x] recognize embedded objects inside page elements
@@ -21,7 +21,7 @@ def wrapper(pdf_file_path, *args, **kwargs):
         elif isinstance(pdf_file_path, PDF):
             pdf = DePDF(pdf_file_path, config=config, **kwargs)
         elif isinstance(pdf_file_path, str):
-            pdf = DePDF.open(pdf_file_path, config=config, **kwargs)
+            pdf = DePDF.load(pdf_file_path, config=config, **kwargs)
         else:
             raise PDFTypeError
         res = api_func(pdf, pid) if pid > 0 else api_func(pdf)
 
@@ -61,6 +61,10 @@ def html(self, html_value):
     def soup(self):
         return convert_html_to_soup(self._html)
 
+    def write_to(self, file_name):
+        with open(file_name, "w") as file:
+            file.write(self.html)
+
     @property
     def to_dict(self):
         return {
 
@@ -15,7 +15,7 @@ def __init__(self, bbox=None, src='', pid=1, img_idx=1, scan=False, config=None)
         width = bbox[2] - bbox[0]
         img_id = 'page-{pid}-image-{img_idx}'.format(pid=pid, img_idx=img_idx)
         img_class = '{img_class} page-{pid}'.format(img_class=getattr(config, 'image_class'), pid=pid)
-        html = '<img id="{img_id}" class="{img_class}" src={src} width={width}>'.format(
+        html = '<img id="{img_id}" class="{img_class}" src="{src}" width="{width}">'.format(
             img_id=img_id, img_class=img_class, src=src, width=width
         )
         html += '</img>'
 
@@ -10,15 +10,17 @@ class Paragraph(InnerWrapper, Box):
     object_type = 'paragraph'
 
     @check_config
-    def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_objects=None, style=None):
+    def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_objects=None, style=None, align=None):
         para_id = 'page-{pid}-paragraph-{para_id}'.format(pid=pid, para_id=para_idx)
         para_class = '{para_class} page-{pid}'.format(para_class=getattr(config, 'paragraph_class'), pid=pid)
-        style = construct_style(style=style)
-        html = '<p id="{para_id}" class="{para_class}"{style}>'.format(
-            para_id=para_id, para_class=para_class, style=style
+        style_text = construct_style(style=style)
+        align_text = ' align="{}"'.format(align) if align else ''
+        html = '<p id="{para_id}" class="{para_class}"{align_text}{style_text}>'.format(
+            para_id=para_id, para_class=para_class, style_text=style_text, align_text=align_text
         )
         self.pid = pid
         self.para_id = para_idx
+        self.config = config
         self.bbox = bbox
         if text:
             self.text = text
@@ -34,3 +36,7 @@ def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_obj
 
     def __repr__(self):
         return '<depdf.Paragraph: ({}, {})>'.format(self.pid, self.para_id)
+
+    def save_html(self):
+        paragraph_file_name = '{}_page_{}_paragraph_{}.html'.format(self.config.unique_prefix, self.pid, self.para_id)
+        return super().write_to(paragraph_file_name)
@@ -14,7 +14,7 @@ def __init__(self, bbox=None, span_text='', config=None, style=None):
         self.bbox = bbox
         self.text = span_text
         span_class = getattr(config, 'span_class')
-        style = construct_style(style=style)
-        self.html = '<span class="{span_class}{style}">{span_text}</span>'.format(
-            span_class=span_class, span_text=span_text, style=style
+        style_text = construct_style(style=style)
+        self.html = '<span class="{span_class}"{style_text}>{span_text}</span>'.format(
+            span_class=span_class, span_text=span_text, style_text=style_text
         )
@@ -45,6 +45,10 @@ def to_dict(self):
         ]
         return table_dict
 
+    def save_html(self):
+        table_file_name = '{}_page_{}_table_{}.html'.format(self.config.unique_prefix, self.pid, self.tid)
+        return super().write_to(table_file_name)
+
     @property
     def html(self):
         if not self._html and hasattr(self, 'to_html'):
 
@@ -1,4 +1,5 @@
 from functools import wraps
+import os
 
 from depdf.base import Base
 from depdf.error import ConfigTypeError
@@ -43,6 +44,9 @@ class Config(Base):
     vertical_double_line_tolerance = DEFAULT_VERTICAL_DOUBLE_LINE_TOLERANCE  # used in page class
     table_cell_merge_tolerance = DEFAULT_TABLE_CELL_MERGE_TOLERANCE
     skip_empty_table = DEFAULT_SKIP_EMPTY_TABLE
+    add_vertical_lines_flag = DEFAULT_ADD_VERTICAL_LINES_FLAG
+    add_horizontal_lines_flag = DEFAULT_ADD_HORIZONTAL_LINES_FLAG
+    add_horizontal_line_tolerance = DEFAULT_ADD_HORIZONTAL_LINE_TOLERANCE
 
     # image
     min_image_size = DEFAULT_MIN_IMAGE_SIZE
@@ -74,6 +78,10 @@ def __init__(self, **kwargs):
         self.update(**kwargs)
         self._kwargs = kwargs
 
+        # create temporary folder
+        if not os.path.isdir(self.temp_dir_prefix):
+            os.mkdir(self.temp_dir_prefix)
+
         # set logging level by log_level parameter
         logging.getLogger('depdf').setLevel(self.log_level)
Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@ def __init__(self, bbox=None, src='', pid=1, img_idx=1, scan=False, config=None)`
`15`	`15`	`width = bbox[2] - bbox[0]`
`16`	`16`	`img_id = 'page-{pid}-image-{img_idx}'.format(pid=pid, img_idx=img_idx)`
`17`	`17`	`img_class = '{img_class} page-{pid}'.format(img_class=getattr(config, 'image_class'), pid=pid)`
`18`		`- html = '<img id="{img_id}" class="{img_class}" src={src} width={width}>'.format(`
	`18`	`+ html = '<img id="{img_id}" class="{img_class}" src="{src}" width="{width}">'.format(`
`19`	`19`	`img_id=img_id, img_class=img_class, src=src, width=width`
`20`	`20`	`)`
`21`	`21`	`html += '</img>'`