Merge pull request #232 from socialcopsdev/pdfminer_kwargs

[MRG] Add option to pass pdfminer kwargs
2018-12-19 18:45:33 +05:30 · 2018-12-19 18:45:33 +05:30 · e89e147b5c
parent d918293fea e0cb935130
commit e89e147b5c
11 changed files with 86 additions and 31 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -4,6 +4,12 @@ Release History
 master
 ------

+**Improvements**
+
+* [#170](https://github.com/socialcopsdev/camelot/issues/170) Add option to pass pdfminer layout kwargs. [#232](https://github.com/socialcopsdev/camelot/pull/232) by Vinayak Mehta.
+    * Keyword arguments for [pdfminer.layout.LAParams](https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33) can now be passed using `layout_kwargs` in `read_pdf()`.
+    * The `margins` keyword argument in `read_pdf()` is now deprecated; pass `char_margin`, `line_margin` and `word_margin` through `layout_kwargs` instead.
+
 0.5.0 (2018-12-13)
 ------------------

--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -125,7 +125,7 @@ class PDFHandler(object):
                with open(fpath, 'wb') as f:
                    outfile.write(f)

-    def parse(self, flavor='lattice', suppress_stdout=False, **kwargs):
+    def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs):
        """Extracts tables by calling parser.get_tables on all single
        page PDFs.

@ -136,6 +136,8 @@ class PDFHandler(object):
            Lattice is used by default.
        suppress_stdout : str (default: False)
            Suppress logs and warnings.
+        layout_kwargs : dict, optional (default: {})
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
        kwargs : dict
            See camelot.read_pdf kwargs.

@ -153,6 +155,7 @@ class PDFHandler(object):
                     for p in self.pages]
            parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
            for p in pages:
-                t = parser.extract_tables(p, suppress_stdout=suppress_stdout)
+                t = parser.extract_tables(p, suppress_stdout=suppress_stdout,
+                                          layout_kwargs=layout_kwargs)
                tables.extend(t)
        return TableList(tables)
--- a/camelot/io.py
+++ b/camelot/io.py
@ -6,7 +6,7 @@ from .utils import validate_input, remove_extra


 def read_pdf(filepath, pages='1', password=None, flavor='lattice',
-             suppress_stdout=False, **kwargs):
+             suppress_stdout=False, layout_kwargs={}, **kwargs):
    """Read PDF and return extracted tables.

    Note: kwargs annotated with ^ can only be used with flavor='stream'
@ -26,6 +26,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
        Lattice is used by default.
    suppress_stdout : bool, optional (default: True)
        Print all logs and warnings.
+    layout_kwargs : dict, optional (default: {})
+        A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@ -77,10 +79,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
        Number of times for erosion/dilation is applied.

        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
-    margins : tuple
-        PDFMiner char_margin, line_margin and word_margin.
-
-        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.

    Returns
    -------
@ -98,5 +96,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
        validate_input(kwargs, flavor=flavor)
        p = PDFHandler(filepath, pages=pages, password=password)
        kwargs = remove_extra(kwargs, flavor=flavor)
-        tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, **kwargs)
+        tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
+                         layout_kwargs=layout_kwargs, **kwargs)
        return tables
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -8,13 +8,11 @@ from ..utils import get_page_layout, get_text_objects
 class BaseParser(object):
    """Defines a base parser.
    """
-    def _generate_layout(self, filename):
+    def _generate_layout(self, filename, layout_kwargs):
        self.filename = filename
+        self.layout_kwargs = layout_kwargs
        self.layout, self.dimensions = get_page_layout(
-            self.filename,
-            char_margin=self.char_margin,
-            line_margin=self.line_margin,
-            word_margin=self.word_margin)
+            filename, **layout_kwargs)
        self.horizontal_text = get_text_objects(self.layout, ltype="lh")
        self.vertical_text = get_text_objects(self.layout, ltype="lv")
        self.pdf_width, self.pdf_height = self.dimensions
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -70,17 +70,13 @@ class Lattice(BaseParser):
        Number of times for erosion/dilation is applied.

        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
-    margins : tuple
-        PDFMiner char_margin, line_margin and word_margin.
-
-        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.

    """
    def __init__(self, table_areas=None, process_background=False,
                 line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
                 split_text=False, flag_size=False, line_close_tol=2,
                 joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
-                 iterations=0, margins=(1.0, 0.5, 0.1), **kwargs):
+                 iterations=0, **kwargs):
        self.table_areas = table_areas
        self.process_background = process_background
        self.line_size_scaling = line_size_scaling
@ -93,7 +89,6 @@ class Lattice(BaseParser):
        self.threshold_blocksize = threshold_blocksize
        self.threshold_constant = threshold_constant
        self.iterations = iterations
-        self.char_margin, self.line_margin, self.word_margin = margins

    @staticmethod
    def _reduce_index(t, idx, shift_text):
@ -348,8 +343,8 @@ class Lattice(BaseParser):

        return table

-    def extract_tables(self, filename, suppress_stdout=False):
-        self._generate_layout(filename)
+    def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
+        self._generate_layout(filename, layout_kwargs)
        if not suppress_stdout:
            logger.info('Processing {}'.format(os.path.basename(self.rootname)))

--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -44,15 +44,10 @@ class Stream(BaseParser):
    col_close_tol : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.
-    margins : tuple, optional (default: (1.0, 0.5, 0.1))
-        PDFMiner char_margin, line_margin and word_margin.
-
-        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.

    """
    def __init__(self, table_areas=None, columns=None, split_text=False,
-                 flag_size=False, row_close_tol=2, col_close_tol=0,
-                 margins=(1.0, 0.5, 0.1), **kwargs):
+                 flag_size=False, row_close_tol=2, col_close_tol=0, **kwargs):
        self.table_areas = table_areas
        self.columns = columns
        self._validate_columns()
@ -60,7 +55,6 @@ class Stream(BaseParser):
        self.flag_size = flag_size
        self.row_close_tol = row_close_tol
        self.col_close_tol = col_close_tol
-        self.char_margin, self.line_margin, self.word_margin = margins

    @staticmethod
    def _text_bbox(t_bbox):
@ -388,8 +382,8 @@ class Stream(BaseParser):

        return table

-    def extract_tables(self, filename, suppress_stdout=False):
-        self._generate_layout(filename)
+    def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
+        self._generate_layout(filename, layout_kwargs)
        if not suppress_stdout:
            logger.info('Processing {}'.format(os.path.basename(self.rootname)))

--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -558,7 +558,7 @@ def compute_whitespace(d):


 def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
-               detect_vertical=True, all_texts=True):
+                    detect_vertical=True, all_texts=True):
    """Returns a PDFMiner LTPage object and page dimension of a single
    page pdf. See https://euske.github.io/pdfminer/ to get definitions
    of kwargs.
--- a/docs/user/advanced.rst
+++ b/docs/user/advanced.rst
@ -524,3 +524,14 @@ We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in
    "4","West Bengal","West Medinipur","iv. Acute Diarrhoeal Disease","145","0","04/01/14","05/01/14","Under control","..."
    "4","West Bengal","Birbhum","v.  Food Poisoning","199","0","31/12/13","31/12/13","Under control","..."
    "4","West Bengal","Howrah","vi. Viral Hepatitis A &E","85","0","26/12/13","27/12/13","Under surveillance","..."
+
+Tweak layout generation
+-----------------------
+
+Camelot is built on top of PDFMiner's ability to group characters on a page into words and sentences. In some cases (such as `#170 <https://github.com/socialcopsdev/camelot/issues/170>`_ and `#215 <https://github.com/socialcopsdev/camelot/issues/215>`_), PDFMiner can split characters that belong to the same sentence into separate sentences.
+
+To deal with such cases, you can tweak PDFMiner's `LAParams kwargs <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ to improve layout generation by passing them as a dict using ``layout_kwargs`` in :meth:`read_pdf() <camelot.read_pdf>`. To learn more about the parameters you can tweak, see the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
+
+::
+
+    >>> tables = camelot.read_pdf('foo.pdf', layout_kwargs={'detect_vertical': False})
--- a/tests/data.py
+++ b/tests/data.py
@ -491,3 +491,43 @@ data_arabic = [
    ['؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ', 'ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ'],
    ['Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic', '']
 ]
+
+data_stream_layout_kwargs = [
+    ['V i n s   a u   Ve r r e', ''],
+    ['Les Blancs', '12.5CL'],
+    ['A.O.P Côtes du Rhône', ''],
+    ['Domaine de la Guicharde «  Autour de la chapelle » 2016', '8 €'],
+    ['A.O.P Vacqueyras', ''],
+    ['Domaine de Montvac  « Melodine » 2016', '10 €'],
+    ['A.O.P Châteauneuf du Pape', ''],
+    ['Domaine de Beaurenard 2017', '13 €'],
+    ['A.O.P Côteaux du Languedoc', ''],
+    ['Villa Tempora « Un temps pour elle » 2014', '9 €'],
+    ['A.O.P Côtes de Provence', ''],
+    ['Château Grand Boise 2017', '9 €'],
+    ['Les Rosés', '12,5 CL'],
+    ['A.O.P Côtes du Rhône', ''],
+    ['Domaine de la Florane « A fleur de Pampre » 2016', '8 €'],
+    ['Famille Coulon (Domaine Beaurenard) Biotifulfox 2017', '8 €'],
+    ['A.O.P Vacqueyras', ''],
+    ['Domaine de Montvac 2017', '9 €'],
+    ['A.O.P Languedoc', ''],
+    ['Domaine de Joncas « Nébla » 2015', '8 €'],
+    ['Villa Tempora « L’arroseur arrosé » 2015', '9 €'],
+    ['A.O.P Côtes de Provence', ''],
+    ['Château Grand Boise « Sainte Victoire » 2017', '9 €'],
+    ['Château Léoube 2016', '10 €'],
+    ['Les Rouges', '12,CL'],
+    ['A.O.P Côtes du Rhône', ''],
+    ['Domaine de Dionysos « La Cigalette »', '8 €'],
+    ['Château Saint Estève d’Uchaux « Grande Réserve » 2014', '9 €'],
+    ['Domaine de la Guicharde « Cuvée Massillan » 2016', '9 €'],
+    ['Domaine de la Florane « Terre Pourpre » 2014', '10 €'],
+    ['L’Oratoire St Martin « Réserve des Seigneurs » 2015', '11 €'],
+    ['A.O.P Saint Joseph', ''],
+    ['Domaine Monier Perréol « Châtelet » 2015', '13 €'],
+    ['A.O.P Châteauneuf du Pape', ''],
+    ['Domaine de Beaurenard 2011', '15 €'],
+    ['A.O.P Cornas', ''],
+    ['Domaine Lionnet « Terre Brûlée » 2012', '15 €']
+]
--- a/tests/files/detect_vertical_false.pdf
+++ b/tests/files/detect_vertical_false.pdf
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -102,6 +102,15 @@ def test_stream_flag_size():
    assert df.equals(tables[0].df)


+def test_stream_layout_kwargs():
+    df = pd.DataFrame(data_stream_layout_kwargs)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="stream", layout_kwargs={"detect_vertical": False})
+    assert df.equals(tables[0].df)
+
+
 def test_lattice():
    df = pd.DataFrame(data_lattice)