Merge pull request #232 from socialcopsdev/pdfminer_kwargs
[MRG] Add option to pass pdfminer kwargs
pull/2/head
commit
e89e147b5c
|
|
@ -4,6 +4,12 @@ Release History
|
||||||
master
|
master
|
||||||
------
|
------
|
||||||
|
|
||||||
|
**Improvements**
|
||||||
|
|
||||||
|
* [#170](https://github.com/socialcopsdev/camelot/issues/170) Add option to pass pdfminer layout kwargs. [#232](https://github.com/socialcopsdev/camelot/pull/232) by Vinayak Mehta.
|
||||||
|
* Keyword arguments for [pdfminer.layout.LAParams](https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33) can now be passed using `layout_kwargs` in `read_pdf()`.
|
||||||
|
* The `margins` keyword argument in `read_pdf()` is now deprecated.
|
||||||
|
|
||||||
0.5.0 (2018-12-13)
|
0.5.0 (2018-12-13)
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -125,7 +125,7 @@ class PDFHandler(object):
|
||||||
with open(fpath, 'wb') as f:
|
with open(fpath, 'wb') as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
|
|
||||||
def parse(self, flavor='lattice', suppress_stdout=False, **kwargs):
|
def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs):
|
||||||
"""Extracts tables by calling parser.get_tables on all single
|
"""Extracts tables by calling parser.get_tables on all single
|
||||||
page PDFs.
|
page PDFs.
|
||||||
|
|
||||||
|
|
@ -136,6 +136,8 @@ class PDFHandler(object):
|
||||||
Lattice is used by default.
|
Lattice is used by default.
|
||||||
suppress_stdout : bool (default: False)
|
suppress_stdout : bool (default: False)
|
||||||
Suppress logs and warnings.
|
Suppress logs and warnings.
|
||||||
|
layout_kwargs : dict, optional (default: {})
|
||||||
|
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
|
||||||
kwargs : dict
|
kwargs : dict
|
||||||
See camelot.read_pdf kwargs.
|
See camelot.read_pdf kwargs.
|
||||||
|
|
||||||
|
|
@ -153,6 +155,7 @@ class PDFHandler(object):
|
||||||
for p in self.pages]
|
for p in self.pages]
|
||||||
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
||||||
for p in pages:
|
for p in pages:
|
||||||
t = parser.extract_tables(p, suppress_stdout=suppress_stdout)
|
t = parser.extract_tables(p, suppress_stdout=suppress_stdout,
|
||||||
|
layout_kwargs=layout_kwargs)
|
||||||
tables.extend(t)
|
tables.extend(t)
|
||||||
return TableList(tables)
|
return TableList(tables)
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@ from .utils import validate_input, remove_extra
|
||||||
|
|
||||||
|
|
||||||
def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
suppress_stdout=False, **kwargs):
|
suppress_stdout=False, layout_kwargs={}, **kwargs):
|
||||||
"""Read PDF and return extracted tables.
|
"""Read PDF and return extracted tables.
|
||||||
|
|
||||||
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
||||||
|
|
@ -26,6 +26,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
Lattice is used by default.
|
Lattice is used by default.
|
||||||
suppress_stdout : bool, optional (default: False)
|
suppress_stdout : bool, optional (default: False)
|
||||||
Suppress logs and warnings.
|
Suppress logs and warnings.
|
||||||
|
layout_kwargs : dict, optional (default: {})
|
||||||
|
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
|
||||||
table_areas : list, optional (default: None)
|
table_areas : list, optional (default: None)
|
||||||
List of table area strings of the form x1,y1,x2,y2
|
List of table area strings of the form x1,y1,x2,y2
|
||||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
|
|
@ -77,10 +79,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
Number of times for erosion/dilation is applied.
|
Number of times for erosion/dilation is applied.
|
||||||
|
|
||||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||||
margins : tuple
|
|
||||||
PDFMiner char_margin, line_margin and word_margin.
|
|
||||||
|
|
||||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
@ -98,5 +96,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
validate_input(kwargs, flavor=flavor)
|
validate_input(kwargs, flavor=flavor)
|
||||||
p = PDFHandler(filepath, pages=pages, password=password)
|
p = PDFHandler(filepath, pages=pages, password=password)
|
||||||
kwargs = remove_extra(kwargs, flavor=flavor)
|
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||||
tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, **kwargs)
|
tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
|
||||||
|
layout_kwargs=layout_kwargs, **kwargs)
|
||||||
return tables
|
return tables
|
||||||
|
|
|
||||||
|
|
@ -8,13 +8,11 @@ from ..utils import get_page_layout, get_text_objects
|
||||||
class BaseParser(object):
|
class BaseParser(object):
|
||||||
"""Defines a base parser.
|
"""Defines a base parser.
|
||||||
"""
|
"""
|
||||||
def _generate_layout(self, filename):
|
def _generate_layout(self, filename, layout_kwargs):
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
|
self.layout_kwargs = layout_kwargs
|
||||||
self.layout, self.dimensions = get_page_layout(
|
self.layout, self.dimensions = get_page_layout(
|
||||||
self.filename,
|
filename, **layout_kwargs)
|
||||||
char_margin=self.char_margin,
|
|
||||||
line_margin=self.line_margin,
|
|
||||||
word_margin=self.word_margin)
|
|
||||||
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
|
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
|
||||||
self.vertical_text = get_text_objects(self.layout, ltype="lv")
|
self.vertical_text = get_text_objects(self.layout, ltype="lv")
|
||||||
self.pdf_width, self.pdf_height = self.dimensions
|
self.pdf_width, self.pdf_height = self.dimensions
|
||||||
|
|
|
||||||
|
|
@ -70,17 +70,13 @@ class Lattice(BaseParser):
|
||||||
Number of times for erosion/dilation is applied.
|
Number of times for erosion/dilation is applied.
|
||||||
|
|
||||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||||
margins : tuple
|
|
||||||
PDFMiner char_margin, line_margin and word_margin.
|
|
||||||
|
|
||||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_areas=None, process_background=False,
|
def __init__(self, table_areas=None, process_background=False,
|
||||||
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
||||||
split_text=False, flag_size=False, line_close_tol=2,
|
split_text=False, flag_size=False, line_close_tol=2,
|
||||||
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
||||||
iterations=0, margins=(1.0, 0.5, 0.1), **kwargs):
|
iterations=0, **kwargs):
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
self.process_background = process_background
|
self.process_background = process_background
|
||||||
self.line_size_scaling = line_size_scaling
|
self.line_size_scaling = line_size_scaling
|
||||||
|
|
@ -93,7 +89,6 @@ class Lattice(BaseParser):
|
||||||
self.threshold_blocksize = threshold_blocksize
|
self.threshold_blocksize = threshold_blocksize
|
||||||
self.threshold_constant = threshold_constant
|
self.threshold_constant = threshold_constant
|
||||||
self.iterations = iterations
|
self.iterations = iterations
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _reduce_index(t, idx, shift_text):
|
def _reduce_index(t, idx, shift_text):
|
||||||
|
|
@ -348,8 +343,8 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename, suppress_stdout=False):
|
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
|
||||||
self._generate_layout(filename)
|
self._generate_layout(filename, layout_kwargs)
|
||||||
if not suppress_stdout:
|
if not suppress_stdout:
|
||||||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -44,15 +44,10 @@ class Stream(BaseParser):
|
||||||
col_close_tol : int, optional (default: 0)
|
col_close_tol : int, optional (default: 0)
|
||||||
Tolerance parameter used to combine text horizontally,
|
Tolerance parameter used to combine text horizontally,
|
||||||
to generate columns.
|
to generate columns.
|
||||||
margins : tuple, optional (default: (1.0, 0.5, 0.1))
|
|
||||||
PDFMiner char_margin, line_margin and word_margin.
|
|
||||||
|
|
||||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_areas=None, columns=None, split_text=False,
|
def __init__(self, table_areas=None, columns=None, split_text=False,
|
||||||
flag_size=False, row_close_tol=2, col_close_tol=0,
|
flag_size=False, row_close_tol=2, col_close_tol=0, **kwargs):
|
||||||
margins=(1.0, 0.5, 0.1), **kwargs):
|
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self._validate_columns()
|
self._validate_columns()
|
||||||
|
|
@ -60,7 +55,6 @@ class Stream(BaseParser):
|
||||||
self.flag_size = flag_size
|
self.flag_size = flag_size
|
||||||
self.row_close_tol = row_close_tol
|
self.row_close_tol = row_close_tol
|
||||||
self.col_close_tol = col_close_tol
|
self.col_close_tol = col_close_tol
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _text_bbox(t_bbox):
|
def _text_bbox(t_bbox):
|
||||||
|
|
@ -388,8 +382,8 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename, suppress_stdout=False):
|
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
|
||||||
self._generate_layout(filename)
|
self._generate_layout(filename, layout_kwargs)
|
||||||
if not suppress_stdout:
|
if not suppress_stdout:
|
||||||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -524,3 +524,14 @@ We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in
|
||||||
"4","West Bengal","West Medinipur","iv. Acute Diarrhoeal Disease","145","0","04/01/14","05/01/14","Under control","..."
|
"4","West Bengal","West Medinipur","iv. Acute Diarrhoeal Disease","145","0","04/01/14","05/01/14","Under control","..."
|
||||||
"4","West Bengal","Birbhum","v. Food Poisoning","199","0","31/12/13","31/12/13","Under control","..."
|
"4","West Bengal","Birbhum","v. Food Poisoning","199","0","31/12/13","31/12/13","Under control","..."
|
||||||
"4","West Bengal","Howrah","vi. Viral Hepatitis A &E","85","0","26/12/13","27/12/13","Under surveillance","..."
|
"4","West Bengal","Howrah","vi. Viral Hepatitis A &E","85","0","26/12/13","27/12/13","Under surveillance","..."
|
||||||
|
|
||||||
|
Tweak layout generation
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
Camelot is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences. In some cases (such as `#170 <https://github.com/socialcopsdev/camelot/issues/170>`_ and `#215 <https://github.com/socialcopsdev/camelot/issues/215>`_), PDFMiner can group characters that should belong to the same sentence into separate sentences.
|
||||||
|
|
||||||
|
To deal with such cases, you can tweak PDFMiner's `LAParams kwargs <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ to improve layout generation by passing them as a dict via the ``layout_kwargs`` argument of :meth:`read_pdf() <camelot.read_pdf>`. To learn more about the parameters you can tweak, see the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
>>> tables = camelot.read_pdf('foo.pdf', layout_kwargs={'detect_vertical': False})
|
||||||
|
|
|
||||||
|
|
@ -491,3 +491,43 @@ data_arabic = [
|
||||||
['؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ', 'ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ'],
|
['؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ', 'ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ'],
|
||||||
['Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic', '']
|
['Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic', '']
|
||||||
]
|
]
|
||||||
|
|
||||||
|
data_stream_layout_kwargs = [
|
||||||
|
['V i n s a u Ve r r e', ''],
|
||||||
|
['Les Blancs', '12.5CL'],
|
||||||
|
['A.O.P Côtes du Rhône', ''],
|
||||||
|
['Domaine de la Guicharde « Autour de la chapelle » 2016', '8 €'],
|
||||||
|
['A.O.P Vacqueyras', ''],
|
||||||
|
['Domaine de Montvac « Melodine » 2016', '10 €'],
|
||||||
|
['A.O.P Châteauneuf du Pape', ''],
|
||||||
|
['Domaine de Beaurenard 2017', '13 €'],
|
||||||
|
['A.O.P Côteaux du Languedoc', ''],
|
||||||
|
['Villa Tempora « Un temps pour elle » 2014', '9 €'],
|
||||||
|
['A.O.P Côtes de Provence', ''],
|
||||||
|
['Château Grand Boise 2017', '9 €'],
|
||||||
|
['Les Rosés', '12,5 CL'],
|
||||||
|
['A.O.P Côtes du Rhône', ''],
|
||||||
|
['Domaine de la Florane « A fleur de Pampre » 2016', '8 €'],
|
||||||
|
['Famille Coulon (Domaine Beaurenard) Biotifulfox 2017', '8 €'],
|
||||||
|
['A.O.P Vacqueyras', ''],
|
||||||
|
['Domaine de Montvac 2017', '9 €'],
|
||||||
|
['A.O.P Languedoc', ''],
|
||||||
|
['Domaine de Joncas « Nébla » 2015', '8 €'],
|
||||||
|
['Villa Tempora « L’arroseur arrosé » 2015', '9 €'],
|
||||||
|
['A.O.P Côtes de Provence', ''],
|
||||||
|
['Château Grand Boise « Sainte Victoire » 2017', '9 €'],
|
||||||
|
['Château Léoube 2016', '10 €'],
|
||||||
|
['Les Rouges', '12,CL'],
|
||||||
|
['A.O.P Côtes du Rhône', ''],
|
||||||
|
['Domaine de Dionysos « La Cigalette »', '8 €'],
|
||||||
|
['Château Saint Estève d’Uchaux « Grande Réserve » 2014', '9 €'],
|
||||||
|
['Domaine de la Guicharde « Cuvée Massillan » 2016', '9 €'],
|
||||||
|
['Domaine de la Florane « Terre Pourpre » 2014', '10 €'],
|
||||||
|
['L’Oratoire St Martin « Réserve des Seigneurs » 2015', '11 €'],
|
||||||
|
['A.O.P Saint Joseph', ''],
|
||||||
|
['Domaine Monier Perréol « Châtelet » 2015', '13 €'],
|
||||||
|
['A.O.P Châteauneuf du Pape', ''],
|
||||||
|
['Domaine de Beaurenard 2011', '15 €'],
|
||||||
|
['A.O.P Cornas', ''],
|
||||||
|
['Domaine Lionnet « Terre Brûlée » 2012', '15 €']
|
||||||
|
]
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -102,6 +102,15 @@ def test_stream_flag_size():
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_layout_kwargs():
|
||||||
|
df = pd.DataFrame(data_stream_layout_kwargs)
|
||||||
|
|
||||||
|
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
||||||
|
tables = camelot.read_pdf(
|
||||||
|
filename, flavor="stream", layout_kwargs={"detect_vertical": False})
|
||||||
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice():
|
def test_lattice():
|
||||||
df = pd.DataFrame(data_lattice)
|
df = pd.DataFrame(data_lattice)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue