Update advanced docs

pull/2/head
Vinayak Mehta 2018-12-19 18:19:39 +05:30
parent 6301fee523
commit 48b2dce633
7 changed files with 34 additions and 32 deletions

View File

@ -4,6 +4,12 @@ Release History
master
------
**Improvements**
* [#170](https://github.com/socialcopsdev/camelot/issues/170) Add option to pass pdfminer layout kwargs. [#232](https://github.com/socialcopsdev/camelot/pull/232) by Vinayak Mehta.
* Keyword arguments for [pdfminer.layout.LAParams](https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33) can now be passed using `layout_kwargs` in `read_pdf()`.
* The `margins` keyword argument in `read_pdf()` is now deprecated; pass `char_margin`, `line_margin` and `word_margin` using `layout_kwargs` instead.
0.5.0 (2018-12-13)
------------------

View File

@ -125,7 +125,7 @@ class PDFHandler(object):
with open(fpath, 'wb') as f:
outfile.write(f)
def parse(self, flavor='lattice', suppress_stdout=False, extra_kwargs={}, **kwargs):
def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs):
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
@ -136,8 +136,8 @@ class PDFHandler(object):
Lattice is used by default.
suppress_stdout : bool, optional (default: False)
Suppress logs and warnings.
extra_kwargs : dict, optional (default: {})
A dict of pdfminer.layout.LAParams kwargs.
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
kwargs : dict
See camelot.read_pdf kwargs.
@ -156,6 +156,6 @@ class PDFHandler(object):
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
for p in pages:
t = parser.extract_tables(p, suppress_stdout=suppress_stdout,
extra_kwargs=extra_kwargs)
layout_kwargs=layout_kwargs)
tables.extend(t)
return TableList(tables)

View File

@ -6,7 +6,7 @@ from .utils import validate_input, remove_extra
def read_pdf(filepath, pages='1', password=None, flavor='lattice',
suppress_stdout=False, extra_kwargs={}, **kwargs):
suppress_stdout=False, layout_kwargs={}, **kwargs):
"""Read PDF and return extracted tables.
Note: kwargs annotated with ^ can only be used with flavor='stream'
@ -26,8 +26,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
Lattice is used by default.
suppress_stdout : bool, optional (default: False)
Suppress logs and warnings.
extra_kwargs : dict, optional (default: {})
A dict of pdfminer.layout.LAParams kwargs.
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@ -79,10 +79,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
Number of times erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple
PDFMiner char_margin, line_margin and word_margin.
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
Returns
-------
@ -101,5 +97,5 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
p = PDFHandler(filepath, pages=pages, password=password)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
extra_kwargs=extra_kwargs, **kwargs)
layout_kwargs=layout_kwargs, **kwargs)
return tables

View File

@ -8,11 +8,11 @@ from ..utils import get_page_layout, get_text_objects
class BaseParser(object):
"""Defines a base parser.
"""
def _generate_layout(self, filename, extra_kwargs):
def _generate_layout(self, filename, layout_kwargs):
self.filename = filename
self.extra_kwargs = extra_kwargs
self.layout_kwargs = layout_kwargs
self.layout, self.dimensions = get_page_layout(
filename, **extra_kwargs)
filename, **layout_kwargs)
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
self.vertical_text = get_text_objects(self.layout, ltype="lv")
self.pdf_width, self.pdf_height = self.dimensions

View File

@ -70,17 +70,13 @@ class Lattice(BaseParser):
Number of times erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
margins : tuple
PDFMiner char_margin, line_margin and word_margin.
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
"""
def __init__(self, table_areas=None, process_background=False,
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
split_text=False, flag_size=False, line_close_tol=2,
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
iterations=0, margins=(1.0, 0.5, 0.1), **kwargs):
iterations=0, **kwargs):
self.table_areas = table_areas
self.process_background = process_background
self.line_size_scaling = line_size_scaling
@ -93,7 +89,6 @@ class Lattice(BaseParser):
self.threshold_blocksize = threshold_blocksize
self.threshold_constant = threshold_constant
self.iterations = iterations
self.char_margin, self.line_margin, self.word_margin = margins
@staticmethod
def _reduce_index(t, idx, shift_text):
@ -348,8 +343,8 @@ class Lattice(BaseParser):
return table
def extract_tables(self, filename, suppress_stdout=False, extra_kwargs={}):
self._generate_layout(filename, extra_kwargs)
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
self._generate_layout(filename, layout_kwargs)
if not suppress_stdout:
logger.info('Processing {}'.format(os.path.basename(self.rootname)))

View File

@ -44,15 +44,10 @@ class Stream(BaseParser):
col_close_tol : int, optional (default: 0)
Tolerance parameter used to combine text horizontally,
to generate columns.
margins : tuple, optional (default: (1.0, 0.5, 0.1))
PDFMiner char_margin, line_margin and word_margin.
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
"""
def __init__(self, table_areas=None, columns=None, split_text=False,
flag_size=False, row_close_tol=2, col_close_tol=0,
margins=(1.0, 0.5, 0.1), **kwargs):
flag_size=False, row_close_tol=2, col_close_tol=0, **kwargs):
self.table_areas = table_areas
self.columns = columns
self._validate_columns()
@ -60,7 +55,6 @@ class Stream(BaseParser):
self.flag_size = flag_size
self.row_close_tol = row_close_tol
self.col_close_tol = col_close_tol
self.char_margin, self.line_margin, self.word_margin = margins
@staticmethod
def _text_bbox(t_bbox):
@ -388,8 +382,8 @@ class Stream(BaseParser):
return table
def extract_tables(self, filename, suppress_stdout=False, extra_kwargs={}):
self._generate_layout(filename, extra_kwargs)
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
self._generate_layout(filename, layout_kwargs)
if not suppress_stdout:
logger.info('Processing {}'.format(os.path.basename(self.rootname)))

View File

@ -524,3 +524,14 @@ We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in
"4","West Bengal","West Medinipur","iv. Acute Diarrhoeal Disease","145","0","04/01/14","05/01/14","Under control","..."
"4","West Bengal","Birbhum","v. Food Poisoning","199","0","31/12/13","31/12/13","Under control","..."
"4","West Bengal","Howrah","vi. Viral Hepatitis A &E","85","0","26/12/13","27/12/13","Under surveillance","..."
Tweak layout generation
-----------------------
Camelot is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences. In some cases (such as `#170 <https://github.com/socialcopsdev/camelot/issues/170>`_ and `#215 <https://github.com/socialcopsdev/camelot/issues/215>`_), PDFMiner can group characters that should belong to the same sentence into separate sentences.
To deal with such cases, you can tweak PDFMiner's `LAParams kwargs <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ to improve layout generation by passing the keyword arguments as a dict using ``layout_kwargs`` in :meth:`read_pdf() <camelot.read_pdf>`. To learn more about the parameters you can tweak, check out the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
::
>>> tables = camelot.read_pdf('foo.pdf', layout_kwargs={'detect_vertical': True})