Added multi parameter for page level parameters
parent
7ecfcad239
commit
b167a8c7dd
|
|
@ -34,7 +34,7 @@ class PDFHandler(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, filepath, pages="1", password=None):
|
def __init__(self, filepath, pages="1", password=None, multi=[]):
|
||||||
if is_url(filepath):
|
if is_url(filepath):
|
||||||
filepath = download_url(filepath)
|
filepath = download_url(filepath)
|
||||||
self.filepath = filepath
|
self.filepath = filepath
|
||||||
|
|
@ -48,6 +48,7 @@ class PDFHandler(object):
|
||||||
if sys.version_info[0] < 3:
|
if sys.version_info[0] < 3:
|
||||||
self.password = self.password.encode("ascii")
|
self.password = self.password.encode("ascii")
|
||||||
self.pages = self._get_pages(self.filepath, pages)
|
self.pages = self._get_pages(self.filepath, pages)
|
||||||
|
self.multi = multi
|
||||||
|
|
||||||
def _get_pages(self, filepath, pages):
|
def _get_pages(self, filepath, pages):
|
||||||
"""Converts pages string to list of ints.
|
"""Converts pages string to list of ints.
|
||||||
|
|
@ -168,7 +169,16 @@ class PDFHandler(object):
|
||||||
]
|
]
|
||||||
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
|
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
|
||||||
for p in pages:
|
for p in pages:
|
||||||
t = parser.extract_tables(
|
p_no = p[-5]
|
||||||
|
|
||||||
|
page_kwargs = kwargs
|
||||||
|
page_parser = parser
|
||||||
|
|
||||||
|
if p_no in self.multi:
|
||||||
|
page_kwargs.update(self.multi[p_no])
|
||||||
|
page_parser = Lattice(**page_kwargs) if flavor == 'lattice' else Stream(**page_kwargs)
|
||||||
|
|
||||||
|
t = page_parser.extract_tables(
|
||||||
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
|
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
|
||||||
)
|
)
|
||||||
tables.extend(t)
|
tables.extend(t)
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ def read_pdf(
|
||||||
flavor="lattice",
|
flavor="lattice",
|
||||||
suppress_stdout=False,
|
suppress_stdout=False,
|
||||||
layout_kwargs={},
|
layout_kwargs={},
|
||||||
|
multi = {},
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
"""Read PDF and return extracted tables.
|
"""Read PDF and return extracted tables.
|
||||||
|
|
@ -36,6 +37,9 @@ def read_pdf(
|
||||||
Print all logs and warnings.
|
Print all logs and warnings.
|
||||||
layout_kwargs : dict, optional (default: {})
|
layout_kwargs : dict, optional (default: {})
|
||||||
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
|
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
|
||||||
|
multi : dict, optional (default: {})
|
||||||
|
A dict of page-specific parameters. Each key is a page number (str) and each value is a dict of the parameters to use for that page.
|
||||||
|
Parameters defined in `multi` override the corresponding kwargs for that page.
|
||||||
table_areas : list, optional (default: None)
|
table_areas : list, optional (default: None)
|
||||||
List of table area strings of the form x1,y1,x2,y2
|
List of table area strings of the form x1,y1,x2,y2
|
||||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
|
|
@ -108,7 +112,7 @@ def read_pdf(
|
||||||
warnings.simplefilter("ignore")
|
warnings.simplefilter("ignore")
|
||||||
|
|
||||||
validate_input(kwargs, flavor=flavor)
|
validate_input(kwargs, flavor=flavor)
|
||||||
p = PDFHandler(filepath, pages=pages, password=password)
|
p = PDFHandler(filepath, pages=pages, password=password, multi=multi)
|
||||||
kwargs = remove_extra(kwargs, flavor=flavor)
|
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||||
tables = p.parse(
|
tables = p.parse(
|
||||||
flavor=flavor,
|
flavor=flavor,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue