Merge de75c05fc0 into 83f816f104
commit
39fec76cd1
|
|
@ -34,7 +34,7 @@ class PDFHandler(object):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, filepath, pages="1", password=None):
|
||||
def __init__(self, filepath, pages="1", password=None, multi=[]):
|
||||
if is_url(filepath):
|
||||
filepath = download_url(filepath)
|
||||
self.filepath = filepath
|
||||
|
|
@ -48,6 +48,7 @@ class PDFHandler(object):
|
|||
if sys.version_info[0] < 3:
|
||||
self.password = self.password.encode("ascii")
|
||||
self.pages = self._get_pages(self.filepath, pages)
|
||||
self.multi = multi
|
||||
|
||||
def _get_pages(self, filepath, pages):
|
||||
"""Converts pages string to list of ints.
|
||||
|
|
@ -168,7 +169,16 @@ class PDFHandler(object):
|
|||
]
|
||||
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
|
||||
for p in pages:
|
||||
t = parser.extract_tables(
|
||||
p_no = p[-5]
|
||||
|
||||
page_kwargs = kwargs
|
||||
page_parser = parser
|
||||
|
||||
if p_no in self.multi:
|
||||
page_kwargs.update(self.multi[p_no])
|
||||
page_parser = Lattice(**page_kwargs) if flavor == 'lattice' else Stream(**page_kwargs)
|
||||
|
||||
t = page_parser.extract_tables(
|
||||
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
|
||||
)
|
||||
tables.extend(t)
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ def read_pdf(
|
|||
flavor="lattice",
|
||||
suppress_stdout=False,
|
||||
layout_kwargs={},
|
||||
multi = {},
|
||||
**kwargs
|
||||
):
|
||||
"""Read PDF and return extracted tables.
|
||||
|
|
@ -36,6 +37,9 @@ def read_pdf(
|
|||
Print all logs and warnings.
|
||||
layout_kwargs : dict, optional (default: {})
|
||||
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
|
||||
multi : dict, optional (default: {})
|
||||
A dict of page-specific parameters, mapping a page number (str) to a dict of parameters that apply only to that page.
|
||||
Parameters defined in multi override the matching kwargs for that page.
|
||||
table_areas : list, optional (default: None)
|
||||
List of table area strings of the form x1,y1,x2,y2
|
||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||
|
|
@ -108,7 +112,7 @@ def read_pdf(
|
|||
warnings.simplefilter("ignore")
|
||||
|
||||
validate_input(kwargs, flavor=flavor)
|
||||
p = PDFHandler(filepath, pages=pages, password=password)
|
||||
p = PDFHandler(filepath, pages=pages, password=password, multi=multi)
|
||||
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||
tables = p.parse(
|
||||
flavor=flavor,
|
||||
|
|
|
|||
|
|
@ -2858,3 +2858,20 @@ data_stream_layout_kwargs = [
|
|||
["A.O.P Cornas", ""],
|
||||
["Domaine Lionnet « Terre Brûlée » 2012", "15 €"],
|
||||
]
|
||||
|
||||
data_multi_params1 = [
|
||||
["Number of Coils", "Number of Paperclips"],
|
||||
["5", "3, 5, 4"],
|
||||
["10", "7, 8, 6"],
|
||||
["15", "11, 10, 12"],
|
||||
["20", "15, 13, 14"]
|
||||
]
|
||||
|
||||
data_multi_params2 = [
|
||||
["Time (drops of water)", "Distance (cm)"],
|
||||
["1", "10,11,9"],
|
||||
["2", "29, 31, 30"],
|
||||
["3", "59, 58, 61"],
|
||||
["4", "102, 100, 98"],
|
||||
["5", "122, 125, 127"]
|
||||
]
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -298,3 +298,16 @@ def test_table_order():
|
|||
(1, 2),
|
||||
(1, 1),
|
||||
]
|
||||
|
||||
def test_multi_params():
|
||||
df1 = pd.DataFrame(data_multi_params1)
|
||||
df2 = pd.DataFrame(data_multi_params2)
|
||||
|
||||
filename = os.path.join(
|
||||
testdir, "multi_params.pdf"
|
||||
)
|
||||
tables = camelot.read_pdf(filename, pages="all", multi={'2': {"table_regions": ["120, 210, 400, 90"]}},
|
||||
split_text=True)
|
||||
|
||||
assert df1.equals(tables[0].df)
|
||||
assert df2.equals(tables[1].df)
|
||||
|
|
|
|||
Loading…
Reference in New Issue