Add extra_kwargs
parent
d918293fea
commit
ca6cefa362
|
|
@ -125,7 +125,7 @@ class PDFHandler(object):
|
||||||
with open(fpath, 'wb') as f:
|
with open(fpath, 'wb') as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
|
|
||||||
def parse(self, flavor='lattice', suppress_stdout=False, **kwargs):
|
def parse(self, flavor='lattice', suppress_stdout=False, extra_kwargs={}, **kwargs):
|
||||||
"""Extracts tables by calling parser.get_tables on all single
|
"""Extracts tables by calling parser.get_tables on all single
|
||||||
page PDFs.
|
page PDFs.
|
||||||
|
|
||||||
|
|
@ -136,6 +136,8 @@ class PDFHandler(object):
|
||||||
Lattice is used by default.
|
Lattice is used by default.
|
||||||
suppress_stdout : bool (default: False)
|
suppress_stdout : bool (default: False)
|
||||||
Suppress logs and warnings.
|
Suppress logs and warnings.
|
||||||
|
extra_kwargs : dict, optional (default: {})
|
||||||
|
A dict of pdfminer.layout.LAParams kwargs.
|
||||||
kwargs : dict
|
kwargs : dict
|
||||||
See camelot.read_pdf kwargs.
|
See camelot.read_pdf kwargs.
|
||||||
|
|
||||||
|
|
@ -153,6 +155,7 @@ class PDFHandler(object):
|
||||||
for p in self.pages]
|
for p in self.pages]
|
||||||
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
||||||
for p in pages:
|
for p in pages:
|
||||||
t = parser.extract_tables(p, suppress_stdout=suppress_stdout)
|
t = parser.extract_tables(p, suppress_stdout=suppress_stdout
|
||||||
|
extra_kwargs=extra_kwargs)
|
||||||
tables.extend(t)
|
tables.extend(t)
|
||||||
return TableList(tables)
|
return TableList(tables)
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@ from .utils import validate_input, remove_extra
|
||||||
|
|
||||||
|
|
||||||
def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
suppress_stdout=False, **kwargs):
|
suppress_stdout=False, extra_kwargs={}, **kwargs):
|
||||||
"""Read PDF and return extracted tables.
|
"""Read PDF and return extracted tables.
|
||||||
|
|
||||||
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
||||||
|
|
@ -26,6 +26,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
Lattice is used by default.
|
Lattice is used by default.
|
||||||
suppress_stdout : bool, optional (default: False)
|
suppress_stdout : bool, optional (default: False)
|
||||||
Suppress logs and warnings.
|
Suppress logs and warnings.
|
||||||
|
extra_kwargs : dict, optional (default: {})
|
||||||
|
A dict of pdfminer.layout.LAParams kwargs.
|
||||||
table_areas : list, optional (default: None)
|
table_areas : list, optional (default: None)
|
||||||
List of table area strings of the form x1,y1,x2,y2
|
List of table area strings of the form x1,y1,x2,y2
|
||||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
|
|
@ -98,5 +100,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
validate_input(kwargs, flavor=flavor)
|
validate_input(kwargs, flavor=flavor)
|
||||||
p = PDFHandler(filepath, pages=pages, password=password)
|
p = PDFHandler(filepath, pages=pages, password=password)
|
||||||
kwargs = remove_extra(kwargs, flavor=flavor)
|
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||||
tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, **kwargs)
|
tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
|
||||||
|
extra_kwargs=extra_kwargs, **kwargs)
|
||||||
return tables
|
return tables
|
||||||
|
|
|
||||||
|
|
@ -8,13 +8,10 @@ from ..utils import get_page_layout, get_text_objects
|
||||||
class BaseParser(object):
|
class BaseParser(object):
|
||||||
"""Defines a base parser.
|
"""Defines a base parser.
|
||||||
"""
|
"""
|
||||||
def _generate_layout(self, filename):
|
def _generate_layout(self, filename, extra_kwargs):
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
self.layout, self.dimensions = get_page_layout(
|
self.layout, self.dimensions = get_page_layout(
|
||||||
self.filename,
|
self.filename, **self.extra_kwargs)
|
||||||
char_margin=self.char_margin,
|
|
||||||
line_margin=self.line_margin,
|
|
||||||
word_margin=self.word_margin)
|
|
||||||
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
|
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
|
||||||
self.vertical_text = get_text_objects(self.layout, ltype="lv")
|
self.vertical_text = get_text_objects(self.layout, ltype="lv")
|
||||||
self.pdf_width, self.pdf_height = self.dimensions
|
self.pdf_width, self.pdf_height = self.dimensions
|
||||||
|
|
|
||||||
|
|
@ -348,8 +348,8 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename, suppress_stdout=False):
|
def extract_tables(self, filename, suppress_stdout=False, extra_kwargs={}):
|
||||||
self._generate_layout(filename)
|
self._generate_layout(filename, extra_kwargs)
|
||||||
if not suppress_stdout:
|
if not suppress_stdout:
|
||||||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -388,8 +388,8 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename, suppress_stdout=False):
|
def extract_tables(self, filename, suppress_stdout=False, extra_kwargs={}):
|
||||||
self._generate_layout(filename)
|
self._generate_layout(filename, extra_kwargs)
|
||||||
if not suppress_stdout:
|
if not suppress_stdout:
|
||||||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -558,7 +558,7 @@ def compute_whitespace(d):
|
||||||
|
|
||||||
|
|
||||||
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||||
detect_vertical=True, all_texts=True):
|
detect_vertical=True, all_texts=True):
|
||||||
"""Returns a PDFMiner LTPage object and page dimension of a single
|
"""Returns a PDFMiner LTPage object and page dimension of a single
|
||||||
page pdf. See https://euske.github.io/pdfminer/ to get definitions
|
page pdf. See https://euske.github.io/pdfminer/ to get definitions
|
||||||
of kwargs.
|
of kwargs.
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue