Add flavors
parent
4a30c5a514
commit
3170a9689f
|
|
@ -1,4 +1,3 @@
|
||||||
from .__version__ import __version__
|
from .__version__ import __version__
|
||||||
|
|
||||||
from .io import read_pdf
|
from .io import read_pdf
|
||||||
from .plotting import plot_geometry
|
|
||||||
103
camelot/cli.py
103
camelot/cli.py
|
|
@ -5,18 +5,9 @@ import click
|
||||||
|
|
||||||
from . import __version__
|
from . import __version__
|
||||||
from .io import read_pdf
|
from .io import read_pdf
|
||||||
from .plotting import plot_geometry
|
|
||||||
from .utils import validate_input, remove_extra
|
from .utils import validate_input, remove_extra
|
||||||
|
|
||||||
|
|
||||||
class Mutex(click.Option):
|
|
||||||
def handle_parse_result(self, ctx, opts, args):
|
|
||||||
mesh = opts.get('mesh', False)
|
|
||||||
geometry_type = opts.get('geometry_type', False)
|
|
||||||
validate_input(opts, mesh=mesh, geometry_type=geometry_type)
|
|
||||||
return super(Mutex, self).handle_parse_result(ctx, opts, args)
|
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.version_option(version=__version__)
|
@click.version_option(version=__version__)
|
||||||
@click.option("-p", "--pages", default="1", help="Comma-separated page numbers"
|
@click.option("-p", "--pages", default="1", help="Comma-separated page numbers"
|
||||||
|
|
@ -27,8 +18,6 @@ class Mutex(click.Option):
|
||||||
help="Output file format.")
|
help="Output file format.")
|
||||||
@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP"
|
@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP"
|
||||||
" archive.")
|
" archive.")
|
||||||
@click.option("-m", "--mesh", is_flag=True, help="Whether or not to"
|
|
||||||
" use Lattice method of parsing. Stream is used by default.")
|
|
||||||
@click.option("-T", "--table_area", default=[], multiple=True,
|
@click.option("-T", "--table_area", default=[], multiple=True,
|
||||||
help="Table areas (x1,y1,x2,y2) to process.\n"
|
help="Table areas (x1,y1,x2,y2) to process.\n"
|
||||||
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
||||||
|
|
@ -39,12 +28,44 @@ class Mutex(click.Option):
|
||||||
" super/subscripts)")
|
" super/subscripts)")
|
||||||
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
|
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
|
||||||
help="char_margin, line_margin, word_margin for PDFMiner.")
|
help="char_margin, line_margin, word_margin for PDFMiner.")
|
||||||
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
|
@click.option("-G", "--geometry_type",
|
||||||
help="x-coordinates of column separators.")
|
type=click.Choice(["text", "table", "contour", "joint", "line"]),
|
||||||
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be"
|
help="Plot geometry found on pdf page for debugging.\n\n"
|
||||||
" formed by combining text vertically within this tolerance.")
|
"text: Plot text objects. (Useful to get table_area and"
|
||||||
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will"
|
" columns coordinates)\ntable: Plot parsed table.\n"
|
||||||
" be formed by combining text horizontally within this tolerance.")
|
"contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line"
|
||||||
|
" intersections.\nline (with --mesh): Plot detected lines.")
|
||||||
|
@click.argument("filepath", type=click.Path(exists=True))
|
||||||
|
def cli(*args, **kwargs):
|
||||||
|
pages = kwargs.pop("pages")
|
||||||
|
output = kwargs.pop("output")
|
||||||
|
f = kwargs.pop("format")
|
||||||
|
compress = kwargs.pop("zip")
|
||||||
|
mesh = kwargs.pop("mesh")
|
||||||
|
geometry_type = kwargs.pop("geometry_type")
|
||||||
|
filepath = kwargs.pop("filepath")
|
||||||
|
|
||||||
|
table_area = list(kwargs['table_area'])
|
||||||
|
kwargs['table_area'] = None if not table_area else table_area
|
||||||
|
columns = list(kwargs['columns'])
|
||||||
|
kwargs['columns'] = None if not columns else columns
|
||||||
|
copy_text = list(kwargs['copy_text'])
|
||||||
|
kwargs['copy_text'] = None if not copy_text else copy_text
|
||||||
|
kwargs['shift_text'] = list(kwargs['shift_text'])
|
||||||
|
|
||||||
|
kwargs = remove_extra(kwargs, mesh=mesh)
|
||||||
|
tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs)
|
||||||
|
click.echo(tables)
|
||||||
|
if output is None:
|
||||||
|
raise click.UsageError("Please specify an output filepath using --output")
|
||||||
|
if f is None:
|
||||||
|
raise click.UsageError("Please specify an output format using --format")
|
||||||
|
tables.export(output, f=f, compress=compress)
|
||||||
|
|
||||||
|
|
||||||
|
@click.option("-T", "--table_area", default=[], multiple=True,
|
||||||
|
help="Table areas (x1,y1,x2,y2) to process.\n"
|
||||||
|
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
||||||
@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
|
@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
|
||||||
help="(with --mesh) Whether or not to process lines that are in"
|
help="(with --mesh) Whether or not to process lines that are in"
|
||||||
" background.")
|
" background.")
|
||||||
|
|
@ -75,40 +96,18 @@ class Mutex(click.Option):
|
||||||
@click.option("-I", "--iterations", default=0, cls=Mutex,
|
@click.option("-I", "--iterations", default=0, cls=Mutex,
|
||||||
help="(with --mesh) Number of times for erosion/dilation is"
|
help="(with --mesh) Number of times for erosion/dilation is"
|
||||||
" applied.")
|
" applied.")
|
||||||
@click.option("-G", "--geometry_type",
|
def lattice(*args, **kwargs):
|
||||||
type=click.Choice(["text", "table", "contour", "joint", "line"]),
|
pass
|
||||||
help="Plot geometry found on pdf page for debugging.\n\n"
|
|
||||||
"text: Plot text objects. (Useful to get table_area and"
|
|
||||||
" columns coordinates)\ntable: Plot parsed table.\n"
|
|
||||||
"contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line"
|
|
||||||
" intersections.\nline (with --mesh): Plot detected lines.")
|
|
||||||
@click.argument("filepath", type=click.Path(exists=True))
|
|
||||||
def cli(*args, **kwargs):
|
|
||||||
pages = kwargs.pop("pages")
|
|
||||||
output = kwargs.pop("output")
|
|
||||||
f = kwargs.pop("format")
|
|
||||||
compress = kwargs.pop("zip")
|
|
||||||
mesh = kwargs.pop("mesh")
|
|
||||||
geometry_type = kwargs.pop("geometry_type")
|
|
||||||
filepath = kwargs.pop("filepath")
|
|
||||||
|
|
||||||
table_area = list(kwargs['table_area'])
|
|
||||||
kwargs['table_area'] = None if not table_area else table_area
|
|
||||||
columns = list(kwargs['columns'])
|
|
||||||
kwargs['columns'] = None if not columns else columns
|
|
||||||
copy_text = list(kwargs['copy_text'])
|
|
||||||
kwargs['copy_text'] = None if not copy_text else copy_text
|
|
||||||
kwargs['shift_text'] = list(kwargs['shift_text'])
|
|
||||||
|
|
||||||
kwargs = remove_extra(kwargs, mesh=mesh)
|
@click.option("-T", "--table_area", default=[], multiple=True,
|
||||||
if geometry_type is None:
|
help="Table areas (x1,y1,x2,y2) to process.\n"
|
||||||
tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs)
|
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
||||||
click.echo(tables)
|
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
|
||||||
if output is None:
|
help="x-coordinates of column separators.")
|
||||||
raise click.UsageError("Please specify an output filepath using --output")
|
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be"
|
||||||
if f is None:
|
" formed by combining text vertically within this tolerance.")
|
||||||
raise click.UsageError("Please specify an output format using --format")
|
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will"
|
||||||
tables.export(output, f=f, compress=compress)
|
" be formed by combining text horizontally within this tolerance.")
|
||||||
else:
|
def stream(*args, **kwargs):
|
||||||
plot_geometry(filepath, pages=pages, mesh=mesh,
|
pass
|
||||||
geometry_type=geometry_type, **kwargs)
|
|
||||||
|
|
@ -6,6 +6,8 @@ import tempfile
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from .plotting import *
|
||||||
|
|
||||||
|
|
||||||
class Cell(object):
|
class Cell(object):
|
||||||
"""Defines a cell in a table with coordinates relative to a
|
"""Defines a cell in a table with coordinates relative to a
|
||||||
|
|
@ -318,6 +320,32 @@ class Table(object):
|
||||||
cell.hspan = True
|
cell.hspan = True
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def plot(self, geometry_type):
    """Plot geometry found on PDF page based on geometry_type
    specified, useful for debugging and playing with different
    parameters to get the best output.

    Parameters
    ----------
    geometry_type : str
        The geometry type for which a plot should be generated.
        Can be 'text', 'table', 'contour', 'joint', 'line'

    Raises
    ------
    NotImplementedError
        If the table's flavor is 'stream' and geometry_type is one of
        'contour', 'joint' or 'line'.

    """
    if self.flavor == 'stream' and geometry_type in ['contour', 'joint', 'line']:
        # Fill in the placeholder so the message names the offending
        # geometry type instead of showing a literal "{}".
        raise NotImplementedError(
            "{} cannot be plotted with flavor='stream'".format(geometry_type))

    # Dispatch to the matching plotting helper; any other value of
    # geometry_type falls through and nothing is plotted.
    if geometry_type == 'text':
        plot_text(self._text)
    elif geometry_type == 'table':
        plot_table(self)
    elif geometry_type == 'contour':
        plot_contour(self._image)
    elif geometry_type == 'joint':
        plot_joint(self._image)
    elif geometry_type == 'line':
        plot_line(self._segments)
|
||||||
|
|
||||||
def to_csv(self, path, **kwargs):
|
def to_csv(self, path, **kwargs):
|
||||||
"""Writes Table to a comma-separated values (csv) file.
|
"""Writes Table to a comma-separated values (csv) file.
|
||||||
|
|
||||||
|
|
@ -488,36 +516,4 @@ class TableList(object):
|
||||||
if compress:
|
if compress:
|
||||||
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
||||||
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
||||||
z.write(filepath, os.path.basename(filepath))
|
z.write(filepath, os.path.basename(filepath))
|
||||||
|
|
||||||
|
|
||||||
class Geometry(object):
|
|
||||||
def __init__(self):
|
|
||||||
self.text = []
|
|
||||||
self.images = ()
|
|
||||||
self.segments = ()
|
|
||||||
self.tables = []
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<{} text={} images={} segments={} tables={}>'.format(
|
|
||||||
self.__class__.__name__,
|
|
||||||
len(self.text),
|
|
||||||
len(self.images),
|
|
||||||
len(self.segments),
|
|
||||||
len(self.tables))
|
|
||||||
|
|
||||||
|
|
||||||
class GeometryList(object):
|
|
||||||
def __init__(self, geometry):
|
|
||||||
self.text = [g.text for g in geometry]
|
|
||||||
self.images = [g.images for g in geometry]
|
|
||||||
self.segments = [g.segments for g in geometry]
|
|
||||||
self.tables = [g.tables for g in geometry]
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<{} text={} images={} segments={} tables={}>'.format(
|
|
||||||
self.__class__.__name__,
|
|
||||||
len(self.text),
|
|
||||||
len(self.images),
|
|
||||||
len(self.segments),
|
|
||||||
len(self.tables))
|
|
||||||
|
|
@ -2,7 +2,7 @@ import os
|
||||||
|
|
||||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
|
|
||||||
from .core import TableList, GeometryList
|
from .core import TableList
|
||||||
from .parsers import Stream, Lattice
|
from .parsers import Stream, Lattice
|
||||||
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
||||||
get_rotation)
|
get_rotation)
|
||||||
|
|
@ -17,7 +17,7 @@ class PDFHandler(object):
|
||||||
----------
|
----------
|
||||||
filename : str
|
filename : str
|
||||||
Path to pdf file.
|
Path to pdf file.
|
||||||
pages : str
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers to parse.
|
Comma-separated page numbers to parse.
|
||||||
Example: 1,3,4 or 1,4-end
|
Example: 1,3,4 or 1,4-end
|
||||||
|
|
||||||
|
|
@ -35,7 +35,7 @@ class PDFHandler(object):
|
||||||
----------
|
----------
|
||||||
filename : str
|
filename : str
|
||||||
Path to pdf file.
|
Path to pdf file.
|
||||||
pages : str
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers to parse.
|
Comma-separated page numbers to parse.
|
||||||
Example: 1,3,4 or 1,4-end
|
Example: 1,3,4 or 1,4-end
|
||||||
|
|
||||||
|
|
@ -112,15 +112,15 @@ class PDFHandler(object):
|
||||||
with open(fpath, 'wb') as f:
|
with open(fpath, 'wb') as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
|
|
||||||
def parse(self, mesh=False, **kwargs):
|
def parse(self, flavor='lattice', **kwargs):
|
||||||
"""Extracts tables by calling parser.get_tables on all single
|
"""Extracts tables by calling parser.get_tables on all single
|
||||||
page pdfs.
|
page pdfs.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
mesh : bool (default: False)
|
flavor : str (default: 'lattice')
|
||||||
Whether or not to use Lattice method of parsing. Stream
|
The parsing method to use ('lattice' or 'stream').
|
||||||
is used by default.
|
Lattice is used by default.
|
||||||
kwargs : dict
|
kwargs : dict
|
||||||
See camelot.read_pdf kwargs.
|
See camelot.read_pdf kwargs.
|
||||||
|
|
||||||
|
|
@ -134,15 +134,13 @@ class PDFHandler(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
tables = []
|
tables = []
|
||||||
geometry = []
|
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
for p in self.pages:
|
for p in self.pages:
|
||||||
self._save_page(self.filename, p, tempdir)
|
self._save_page(self.filename, p, tempdir)
|
||||||
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
|
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
|
||||||
for p in self.pages]
|
for p in self.pages]
|
||||||
parser = Stream(**kwargs) if not mesh else Lattice(**kwargs)
|
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
||||||
for p in pages:
|
for p in pages:
|
||||||
t, g = parser.extract_tables(p)
|
t = parser.extract_tables(p)
|
||||||
tables.extend(t)
|
tables.extend(t)
|
||||||
geometry.append(g)
|
return TableList(tables)
|
||||||
return TableList(tables), GeometryList(geometry)
|
|
||||||
|
|
@ -2,22 +2,22 @@ from .handlers import PDFHandler
|
||||||
from .utils import validate_input, remove_extra
|
from .utils import validate_input, remove_extra
|
||||||
|
|
||||||
|
|
||||||
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
|
||||||
"""Read PDF and return parsed data tables.
|
"""Read PDF and return parsed data tables.
|
||||||
|
|
||||||
Note: kwargs annotated with ^ can only be used with mesh=False
|
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
||||||
and kwargs annotated with * can only be used with mesh=True.
|
and kwargs annotated with * can only be used with flavor='lattice'.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filepath : str
|
filepath : str
|
||||||
Path to pdf file.
|
Path to pdf file.
|
||||||
pages : str
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers to parse.
|
Comma-separated page numbers to parse.
|
||||||
Example: 1,3,4 or 1,4-end
|
Example: 1,3,4 or 1,4-end
|
||||||
mesh : bool (default: False)
|
flavor : str (default: 'lattice')
|
||||||
Whether or not to use Lattice method of parsing. Stream
|
The parsing method to use ('lattice' or 'stream').
|
||||||
is used by default.
|
Lattice is used by default.
|
||||||
table_area : list, optional (default: None)
|
table_area : list, optional (default: None)
|
||||||
List of table areas to process as strings of the form
|
List of table areas to process as strings of the form
|
||||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||||
|
|
@ -85,8 +85,8 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
||||||
tables : camelot.core.TableList
|
tables : camelot.core.TableList
|
||||||
|
|
||||||
"""
|
"""
|
||||||
validate_input(kwargs, mesh=mesh)
|
validate_input(kwargs, flavor=flavor)
|
||||||
p = PDFHandler(filepath, pages)
|
p = PDFHandler(filepath, pages)
|
||||||
kwargs = remove_extra(kwargs, mesh=mesh)
|
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||||
tables, __ = p.parse(mesh=mesh, **kwargs)
|
tables, __ = p.parse(flavor=flavor, **kwargs)
|
||||||
return tables
|
return tables
|
||||||
|
|
@ -194,7 +194,8 @@ class Lattice(BaseParser):
|
||||||
stderr=subprocess.STDOUT)
|
stderr=subprocess.STDOUT)
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
self.image, self.threshold = adaptive_threshold(self.imagename, process_background=self.process_background,
|
self.image, self.threshold = adaptive_threshold(
|
||||||
|
self.imagename, process_background=self.process_background,
|
||||||
blocksize=self.threshold_blocksize, c=self.threshold_constant)
|
blocksize=self.threshold_blocksize, c=self.threshold_constant)
|
||||||
image_width = self.image.shape[1]
|
image_width = self.image.shape[1]
|
||||||
image_height = self.image.shape[0]
|
image_height = self.image.shape[0]
|
||||||
|
|
@ -297,11 +298,20 @@ class Lattice(BaseParser):
|
||||||
table.shape = table.df.shape
|
table.shape = table.df.shape
|
||||||
|
|
||||||
whitespace = compute_whitespace(data)
|
whitespace = compute_whitespace(data)
|
||||||
|
table.flavor = 'lattice'
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
table.whitespace = whitespace
|
table.whitespace = whitespace
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
||||||
|
|
||||||
|
# for plotting
|
||||||
|
_text = []
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
|
table._text = _text
|
||||||
|
table._image = (self.image, self.table_bbox_unscaled)
|
||||||
|
table._segments = (self.vertical_segments, self.horizontal_segments)
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename):
|
def extract_tables(self, filename):
|
||||||
|
|
@ -311,7 +321,7 @@ class Lattice(BaseParser):
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
logger.info("No tables found on {}".format(
|
logger.info("No tables found on {}".format(
|
||||||
os.path.basename(self.rootname)))
|
os.path.basename(self.rootname)))
|
||||||
return [], self.g
|
return []
|
||||||
|
|
||||||
self._generate_image()
|
self._generate_image()
|
||||||
self._generate_table_bbox()
|
self._generate_table_bbox()
|
||||||
|
|
@ -324,13 +334,4 @@ class Lattice(BaseParser):
|
||||||
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
||||||
_tables.append(table)
|
_tables.append(table)
|
||||||
|
|
||||||
if self.debug:
|
return _tables
|
||||||
text = []
|
|
||||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
|
||||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
|
||||||
self.g.text = text
|
|
||||||
self.g.images = (self.image, self.table_bbox_unscaled)
|
|
||||||
self.g.segments = (self.vertical_segments, self.horizontal_segments)
|
|
||||||
self.g.tables = _tables
|
|
||||||
|
|
||||||
return _tables, self.g
|
|
||||||
|
|
@ -333,11 +333,20 @@ class Stream(BaseParser):
|
||||||
table.shape = table.df.shape
|
table.shape = table.df.shape
|
||||||
|
|
||||||
whitespace = compute_whitespace(data)
|
whitespace = compute_whitespace(data)
|
||||||
|
table.flavor = 'stream'
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
table.whitespace = whitespace
|
table.whitespace = whitespace
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
||||||
|
|
||||||
|
# for plotting
|
||||||
|
_text = []
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
|
table._text = _text
|
||||||
|
table._image = None
|
||||||
|
table._segments = None
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename):
|
def extract_tables(self, filename):
|
||||||
|
|
@ -347,7 +356,7 @@ class Stream(BaseParser):
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
logger.info("No tables found on {}".format(
|
logger.info("No tables found on {}".format(
|
||||||
os.path.basename(self.rootname)))
|
os.path.basename(self.rootname)))
|
||||||
return [], self.g
|
return []
|
||||||
|
|
||||||
self._generate_table_bbox()
|
self._generate_table_bbox()
|
||||||
|
|
||||||
|
|
@ -359,11 +368,4 @@ class Stream(BaseParser):
|
||||||
table = self._generate_table(table_idx, cols, rows)
|
table = self._generate_table(table_idx, cols, rows)
|
||||||
_tables.append(table)
|
_tables.append(table)
|
||||||
|
|
||||||
if self.debug:
|
return _tables
|
||||||
text = []
|
|
||||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
|
||||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
|
||||||
self.g.text = text
|
|
||||||
self.g.tables = _tables
|
|
||||||
|
|
||||||
return _tables, self.g
|
|
||||||
|
|
@ -2,165 +2,72 @@ import cv2
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import matplotlib.patches as patches
|
import matplotlib.patches as patches
|
||||||
|
|
||||||
from .handlers import PDFHandler
|
|
||||||
from .utils import validate_input, remove_extra
|
def plot_text(text):
    """Draw one rectangle per text bounding box on a new figure and show it.

    Parameters
    ----------
    text : list
        Sequence of 4-item bounding boxes. Indices 0 and 1 are used as
        the rectangle's anchor point; indices 2 and 3 as the opposite
        corner (width is ``t[2] - t[0]``, height is ``t[3] - t[1]``).

    """
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect='equal')
    xs, ys = [], []
    for t in text:
        # Collect x extents (indices 0 and 2) and y extents (indices 1
        # and 3) so the axis limits match the rectangles drawn below.
        xs.extend([t[0], t[2]])
        ys.extend([t[1], t[3]])
        ax.add_patch(
            patches.Rectangle(
                (t[0], t[1]),
                t[2] - t[0],
                t[3] - t[1]
            )
        )
    # min()/max() raise ValueError on an empty sequence; with no text
    # boxes keep the default axis limits and show an empty figure.
    if xs:
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
        ax.set_ylim(min(ys) - 10, max(ys) + 10)
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs):
|
def plot_table(table):
|
||||||
"""Plot geometry found on pdf page based on type specified,
|
for row in table.cells:
|
||||||
useful for debugging and playing with different parameters to get
|
for cell in row:
|
||||||
the best output.
|
if cell.left:
|
||||||
|
plt.plot([cell.lb[0], cell.lt[0]],
|
||||||
|
[cell.lb[1], cell.lt[1]])
|
||||||
|
if cell.right:
|
||||||
|
plt.plot([cell.rb[0], cell.rt[0]],
|
||||||
|
[cell.rb[1], cell.rt[1]])
|
||||||
|
if cell.top:
|
||||||
|
plt.plot([cell.lt[0], cell.rt[0]],
|
||||||
|
[cell.lt[1], cell.rt[1]])
|
||||||
|
if cell.bottom:
|
||||||
|
plt.plot([cell.lb[0], cell.rb[0]],
|
||||||
|
[cell.lb[1], cell.rb[1]])
|
||||||
|
plt.show()
|
||||||
|
|
||||||
Note: kwargs annotated with ^ can only be used with mesh=False
|
|
||||||
and kwargs annotated with * can only be used with mesh=True.
|
|
||||||
|
|
||||||
Parameters
|
def plot_contour(image):
|
||||||
----------
|
img, table_bbox = image
|
||||||
filepath : str
|
for t in table_bbox.keys():
|
||||||
Path to pdf file.
|
cv2.rectangle(img, (t[0], t[1]),
|
||||||
pages : str
|
(t[2], t[3]), (255, 0, 0), 20)
|
||||||
Comma-separated page numbers to parse.
|
plt.imshow(img)
|
||||||
Example: 1,3,4 or 1,4-end
|
plt.show()
|
||||||
mesh : bool (default: False)
|
|
||||||
Whether or not to use Lattice method of parsing. Stream
|
|
||||||
is used by default.
|
|
||||||
geometry_type : str, optional (default: None)
|
|
||||||
* 'text' : Plot text objects found on page. (Useful to get \
|
|
||||||
table_area and columns coordinates)
|
|
||||||
* 'table' : Plot parsed table.
|
|
||||||
* 'contour'* : Plot detected rectangles.
|
|
||||||
* 'joint'* : Plot detected line intersections.
|
|
||||||
* 'line'* : Plot detected lines.
|
|
||||||
table_area : list, optional (default: None)
|
|
||||||
List of table areas to process as strings of the form
|
|
||||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
|
||||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
|
||||||
columns^ : list, optional (default: None)
|
|
||||||
List of column x-coordinates as strings where the coordinates
|
|
||||||
are comma-separated.
|
|
||||||
split_text : bool, optional (default: False)
|
|
||||||
Whether or not to split a text line if it spans across
|
|
||||||
multiple cells.
|
|
||||||
flag_size : bool, optional (default: False)
|
|
||||||
Whether or not to highlight a substring using <s></s>
|
|
||||||
if its size is different from rest of the string. (Useful for
|
|
||||||
super and subscripts.)
|
|
||||||
row_close_tol^ : int, optional (default: 2)
|
|
||||||
Rows will be formed by combining text vertically
|
|
||||||
within this tolerance.
|
|
||||||
col_close_tol^ : int, optional (default: 0)
|
|
||||||
Columns will be formed by combining text horizontally
|
|
||||||
within this tolerance.
|
|
||||||
process_background* : bool, optional (default: False)
|
|
||||||
Whether or not to process lines that are in background.
|
|
||||||
line_size_scaling* : int, optional (default: 15)
|
|
||||||
Factor by which the page dimensions will be divided to get
|
|
||||||
smallest length of lines that should be detected.
|
|
||||||
|
|
||||||
The larger this value, smaller the detected lines. Making it
|
|
||||||
too large will lead to text being detected as lines.
|
|
||||||
copy_text* : list, optional (default: None)
|
|
||||||
{'h', 'v'}
|
|
||||||
Select one or more strings from above and pass them as a list
|
|
||||||
to specify the direction in which text should be copied over
|
|
||||||
when a cell spans multiple rows or columns.
|
|
||||||
shift_text* : list, optional (default: ['l', 't'])
|
|
||||||
{'l', 'r', 't', 'b'}
|
|
||||||
Select one or more strings from above and pass them as a list
|
|
||||||
to specify where the text in a spanning cell should flow.
|
|
||||||
line_close_tol* : int, optional (default: 2)
|
|
||||||
Tolerance parameter used to merge vertical and horizontal
|
|
||||||
detected lines which lie close to each other.
|
|
||||||
joint_close_tol* : int, optional (default: 2)
|
|
||||||
Tolerance parameter used to decide whether the detected lines
|
|
||||||
and points lie close to each other.
|
|
||||||
threshold_blocksize* : int, optional (default: 15)
|
|
||||||
Size of a pixel neighborhood that is used to calculate a
|
|
||||||
threshold value for the pixel: 3, 5, 7, and so on.
|
|
||||||
|
|
||||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
def plot_joint(image):
|
||||||
threshold_constant* : int, optional (default: -2)
|
img, table_bbox = image
|
||||||
Constant subtracted from the mean or weighted mean.
|
x_coord = []
|
||||||
Normally, it is positive but may be zero or negative as well.
|
y_coord = []
|
||||||
|
for k in table_bbox.keys():
|
||||||
|
for coord in table_bbox[k]:
|
||||||
|
x_coord.append(coord[0])
|
||||||
|
y_coord.append(coord[1])
|
||||||
|
max_x, max_y = max(x_coord), max(y_coord)
|
||||||
|
plt.plot(x_coord, y_coord, 'ro')
|
||||||
|
plt.axis([0, max_x + 100, max_y + 100, 0])
|
||||||
|
plt.imshow(img)
|
||||||
|
plt.show()
|
||||||
|
|
||||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
|
||||||
iterations* : int, optional (default: 0)
|
|
||||||
Number of times for erosion/dilation is applied.
|
|
||||||
|
|
||||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
def plot_line(segments):
|
||||||
margins : tuple
|
vertical, horizontal = segments
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
for v in vertical:
|
||||||
|
plt.plot([v[0], v[2]], [v[1], v[3]])
|
||||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
for h in horizontal:
|
||||||
|
plt.plot([h[0], h[2]], [h[1], h[3]])
|
||||||
"""
|
plt.show()
|
||||||
validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)
|
|
||||||
p = PDFHandler(filepath, pages)
|
|
||||||
kwargs = remove_extra(kwargs, mesh=mesh)
|
|
||||||
debug = True if geometry_type is not None else False
|
|
||||||
kwargs.update({'debug': debug})
|
|
||||||
__, geometry = p.parse(mesh=mesh, **kwargs)
|
|
||||||
|
|
||||||
if geometry_type == 'text':
|
|
||||||
for text in geometry.text:
|
|
||||||
fig = plt.figure()
|
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
|
||||||
xs, ys = [], []
|
|
||||||
for t in text:
|
|
||||||
xs.extend([t[0], t[1]])
|
|
||||||
ys.extend([t[2], t[3]])
|
|
||||||
ax.add_patch(
|
|
||||||
patches.Rectangle(
|
|
||||||
(t[0], t[1]),
|
|
||||||
t[2] - t[0],
|
|
||||||
t[3] - t[1]
|
|
||||||
)
|
|
||||||
)
|
|
||||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
|
||||||
plt.show()
|
|
||||||
elif geometry_type == 'table':
|
|
||||||
for tables in geometry.tables:
|
|
||||||
for table in tables:
|
|
||||||
for row in table.cells:
|
|
||||||
for cell in row:
|
|
||||||
if cell.left:
|
|
||||||
plt.plot([cell.lb[0], cell.lt[0]],
|
|
||||||
[cell.lb[1], cell.lt[1]])
|
|
||||||
if cell.right:
|
|
||||||
plt.plot([cell.rb[0], cell.rt[0]],
|
|
||||||
[cell.rb[1], cell.rt[1]])
|
|
||||||
if cell.top:
|
|
||||||
plt.plot([cell.lt[0], cell.rt[0]],
|
|
||||||
[cell.lt[1], cell.rt[1]])
|
|
||||||
if cell.bottom:
|
|
||||||
plt.plot([cell.lb[0], cell.rb[0]],
|
|
||||||
[cell.lb[1], cell.rb[1]])
|
|
||||||
plt.show()
|
|
||||||
elif geometry_type == 'contour':
|
|
||||||
for img, table_bbox in geometry.images:
|
|
||||||
for t in table_bbox.keys():
|
|
||||||
cv2.rectangle(img, (t[0], t[1]),
|
|
||||||
(t[2], t[3]), (255, 0, 0), 20)
|
|
||||||
plt.imshow(img)
|
|
||||||
plt.show()
|
|
||||||
elif geometry_type == 'joint':
|
|
||||||
for img, table_bbox in geometry.images:
|
|
||||||
x_coord = []
|
|
||||||
y_coord = []
|
|
||||||
for k in table_bbox.keys():
|
|
||||||
for coord in table_bbox[k]:
|
|
||||||
x_coord.append(coord[0])
|
|
||||||
y_coord.append(coord[1])
|
|
||||||
max_x, max_y = max(x_coord), max(y_coord)
|
|
||||||
plt.plot(x_coord, y_coord, 'ro')
|
|
||||||
plt.axis([0, max_x + 100, max_y + 100, 0])
|
|
||||||
plt.imshow(img)
|
|
||||||
plt.show()
|
|
||||||
elif geometry_type == 'line':
|
|
||||||
for v_s, h_s in geometry.segments:
|
|
||||||
for v in v_s:
|
|
||||||
plt.plot([v[0], v[2]], [v[1], v[3]])
|
|
||||||
for h in h_s:
|
|
||||||
plt.plot([h[0], h[2]], [h[1], h[3]])
|
|
||||||
plt.show()
|
|
||||||
|
|
@ -38,25 +38,25 @@ lattice_kwargs = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def validate_input(kwargs, mesh=False, geometry_type=False):
|
def validate_input(kwargs, flavor='lattice', geometry_type=False):
|
||||||
def check_intersection(parser_kwargs, input_kwargs, message_bool):
|
def check_intersection(parser_kwargs, input_kwargs):
|
||||||
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
||||||
if isec:
|
if isec:
|
||||||
raise ValueError("{} can not be used with mesh set to {}".format(
|
raise ValueError("{} cannot be used with flavor='{}'".format(
|
||||||
",".join(sorted(isec)), message_bool))
|
",".join(sorted(isec)), flavor))
|
||||||
|
|
||||||
if mesh:
|
if flavor == 'lattice':
|
||||||
check_intersection(stream_kwargs, kwargs, True)
|
check_intersection(stream_kwargs, kwargs)
|
||||||
else:
|
else:
|
||||||
check_intersection(lattice_kwargs, kwargs, False)
|
check_intersection(lattice_kwargs, kwargs)
|
||||||
if geometry_type:
|
if geometry_type:
|
||||||
if not mesh and geometry_type in ['contour', 'joint', 'line']:
|
if flavor != 'lattice' and geometry_type in ['contour', 'joint', 'line']:
|
||||||
raise ValueError("Use geometry_type={} with mesh set to True".format(
|
raise ValueError("Use geometry_type='{}' with flavor='lattice'".format(
|
||||||
geometry_type))
|
geometry_type))
|
||||||
|
|
||||||
|
|
||||||
def remove_extra(kwargs, mesh=False):
|
def remove_extra(kwargs, flavor='lattice'):
|
||||||
if mesh:
|
if flavor == 'lattice':
|
||||||
for key in kwargs.keys():
|
for key in kwargs.keys():
|
||||||
if key in stream_kwargs:
|
if key in stream_kwargs:
|
||||||
kwargs.pop(key)
|
kwargs.pop(key)
|
||||||
|
|
|
||||||
2
setup.py
2
setup.py
|
|
@ -9,7 +9,7 @@ with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f:
|
||||||
exec(f.read(), about)
|
exec(f.read(), about)
|
||||||
|
|
||||||
# TODO: Move these to __version__.py
|
# TODO: Move these to __version__.py
|
||||||
NAME = 'camelot'
|
NAME = 'camelot-py'
|
||||||
VERSION = about['__version__']
|
VERSION = about['__version__']
|
||||||
DESCRIPTION = 'PDF Table Parsing for Humans'
|
DESCRIPTION = 'PDF Table Parsing for Humans'
|
||||||
with open('README.md') as f:
|
with open('README.md') as f:
|
||||||
|
|
|
||||||
|
|
@ -18,11 +18,11 @@ def test_stream_table_rotated():
|
||||||
df = pd.DataFrame(data_stream_table_rotated)
|
df = pd.DataFrame(data_stream_table_rotated)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "clockwise_table_2.pdf")
|
filename = os.path.join(testdir, "clockwise_table_2.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -30,7 +30,7 @@ def test_stream_table_area():
|
||||||
df = pd.DataFrame(data_stream_table_area_single)
|
df = pd.DataFrame(data_stream_table_area_single)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||||
tables = camelot.read_pdf(filename, table_area=["320,500,573,335"])
|
tables = camelot.read_pdf(filename, flavor="stream", table_area=["320,500,573,335"])
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -39,7 +39,7 @@ def test_stream_columns():
|
||||||
|
|
||||||
filename = os.path.join(testdir, "mexican_towns.pdf")
|
filename = os.path.join(testdir, "mexican_towns.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, columns=["67,180,230,425,475"], row_close_tol=10)
|
filename, flavor="stream", columns=["67,180,230,425,475"], row_close_tol=10)
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -48,7 +48,7 @@ def test_lattice():
|
||||||
|
|
||||||
filename = os.path.join(testdir,
|
filename = os.path.join(testdir,
|
||||||
"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf")
|
"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf")
|
||||||
tables = camelot.read_pdf(filename, pages="2", mesh=True)
|
tables = camelot.read_pdf(filename, pages="2")
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -56,11 +56,11 @@ def test_lattice_table_rotated():
|
||||||
df = pd.DataFrame(data_lattice_table_rotated)
|
df = pd.DataFrame(data_lattice_table_rotated)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "clockwise_table_1.pdf")
|
filename = os.path.join(testdir, "clockwise_table_1.pdf")
|
||||||
tables = camelot.read_pdf(filename, mesh=True)
|
tables = camelot.read_pdf(filename)
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
|
filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
|
||||||
tables = camelot.read_pdf(filename, mesh=True)
|
tables = camelot.read_pdf(filename)
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -68,7 +68,7 @@ def test_lattice_process_background():
|
||||||
df = pd.DataFrame(data_lattice_process_background)
|
df = pd.DataFrame(data_lattice_process_background)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "background_lines_1.pdf")
|
filename = os.path.join(testdir, "background_lines_1.pdf")
|
||||||
tables = camelot.read_pdf(filename, mesh=True, process_background=True)
|
tables = camelot.read_pdf(filename, process_background=True)
|
||||||
assert df.equals(tables[1].df)
|
assert df.equals(tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -76,5 +76,5 @@ def test_lattice_copy_text():
|
||||||
df = pd.DataFrame(data_lattice_copy_text)
|
df = pd.DataFrame(data_lattice_copy_text)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "row_span_1.pdf")
|
filename = os.path.join(testdir, "row_span_1.pdf")
|
||||||
tables = camelot.read_pdf(filename, mesh=True, line_size_scaling=60, copy_text="v")
|
tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
Loading…
Reference in New Issue