commit
481b62a9f6
|
|
@ -12,7 +12,7 @@
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
>>> import camelot
|
>>> import camelot
|
||||||
>>> tables = camelot.read_pdf('foo.pdf', mesh=True)
|
>>> tables = camelot.read_pdf('foo.pdf')
|
||||||
>>> tables
|
>>> tables
|
||||||
<TableList tables=1>
|
<TableList tables=1>
|
||||||
>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html
|
>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,3 @@
|
||||||
from .__version__ import __version__
|
from .__version__ import __version__
|
||||||
|
|
||||||
from .io import read_pdf
|
from .io import read_pdf
|
||||||
from .plotting import plot_geometry
|
|
||||||
153
camelot/cli.py
153
camelot/cli.py
|
|
@ -5,19 +5,20 @@ import click
|
||||||
|
|
||||||
from . import __version__
|
from . import __version__
|
||||||
from .io import read_pdf
|
from .io import read_pdf
|
||||||
from .plotting import plot_geometry
|
|
||||||
from .utils import validate_input, remove_extra
|
|
||||||
|
|
||||||
|
|
||||||
class Mutex(click.Option):
|
class Config(object):
|
||||||
def handle_parse_result(self, ctx, opts, args):
|
def __init__(self):
|
||||||
mesh = opts.get('mesh', False)
|
self.config = {}
|
||||||
geometry_type = opts.get('geometry_type', False)
|
|
||||||
validate_input(opts, mesh=mesh, geometry_type=geometry_type)
|
def set_config(self, key, value):
|
||||||
return super(Mutex, self).handle_parse_result(ctx, opts, args)
|
self.config[key] = value
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
pass_config = click.make_pass_decorator(Config)
|
||||||
|
|
||||||
|
|
||||||
|
@click.group()
|
||||||
@click.version_option(version=__version__)
|
@click.version_option(version=__version__)
|
||||||
@click.option("-p", "--pages", default="1", help="Comma-separated page numbers"
|
@click.option("-p", "--pages", default="1", help="Comma-separated page numbers"
|
||||||
" to parse. Example: 1,3,4 or 1,4-end")
|
" to parse. Example: 1,3,4 or 1,4-end")
|
||||||
|
|
@ -27,11 +28,6 @@ class Mutex(click.Option):
|
||||||
help="Output file format.")
|
help="Output file format.")
|
||||||
@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP"
|
@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP"
|
||||||
" archive.")
|
" archive.")
|
||||||
@click.option("-m", "--mesh", is_flag=True, help="Whether or not to"
|
|
||||||
" use Lattice method of parsing. Stream is used by default.")
|
|
||||||
@click.option("-T", "--table_area", default=[], multiple=True,
|
|
||||||
help="Table areas (x1,y1,x2,y2) to process.\n"
|
|
||||||
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
|
||||||
@click.option("-split", "--split_text", is_flag=True, help="Whether or not to"
|
@click.option("-split", "--split_text", is_flag=True, help="Whether or not to"
|
||||||
" split text if it spans across multiple cells.")
|
" split text if it spans across multiple cells.")
|
||||||
@click.option("-flag", "--flag_size", is_flag=True, help="(inactive) Whether or"
|
@click.option("-flag", "--flag_size", is_flag=True, help="(inactive) Whether or"
|
||||||
|
|
@ -39,76 +35,121 @@ class Mutex(click.Option):
|
||||||
" super/subscripts)")
|
" super/subscripts)")
|
||||||
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
|
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
|
||||||
help="char_margin, line_margin, word_margin for PDFMiner.")
|
help="char_margin, line_margin, word_margin for PDFMiner.")
|
||||||
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
|
@click.pass_context
|
||||||
help="x-coordinates of column separators.")
|
def cli(ctx, *args, **kwargs):
|
||||||
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be"
|
ctx.obj = Config()
|
||||||
" formed by combining text vertically within this tolerance.")
|
for key, value in kwargs.iteritems():
|
||||||
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will"
|
ctx.obj.set_config(key, value)
|
||||||
" be formed by combining text horizontally within this tolerance.")
|
|
||||||
@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
|
|
||||||
|
@cli.command('lattice')
|
||||||
|
@click.option("-T", "--table_area", default=[], multiple=True,
|
||||||
|
help="Table areas (x1,y1,x2,y2) to process.\n"
|
||||||
|
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
||||||
|
@click.option("-back", "--process_background", is_flag=True,
|
||||||
help="(with --mesh) Whether or not to process lines that are in"
|
help="(with --mesh) Whether or not to process lines that are in"
|
||||||
" background.")
|
" background.")
|
||||||
@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex,
|
@click.option("-scale", "--line_size_scaling", default=15,
|
||||||
help="(with --mesh) Factor by which the page dimensions will be"
|
help="(with --mesh) Factor by which the page dimensions will be"
|
||||||
" divided to get smallest length of detected lines.")
|
" divided to get smallest length of detected lines.")
|
||||||
@click.option("-copy", "--copy_text", default=[], type=click.Choice(["h", "v"]),
|
@click.option("-copy", "--copy_text", default=[], type=click.Choice(["h", "v"]),
|
||||||
multiple=True, cls=Mutex, help="(with --mesh) Specify direction"
|
multiple=True, help="(with --mesh) Specify direction"
|
||||||
" in which text will be copied over in a spanning cell.")
|
" in which text will be copied over in a spanning cell.")
|
||||||
@click.option("-shift", "--shift_text", default=["l", "t"],
|
@click.option("-shift", "--shift_text", default=["l", "t"],
|
||||||
type=click.Choice(["", "l", "r", "t", "b"]), multiple=True, cls=Mutex,
|
type=click.Choice(["", "l", "r", "t", "b"]), multiple=True,
|
||||||
help="(with --mesh) Specify direction in which text in a spanning"
|
help="(with --mesh) Specify direction in which text in a spanning"
|
||||||
" cell should flow.")
|
" cell should flow.")
|
||||||
@click.option("-l", "--line_close_tol", default=2, cls=Mutex,
|
@click.option("-l", "--line_close_tol", default=2,
|
||||||
help="(with --mesh) Tolerance parameter used to merge close vertical"
|
help="(with --mesh) Tolerance parameter used to merge close vertical"
|
||||||
" lines and close horizontal lines.")
|
" lines and close horizontal lines.")
|
||||||
@click.option("-j", "--joint_close_tol", default=2, cls=Mutex,
|
@click.option("-j", "--joint_close_tol", default=2,
|
||||||
help="(with --mesh) Tolerance parameter used to decide whether"
|
help="(with --mesh) Tolerance parameter used to decide whether"
|
||||||
" the detected lines and points lie close to each other.")
|
" the detected lines and points lie close to each other.")
|
||||||
@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex,
|
@click.option("-block", "--threshold_blocksize", default=15,
|
||||||
help="(with --mesh) For adaptive thresholding, size of a pixel"
|
help="(with --mesh) For adaptive thresholding, size of a pixel"
|
||||||
" neighborhood that is used to calculate a threshold value for"
|
" neighborhood that is used to calculate a threshold value for"
|
||||||
" the pixel: 3, 5, 7, and so on.")
|
" the pixel: 3, 5, 7, and so on.")
|
||||||
@click.option("-const", "--threshold_constant", default=-2, cls=Mutex,
|
@click.option("-const", "--threshold_constant", default=-2,
|
||||||
help="(with --mesh) For adaptive thresholding, constant subtracted"
|
help="(with --mesh) For adaptive thresholding, constant subtracted"
|
||||||
" from the mean or weighted mean.\nNormally, it is positive but"
|
" from the mean or weighted mean.\nNormally, it is positive but"
|
||||||
" may be zero or negative as well.")
|
" may be zero or negative as well.")
|
||||||
@click.option("-I", "--iterations", default=0, cls=Mutex,
|
@click.option("-I", "--iterations", default=0,
|
||||||
help="(with --mesh) Number of times for erosion/dilation is"
|
help="(with --mesh) Number of times for erosion/dilation is"
|
||||||
" applied.")
|
" applied.")
|
||||||
@click.option("-G", "--geometry_type",
|
@click.option("-plot", "--plot_type",
|
||||||
type=click.Choice(["text", "table", "contour", "joint", "line"]),
|
type=click.Choice(["text", "table", "contour", "joint", "line"]),
|
||||||
help="Plot geometry found on pdf page for debugging.\n\n"
|
help="Plot geometry found on PDF page for debugging.")
|
||||||
"text: Plot text objects. (Useful to get table_area and"
|
|
||||||
" columns coordinates)\ntable: Plot parsed table.\n"
|
|
||||||
"contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line"
|
|
||||||
" intersections.\nline (with --mesh): Plot detected lines.")
|
|
||||||
@click.argument("filepath", type=click.Path(exists=True))
|
@click.argument("filepath", type=click.Path(exists=True))
|
||||||
def cli(*args, **kwargs):
|
@pass_config
|
||||||
pages = kwargs.pop("pages")
|
def lattice(c, *args, **kwargs):
|
||||||
output = kwargs.pop("output")
|
"""Use lines between text to parse table."""
|
||||||
f = kwargs.pop("format")
|
conf = c.config
|
||||||
compress = kwargs.pop("zip")
|
pages = conf.pop("pages")
|
||||||
mesh = kwargs.pop("mesh")
|
output = conf.pop("output")
|
||||||
geometry_type = kwargs.pop("geometry_type")
|
f = conf.pop("format")
|
||||||
|
compress = conf.pop("zip")
|
||||||
|
plot_type = kwargs.pop('plot_type')
|
||||||
filepath = kwargs.pop("filepath")
|
filepath = kwargs.pop("filepath")
|
||||||
|
kwargs.update(conf)
|
||||||
|
|
||||||
|
table_area = list(kwargs['table_area'])
|
||||||
|
kwargs['table_area'] = None if not table_area else table_area
|
||||||
|
copy_text = list(kwargs['copy_text'])
|
||||||
|
kwargs['copy_text'] = None if not copy_text else copy_text
|
||||||
|
kwargs['shift_text'] = list(kwargs['shift_text'])
|
||||||
|
|
||||||
|
tables = read_pdf(filepath, pages=pages, flavor='lattice', **kwargs)
|
||||||
|
click.echo(tables)
|
||||||
|
if plot_type is not None:
|
||||||
|
for table in tables:
|
||||||
|
table.plot(plot_type)
|
||||||
|
else:
|
||||||
|
if output is None:
|
||||||
|
raise click.UsageError("Please specify output filepath using --output")
|
||||||
|
if f is None:
|
||||||
|
raise click.UsageError("Please specify output format using --format")
|
||||||
|
tables.export(output, f=f, compress=compress)
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command('stream')
|
||||||
|
@click.option("-T", "--table_area", default=[], multiple=True,
|
||||||
|
help="Table areas (x1,y1,x2,y2) to process.\n"
|
||||||
|
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
||||||
|
@click.option("-C", "--columns", default=[], multiple=True,
|
||||||
|
help="x-coordinates of column separators.")
|
||||||
|
@click.option("-r", "--row_close_tol", default=2, help="Rows will be"
|
||||||
|
" formed by combining text vertically within this tolerance.")
|
||||||
|
@click.option("-c", "--col_close_tol", default=0, help="Columns will"
|
||||||
|
" be formed by combining text horizontally within this tolerance.")
|
||||||
|
@click.option("-plot", "--plot_type",
|
||||||
|
type=click.Choice(["text", "table"]),
|
||||||
|
help="Plot geometry found on PDF page for debugging.")
|
||||||
|
@click.argument("filepath", type=click.Path(exists=True))
|
||||||
|
@pass_config
|
||||||
|
def stream(c, *args, **kwargs):
|
||||||
|
"""Use spaces between text to parse table."""
|
||||||
|
conf = c.config
|
||||||
|
pages = conf.pop("pages")
|
||||||
|
output = conf.pop("output")
|
||||||
|
f = conf.pop("format")
|
||||||
|
compress = conf.pop("zip")
|
||||||
|
plot_type = kwargs.pop('plot_type')
|
||||||
|
filepath = kwargs.pop("filepath")
|
||||||
|
kwargs.update(conf)
|
||||||
|
|
||||||
table_area = list(kwargs['table_area'])
|
table_area = list(kwargs['table_area'])
|
||||||
kwargs['table_area'] = None if not table_area else table_area
|
kwargs['table_area'] = None if not table_area else table_area
|
||||||
columns = list(kwargs['columns'])
|
columns = list(kwargs['columns'])
|
||||||
kwargs['columns'] = None if not columns else columns
|
kwargs['columns'] = None if not columns else columns
|
||||||
copy_text = list(kwargs['copy_text'])
|
|
||||||
kwargs['copy_text'] = None if not copy_text else copy_text
|
|
||||||
kwargs['shift_text'] = list(kwargs['shift_text'])
|
|
||||||
|
|
||||||
kwargs = remove_extra(kwargs, mesh=mesh)
|
tables = read_pdf(filepath, pages=pages, flavor='stream', **kwargs)
|
||||||
if geometry_type is None:
|
click.echo(tables)
|
||||||
tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs)
|
if plot_type is not None:
|
||||||
click.echo(tables)
|
for table in tables:
|
||||||
if output is None:
|
table.plot(plot_type)
|
||||||
raise click.UsageError("Please specify an output filepath using --output")
|
|
||||||
if f is None:
|
|
||||||
raise click.UsageError("Please specify an output format using --format")
|
|
||||||
tables.export(output, f=f, compress=compress)
|
|
||||||
else:
|
else:
|
||||||
plot_geometry(filepath, pages=pages, mesh=mesh,
|
if output is None:
|
||||||
geometry_type=geometry_type, **kwargs)
|
raise click.UsageError("Please specify output filepath using --output")
|
||||||
|
if f is None:
|
||||||
|
raise click.UsageError("Please specify output format using --format")
|
||||||
|
tables.export(output, f=f, compress=compress)
|
||||||
|
|
@ -6,6 +6,8 @@ import tempfile
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from .plotting import *
|
||||||
|
|
||||||
|
|
||||||
class Cell(object):
|
class Cell(object):
|
||||||
"""Defines a cell in a table with coordinates relative to a
|
"""Defines a cell in a table with coordinates relative to a
|
||||||
|
|
@ -318,6 +320,33 @@ class Table(object):
|
||||||
cell.hspan = True
|
cell.hspan = True
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def plot(self, geometry_type):
|
||||||
|
"""Plot geometry found on PDF page based on geometry_type
|
||||||
|
specified, useful for debugging and playing with different
|
||||||
|
parameters to get the best output.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
geometry_type : str
|
||||||
|
The geometry type for which a plot should be generated.
|
||||||
|
Can be 'text', 'table', 'contour', 'joint', 'line'
|
||||||
|
|
||||||
|
"""
|
||||||
|
if self.flavor == 'stream' and geometry_type in ['contour', 'joint', 'line']:
|
||||||
|
raise NotImplementedError("{} cannot be plotted with flavor='stream'".format(
|
||||||
|
geometry_type))
|
||||||
|
|
||||||
|
if geometry_type == 'text':
|
||||||
|
plot_text(self._text)
|
||||||
|
elif geometry_type == 'table':
|
||||||
|
plot_table(self)
|
||||||
|
elif geometry_type == 'contour':
|
||||||
|
plot_contour(self._image)
|
||||||
|
elif geometry_type == 'joint':
|
||||||
|
plot_joint(self._image)
|
||||||
|
elif geometry_type == 'line':
|
||||||
|
plot_line(self._segments)
|
||||||
|
|
||||||
def to_csv(self, path, **kwargs):
|
def to_csv(self, path, **kwargs):
|
||||||
"""Writes Table to a comma-separated values (csv) file.
|
"""Writes Table to a comma-separated values (csv) file.
|
||||||
|
|
||||||
|
|
@ -416,13 +445,25 @@ class TableList(object):
|
||||||
def __getitem__(self, idx):
|
def __getitem__(self, idx):
|
||||||
return self._tables[idx]
|
return self._tables[idx]
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
self._n = 0
|
||||||
|
return self
|
||||||
|
|
||||||
|
def next(self):
|
||||||
|
if self._n < len(self):
|
||||||
|
r = self._tables[self._n]
|
||||||
|
self._n += 1
|
||||||
|
return r
|
||||||
|
else:
|
||||||
|
raise StopIteration
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _format_func(table, f):
|
def _format_func(table, f):
|
||||||
return getattr(table, 'to_{}'.format(f))
|
return getattr(table, 'to_{}'.format(f))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def n(self):
|
def n(self):
|
||||||
return len(self._tables)
|
return len(self)
|
||||||
|
|
||||||
def _write_file(self, f=None, **kwargs):
|
def _write_file(self, f=None, **kwargs):
|
||||||
dirname = kwargs.get('dirname')
|
dirname = kwargs.get('dirname')
|
||||||
|
|
@ -488,36 +529,4 @@ class TableList(object):
|
||||||
if compress:
|
if compress:
|
||||||
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
||||||
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
||||||
z.write(filepath, os.path.basename(filepath))
|
z.write(filepath, os.path.basename(filepath))
|
||||||
|
|
||||||
|
|
||||||
class Geometry(object):
|
|
||||||
def __init__(self):
|
|
||||||
self.text = []
|
|
||||||
self.images = ()
|
|
||||||
self.segments = ()
|
|
||||||
self.tables = []
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<{} text={} images={} segments={} tables={}>'.format(
|
|
||||||
self.__class__.__name__,
|
|
||||||
len(self.text),
|
|
||||||
len(self.images),
|
|
||||||
len(self.segments),
|
|
||||||
len(self.tables))
|
|
||||||
|
|
||||||
|
|
||||||
class GeometryList(object):
|
|
||||||
def __init__(self, geometry):
|
|
||||||
self.text = [g.text for g in geometry]
|
|
||||||
self.images = [g.images for g in geometry]
|
|
||||||
self.segments = [g.segments for g in geometry]
|
|
||||||
self.tables = [g.tables for g in geometry]
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<{} text={} images={} segments={} tables={}>'.format(
|
|
||||||
self.__class__.__name__,
|
|
||||||
len(self.text),
|
|
||||||
len(self.images),
|
|
||||||
len(self.segments),
|
|
||||||
len(self.tables))
|
|
||||||
|
|
@ -2,7 +2,7 @@ import os
|
||||||
|
|
||||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
|
|
||||||
from .core import TableList, GeometryList
|
from .core import TableList
|
||||||
from .parsers import Stream, Lattice
|
from .parsers import Stream, Lattice
|
||||||
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
||||||
get_rotation)
|
get_rotation)
|
||||||
|
|
@ -17,7 +17,7 @@ class PDFHandler(object):
|
||||||
----------
|
----------
|
||||||
filename : str
|
filename : str
|
||||||
Path to pdf file.
|
Path to pdf file.
|
||||||
pages : str
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers to parse.
|
Comma-separated page numbers to parse.
|
||||||
Example: 1,3,4 or 1,4-end
|
Example: 1,3,4 or 1,4-end
|
||||||
|
|
||||||
|
|
@ -35,7 +35,7 @@ class PDFHandler(object):
|
||||||
----------
|
----------
|
||||||
filename : str
|
filename : str
|
||||||
Path to pdf file.
|
Path to pdf file.
|
||||||
pages : str
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers to parse.
|
Comma-separated page numbers to parse.
|
||||||
Example: 1,3,4 or 1,4-end
|
Example: 1,3,4 or 1,4-end
|
||||||
|
|
||||||
|
|
@ -112,15 +112,15 @@ class PDFHandler(object):
|
||||||
with open(fpath, 'wb') as f:
|
with open(fpath, 'wb') as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
|
|
||||||
def parse(self, mesh=False, **kwargs):
|
def parse(self, flavor='lattice', **kwargs):
|
||||||
"""Extracts tables by calling parser.get_tables on all single
|
"""Extracts tables by calling parser.get_tables on all single
|
||||||
page pdfs.
|
page pdfs.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
mesh : bool (default: False)
|
flavor : str (default: 'lattice')
|
||||||
Whether or not to use Lattice method of parsing. Stream
|
The parsing method to use ('lattice' or 'stream').
|
||||||
is used by default.
|
Lattice is used by default.
|
||||||
kwargs : dict
|
kwargs : dict
|
||||||
See camelot.read_pdf kwargs.
|
See camelot.read_pdf kwargs.
|
||||||
|
|
||||||
|
|
@ -134,15 +134,13 @@ class PDFHandler(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
tables = []
|
tables = []
|
||||||
geometry = []
|
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
for p in self.pages:
|
for p in self.pages:
|
||||||
self._save_page(self.filename, p, tempdir)
|
self._save_page(self.filename, p, tempdir)
|
||||||
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
|
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
|
||||||
for p in self.pages]
|
for p in self.pages]
|
||||||
parser = Stream(**kwargs) if not mesh else Lattice(**kwargs)
|
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
||||||
for p in pages:
|
for p in pages:
|
||||||
t, g = parser.extract_tables(p)
|
t = parser.extract_tables(p)
|
||||||
tables.extend(t)
|
tables.extend(t)
|
||||||
geometry.append(g)
|
return TableList(tables)
|
||||||
return TableList(tables), GeometryList(geometry)
|
|
||||||
|
|
@ -2,22 +2,22 @@ from .handlers import PDFHandler
|
||||||
from .utils import validate_input, remove_extra
|
from .utils import validate_input, remove_extra
|
||||||
|
|
||||||
|
|
||||||
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
|
||||||
"""Read PDF and return parsed data tables.
|
"""Read PDF and return parsed data tables.
|
||||||
|
|
||||||
Note: kwargs annotated with ^ can only be used with mesh=False
|
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
||||||
and kwargs annotated with * can only be used with mesh=True.
|
and kwargs annotated with * can only be used with flavor='lattice'.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filepath : str
|
filepath : str
|
||||||
Path to pdf file.
|
Path to pdf file.
|
||||||
pages : str
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers to parse.
|
Comma-separated page numbers to parse.
|
||||||
Example: 1,3,4 or 1,4-end
|
Example: 1,3,4 or 1,4-end
|
||||||
mesh : bool (default: False)
|
flavor : str (default: 'lattice')
|
||||||
Whether or not to use Lattice method of parsing. Stream
|
The parsing method to use ('lattice' or 'stream').
|
||||||
is used by default.
|
Lattice is used by default.
|
||||||
table_area : list, optional (default: None)
|
table_area : list, optional (default: None)
|
||||||
List of table areas to process as strings of the form
|
List of table areas to process as strings of the form
|
||||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||||
|
|
@ -85,8 +85,12 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
||||||
tables : camelot.core.TableList
|
tables : camelot.core.TableList
|
||||||
|
|
||||||
"""
|
"""
|
||||||
validate_input(kwargs, mesh=mesh)
|
if flavor not in ['lattice', 'stream']:
|
||||||
|
raise NotImplementedError("Unknown flavor specified."
|
||||||
|
" Use either 'lattice' or 'stream'")
|
||||||
|
|
||||||
|
validate_input(kwargs, flavor=flavor)
|
||||||
p = PDFHandler(filepath, pages)
|
p = PDFHandler(filepath, pages)
|
||||||
kwargs = remove_extra(kwargs, mesh=mesh)
|
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||||
tables, __ = p.parse(mesh=mesh, **kwargs)
|
tables = p.parse(flavor=flavor, **kwargs)
|
||||||
return tables
|
return tables
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from ..core import Geometry
|
|
||||||
from ..utils import get_page_layout, get_text_objects
|
from ..utils import get_page_layout, get_text_objects
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -17,5 +16,4 @@ class BaseParser(object):
|
||||||
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
|
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
|
||||||
self.vertical_text = get_text_objects(self.layout, ltype="lv")
|
self.vertical_text = get_text_objects(self.layout, ltype="lv")
|
||||||
self.pdf_width, self.pdf_height = self.dimensions
|
self.pdf_width, self.pdf_height = self.dimensions
|
||||||
self.rootname, __ = os.path.splitext(self.filename)
|
self.rootname, __ = os.path.splitext(self.filename)
|
||||||
self.g = Geometry()
|
|
||||||
|
|
@ -21,7 +21,7 @@ logger = setup_logging(__name__)
|
||||||
|
|
||||||
class Lattice(BaseParser):
|
class Lattice(BaseParser):
|
||||||
"""Lattice method of parsing looks for lines between text
|
"""Lattice method of parsing looks for lines between text
|
||||||
to form a table.
|
to parse table.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
@ -77,17 +77,13 @@ class Lattice(BaseParser):
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
|
|
||||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
debug : bool, optional (default: False)
|
|
||||||
Whether or not to return all text objects on the page
|
|
||||||
which can be used to generate a matplotlib plot, to get
|
|
||||||
values for table_area(s) and debugging.
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, process_background=False,
|
def __init__(self, table_area=None, process_background=False,
|
||||||
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
||||||
split_text=False, flag_size=False, line_close_tol=2,
|
split_text=False, flag_size=False, line_close_tol=2,
|
||||||
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
||||||
iterations=0, margins=(1.0, 0.5, 0.1), debug=False):
|
iterations=0, margins=(1.0, 0.5, 0.1), **kwargs):
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.process_background = process_background
|
self.process_background = process_background
|
||||||
self.line_size_scaling = line_size_scaling
|
self.line_size_scaling = line_size_scaling
|
||||||
|
|
@ -101,7 +97,6 @@ class Lattice(BaseParser):
|
||||||
self.threshold_constant = threshold_constant
|
self.threshold_constant = threshold_constant
|
||||||
self.iterations = iterations
|
self.iterations = iterations
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
self.char_margin, self.line_margin, self.word_margin = margins
|
||||||
self.debug = debug
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _reduce_index(t, idx, shift_text):
|
def _reduce_index(t, idx, shift_text):
|
||||||
|
|
@ -194,7 +189,8 @@ class Lattice(BaseParser):
|
||||||
stderr=subprocess.STDOUT)
|
stderr=subprocess.STDOUT)
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
self.image, self.threshold = adaptive_threshold(self.imagename, process_background=self.process_background,
|
self.image, self.threshold = adaptive_threshold(
|
||||||
|
self.imagename, process_background=self.process_background,
|
||||||
blocksize=self.threshold_blocksize, c=self.threshold_constant)
|
blocksize=self.threshold_blocksize, c=self.threshold_constant)
|
||||||
image_width = self.image.shape[1]
|
image_width = self.image.shape[1]
|
||||||
image_height = self.image.shape[0]
|
image_height = self.image.shape[0]
|
||||||
|
|
@ -297,11 +293,20 @@ class Lattice(BaseParser):
|
||||||
table.shape = table.df.shape
|
table.shape = table.df.shape
|
||||||
|
|
||||||
whitespace = compute_whitespace(data)
|
whitespace = compute_whitespace(data)
|
||||||
|
table.flavor = 'lattice'
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
table.whitespace = whitespace
|
table.whitespace = whitespace
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
||||||
|
|
||||||
|
# for plotting
|
||||||
|
_text = []
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
|
table._text = _text
|
||||||
|
table._image = (self.image, self.table_bbox_unscaled)
|
||||||
|
table._segments = (self.vertical_segments, self.horizontal_segments)
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename):
|
def extract_tables(self, filename):
|
||||||
|
|
@ -311,7 +316,7 @@ class Lattice(BaseParser):
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
logger.info("No tables found on {}".format(
|
logger.info("No tables found on {}".format(
|
||||||
os.path.basename(self.rootname)))
|
os.path.basename(self.rootname)))
|
||||||
return [], self.g
|
return []
|
||||||
|
|
||||||
self._generate_image()
|
self._generate_image()
|
||||||
self._generate_table_bbox()
|
self._generate_table_bbox()
|
||||||
|
|
@ -324,13 +329,4 @@ class Lattice(BaseParser):
|
||||||
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
||||||
_tables.append(table)
|
_tables.append(table)
|
||||||
|
|
||||||
if self.debug:
|
return _tables
|
||||||
text = []
|
|
||||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
|
||||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
|
||||||
self.g.text = text
|
|
||||||
self.g.images = (self.image, self.table_bbox_unscaled)
|
|
||||||
self.g.segments = (self.vertical_segments, self.horizontal_segments)
|
|
||||||
self.g.tables = _tables
|
|
||||||
|
|
||||||
return _tables, self.g
|
|
||||||
|
|
@ -16,7 +16,7 @@ logger = setup_logging(__name__)
|
||||||
|
|
||||||
class Stream(BaseParser):
|
class Stream(BaseParser):
|
||||||
"""Stream method of parsing looks for spaces between text
|
"""Stream method of parsing looks for spaces between text
|
||||||
to form a table.
|
to parse table.
|
||||||
|
|
||||||
If you want to specify columns when specifying multiple table
|
If you want to specify columns when specifying multiple table
|
||||||
areas, make sure that the length of both lists are equal.
|
areas, make sure that the length of both lists are equal.
|
||||||
|
|
@ -47,15 +47,11 @@ class Stream(BaseParser):
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
|
|
||||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
debug : bool, optional (default: False)
|
|
||||||
Whether or not to return all text objects on the page
|
|
||||||
which can be used to generate a matplotlib plot, to get
|
|
||||||
values for table_area(s), columns and debugging.
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, columns=None, split_text=False,
|
def __init__(self, table_area=None, columns=None, split_text=False,
|
||||||
flag_size=False, row_close_tol=2, col_close_tol=0,
|
flag_size=False, row_close_tol=2, col_close_tol=0,
|
||||||
margins=(1.0, 0.5, 0.1), debug=False):
|
margins=(1.0, 0.5, 0.1), **kwargs):
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self._validate_columns()
|
self._validate_columns()
|
||||||
|
|
@ -64,7 +60,6 @@ class Stream(BaseParser):
|
||||||
self.row_close_tol = row_close_tol
|
self.row_close_tol = row_close_tol
|
||||||
self.col_close_tol = col_close_tol
|
self.col_close_tol = col_close_tol
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
self.char_margin, self.line_margin, self.word_margin = margins
|
||||||
self.debug = debug
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _text_bbox(t_bbox):
|
def _text_bbox(t_bbox):
|
||||||
|
|
@ -333,11 +328,20 @@ class Stream(BaseParser):
|
||||||
table.shape = table.df.shape
|
table.shape = table.df.shape
|
||||||
|
|
||||||
whitespace = compute_whitespace(data)
|
whitespace = compute_whitespace(data)
|
||||||
|
table.flavor = 'stream'
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
table.whitespace = whitespace
|
table.whitespace = whitespace
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
||||||
|
|
||||||
|
# for plotting
|
||||||
|
_text = []
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
|
table._text = _text
|
||||||
|
table._image = None
|
||||||
|
table._segments = None
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename):
|
def extract_tables(self, filename):
|
||||||
|
|
@ -347,7 +351,7 @@ class Stream(BaseParser):
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
logger.info("No tables found on {}".format(
|
logger.info("No tables found on {}".format(
|
||||||
os.path.basename(self.rootname)))
|
os.path.basename(self.rootname)))
|
||||||
return [], self.g
|
return []
|
||||||
|
|
||||||
self._generate_table_bbox()
|
self._generate_table_bbox()
|
||||||
|
|
||||||
|
|
@ -359,11 +363,4 @@ class Stream(BaseParser):
|
||||||
table = self._generate_table(table_idx, cols, rows)
|
table = self._generate_table(table_idx, cols, rows)
|
||||||
_tables.append(table)
|
_tables.append(table)
|
||||||
|
|
||||||
if self.debug:
|
return _tables
|
||||||
text = []
|
|
||||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
|
||||||
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
|
||||||
self.g.text = text
|
|
||||||
self.g.tables = _tables
|
|
||||||
|
|
||||||
return _tables, self.g
|
|
||||||
|
|
@ -2,165 +2,107 @@ import cv2
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import matplotlib.patches as patches
|
import matplotlib.patches as patches
|
||||||
|
|
||||||
from .handlers import PDFHandler
|
|
||||||
from .utils import validate_input, remove_extra
|
|
||||||
|
|
||||||
|
def plot_text(text):
|
||||||
def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs):
|
"""Generates a plot for all text present on the PDF page.
|
||||||
"""Plot geometry found on pdf page based on type specified,
|
|
||||||
useful for debugging and playing with different parameters to get
|
|
||||||
the best output.
|
|
||||||
|
|
||||||
Note: kwargs annotated with ^ can only be used with mesh=False
|
|
||||||
and kwargs annotated with * can only be used with mesh=True.
|
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filepath : str
|
text : list
|
||||||
Path to pdf file.
|
|
||||||
pages : str
|
|
||||||
Comma-separated page numbers to parse.
|
|
||||||
Example: 1,3,4 or 1,4-end
|
|
||||||
mesh : bool (default: False)
|
|
||||||
Whether or not to use Lattice method of parsing. Stream
|
|
||||||
is used by default.
|
|
||||||
geometry_type : str, optional (default: None)
|
|
||||||
* 'text' : Plot text objects found on page. (Useful to get \
|
|
||||||
table_area and columns coordinates)
|
|
||||||
* 'table' : Plot parsed table.
|
|
||||||
* 'contour'* : Plot detected rectangles.
|
|
||||||
* 'joint'* : Plot detected line intersections.
|
|
||||||
* 'line'* : Plot detected lines.
|
|
||||||
table_area : list, optional (default: None)
|
|
||||||
List of table areas to process as strings of the form
|
|
||||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
|
||||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
|
||||||
columns^ : list, optional (default: None)
|
|
||||||
List of column x-coordinates as strings where the coordinates
|
|
||||||
are comma-separated.
|
|
||||||
split_text : bool, optional (default: False)
|
|
||||||
Whether or not to split a text line if it spans across
|
|
||||||
multiple cells.
|
|
||||||
flag_size : bool, optional (default: False)
|
|
||||||
Whether or not to highlight a substring using <s></s>
|
|
||||||
if its size is different from rest of the string. (Useful for
|
|
||||||
super and subscripts.)
|
|
||||||
row_close_tol^ : int, optional (default: 2)
|
|
||||||
Rows will be formed by combining text vertically
|
|
||||||
within this tolerance.
|
|
||||||
col_close_tol^ : int, optional (default: 0)
|
|
||||||
Columns will be formed by combining text horizontally
|
|
||||||
within this tolerance.
|
|
||||||
process_background* : bool, optional (default: False)
|
|
||||||
Whether or not to process lines that are in background.
|
|
||||||
line_size_scaling* : int, optional (default: 15)
|
|
||||||
Factor by which the page dimensions will be divided to get
|
|
||||||
smallest length of lines that should be detected.
|
|
||||||
|
|
||||||
The larger this value, smaller the detected lines. Making it
|
|
||||||
too large will lead to text being detected as lines.
|
|
||||||
copy_text* : list, optional (default: None)
|
|
||||||
{'h', 'v'}
|
|
||||||
Select one or more strings from above and pass them as a list
|
|
||||||
to specify the direction in which text should be copied over
|
|
||||||
when a cell spans multiple rows or columns.
|
|
||||||
shift_text* : list, optional (default: ['l', 't'])
|
|
||||||
{'l', 'r', 't', 'b'}
|
|
||||||
Select one or more strings from above and pass them as a list
|
|
||||||
to specify where the text in a spanning cell should flow.
|
|
||||||
line_close_tol* : int, optional (default: 2)
|
|
||||||
Tolerance parameter used to merge vertical and horizontal
|
|
||||||
detected lines which lie close to each other.
|
|
||||||
joint_close_tol* : int, optional (default: 2)
|
|
||||||
Tolerance parameter used to decide whether the detected lines
|
|
||||||
and points lie close to each other.
|
|
||||||
threshold_blocksize* : int, optional (default: 15)
|
|
||||||
Size of a pixel neighborhood that is used to calculate a
|
|
||||||
threshold value for the pixel: 3, 5, 7, and so on.
|
|
||||||
|
|
||||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
|
||||||
threshold_constant* : int, optional (default: -2)
|
|
||||||
Constant subtracted from the mean or weighted mean.
|
|
||||||
Normally, it is positive but may be zero or negative as well.
|
|
||||||
|
|
||||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
|
||||||
iterations* : int, optional (default: 0)
|
|
||||||
Number of times for erosion/dilation is applied.
|
|
||||||
|
|
||||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
|
||||||
margins : tuple
|
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
|
||||||
|
|
||||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)
|
fig = plt.figure()
|
||||||
p = PDFHandler(filepath, pages)
|
ax = fig.add_subplot(111, aspect='equal')
|
||||||
kwargs = remove_extra(kwargs, mesh=mesh)
|
xs, ys = [], []
|
||||||
debug = True if geometry_type is not None else False
|
for t in text:
|
||||||
kwargs.update({'debug': debug})
|
xs.extend([t[0], t[2]])
|
||||||
__, geometry = p.parse(mesh=mesh, **kwargs)
|
ys.extend([t[1], t[3]])
|
||||||
|
ax.add_patch(
|
||||||
|
patches.Rectangle(
|
||||||
|
(t[0], t[1]),
|
||||||
|
t[2] - t[0],
|
||||||
|
t[3] - t[1]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
|
plt.show()
|
||||||
|
|
||||||
if geometry_type == 'text':
|
|
||||||
for text in geometry.text:
|
def plot_table(table):
|
||||||
fig = plt.figure()
|
"""Generates a plot for the table.
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
|
||||||
xs, ys = [], []
|
Parameters
|
||||||
for t in text:
|
----------
|
||||||
xs.extend([t[0], t[1]])
|
table : camelot.core.Table
|
||||||
ys.extend([t[2], t[3]])
|
|
||||||
ax.add_patch(
|
"""
|
||||||
patches.Rectangle(
|
for row in table.cells:
|
||||||
(t[0], t[1]),
|
for cell in row:
|
||||||
t[2] - t[0],
|
if cell.left:
|
||||||
t[3] - t[1]
|
plt.plot([cell.lb[0], cell.lt[0]],
|
||||||
)
|
[cell.lb[1], cell.lt[1]])
|
||||||
)
|
if cell.right:
|
||||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
plt.plot([cell.rb[0], cell.rt[0]],
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
[cell.rb[1], cell.rt[1]])
|
||||||
plt.show()
|
if cell.top:
|
||||||
elif geometry_type == 'table':
|
plt.plot([cell.lt[0], cell.rt[0]],
|
||||||
for tables in geometry.tables:
|
[cell.lt[1], cell.rt[1]])
|
||||||
for table in tables:
|
if cell.bottom:
|
||||||
for row in table.cells:
|
plt.plot([cell.lb[0], cell.rb[0]],
|
||||||
for cell in row:
|
[cell.lb[1], cell.rb[1]])
|
||||||
if cell.left:
|
plt.show()
|
||||||
plt.plot([cell.lb[0], cell.lt[0]],
|
|
||||||
[cell.lb[1], cell.lt[1]])
|
|
||||||
if cell.right:
|
def plot_contour(image):
|
||||||
plt.plot([cell.rb[0], cell.rt[0]],
|
"""Generates a plot for all table boundaries present on the
|
||||||
[cell.rb[1], cell.rt[1]])
|
PDF page.
|
||||||
if cell.top:
|
|
||||||
plt.plot([cell.lt[0], cell.rt[0]],
|
Parameters
|
||||||
[cell.lt[1], cell.rt[1]])
|
----------
|
||||||
if cell.bottom:
|
image : tuple
|
||||||
plt.plot([cell.lb[0], cell.rb[0]],
|
|
||||||
[cell.lb[1], cell.rb[1]])
|
"""
|
||||||
plt.show()
|
img, table_bbox = image
|
||||||
elif geometry_type == 'contour':
|
for t in table_bbox.keys():
|
||||||
for img, table_bbox in geometry.images:
|
cv2.rectangle(img, (t[0], t[1]),
|
||||||
for t in table_bbox.keys():
|
(t[2], t[3]), (255, 0, 0), 20)
|
||||||
cv2.rectangle(img, (t[0], t[1]),
|
plt.imshow(img)
|
||||||
(t[2], t[3]), (255, 0, 0), 20)
|
plt.show()
|
||||||
plt.imshow(img)
|
|
||||||
plt.show()
|
|
||||||
elif geometry_type == 'joint':
|
def plot_joint(image):
|
||||||
for img, table_bbox in geometry.images:
|
"""Generates a plot for all line intersections present on the
|
||||||
x_coord = []
|
PDF page.
|
||||||
y_coord = []
|
|
||||||
for k in table_bbox.keys():
|
Parameters
|
||||||
for coord in table_bbox[k]:
|
----------
|
||||||
x_coord.append(coord[0])
|
image : tuple
|
||||||
y_coord.append(coord[1])
|
|
||||||
max_x, max_y = max(x_coord), max(y_coord)
|
"""
|
||||||
plt.plot(x_coord, y_coord, 'ro')
|
img, table_bbox = image
|
||||||
plt.axis([0, max_x + 100, max_y + 100, 0])
|
x_coord = []
|
||||||
plt.imshow(img)
|
y_coord = []
|
||||||
plt.show()
|
for k in table_bbox.keys():
|
||||||
elif geometry_type == 'line':
|
for coord in table_bbox[k]:
|
||||||
for v_s, h_s in geometry.segments:
|
x_coord.append(coord[0])
|
||||||
for v in v_s:
|
y_coord.append(coord[1])
|
||||||
plt.plot([v[0], v[2]], [v[1], v[3]])
|
plt.plot(x_coord, y_coord, 'ro')
|
||||||
for h in h_s:
|
plt.imshow(img)
|
||||||
plt.plot([h[0], h[2]], [h[1], h[3]])
|
plt.show()
|
||||||
plt.show()
|
|
||||||
|
|
||||||
|
def plot_line(segments):
|
||||||
|
"""Generates a plot for all line segments present on the PDF page.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
segments : tuple
|
||||||
|
|
||||||
|
"""
|
||||||
|
vertical, horizontal = segments
|
||||||
|
for v in vertical:
|
||||||
|
plt.plot([v[0], v[2]], [v[1], v[3]])
|
||||||
|
for h in horizontal:
|
||||||
|
plt.plot([h[0], h[2]], [h[1], h[3]])
|
||||||
|
plt.show()
|
||||||
|
|
@ -38,25 +38,25 @@ lattice_kwargs = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def validate_input(kwargs, mesh=False, geometry_type=False):
|
def validate_input(kwargs, flavor='lattice', geometry_type=False):
|
||||||
def check_intersection(parser_kwargs, input_kwargs, message_bool):
|
def check_intersection(parser_kwargs, input_kwargs):
|
||||||
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
||||||
if isec:
|
if isec:
|
||||||
raise ValueError("{} can not be used with mesh set to {}".format(
|
raise ValueError("{} cannot be used with flavor='{}'".format(
|
||||||
",".join(sorted(isec)), message_bool))
|
",".join(sorted(isec)), flavor))
|
||||||
|
|
||||||
if mesh:
|
if flavor == 'lattice':
|
||||||
check_intersection(stream_kwargs, kwargs, True)
|
check_intersection(stream_kwargs, kwargs)
|
||||||
else:
|
else:
|
||||||
check_intersection(lattice_kwargs, kwargs, False)
|
check_intersection(lattice_kwargs, kwargs)
|
||||||
if geometry_type:
|
if geometry_type:
|
||||||
if not mesh and geometry_type in ['contour', 'joint', 'line']:
|
if flavor != 'lattice' and geometry_type in ['contour', 'joint', 'line']:
|
||||||
raise ValueError("Use geometry_type={} with mesh set to True".format(
|
raise ValueError("Use geometry_type='{}' with flavor='lattice'".format(
|
||||||
geometry_type))
|
geometry_type))
|
||||||
|
|
||||||
|
|
||||||
def remove_extra(kwargs, mesh=False):
|
def remove_extra(kwargs, flavor='lattice'):
|
||||||
if mesh:
|
if flavor == 'lattice':
|
||||||
for key in kwargs.keys():
|
for key in kwargs.keys():
|
||||||
if key in stream_kwargs:
|
if key in stream_kwargs:
|
||||||
kwargs.pop(key)
|
kwargs.pop(key)
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,6 @@ API Reference
|
||||||
Main Interface
|
Main Interface
|
||||||
--------------
|
--------------
|
||||||
.. autofunction:: camelot.read_pdf
|
.. autofunction:: camelot.read_pdf
|
||||||
.. autofunction:: camelot.plot_geometry
|
|
||||||
|
|
||||||
Lower-Level Classes
|
Lower-Level Classes
|
||||||
-------------------
|
-------------------
|
||||||
|
|
|
||||||
|
|
@ -33,7 +33,7 @@ Release v\ |version|. (:ref:`Installation <install>`)
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> import camelot
|
>>> import camelot
|
||||||
>>> tables = camelot.read_pdf('foo.pdf', mesh=True)
|
>>> tables = camelot.read_pdf('foo.pdf')
|
||||||
>>> tables
|
>>> tables
|
||||||
<TableList tables=1>
|
<TableList tables=1>
|
||||||
>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html
|
>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@
|
||||||
Advanced Usage
|
Advanced Usage
|
||||||
==============
|
==============
|
||||||
|
|
||||||
This page covers some of the more advanced configurations for :ref:`Stream <stream>` and :ref:`Lattice <lattice>`.
|
This page covers some of the more advanced configurations for :ref:`Lattice <lattice>` and :ref:`Stream <stream>`.
|
||||||
|
|
||||||
Process background lines
|
Process background lines
|
||||||
------------------------
|
------------------------
|
||||||
|
|
@ -21,7 +21,7 @@ To process background lines, you can pass ``process_background=True``.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('background_lines.pdf', mesh=True, process_background=True)
|
>>> tables = camelot.read_pdf('background_lines.pdf', process_background=True)
|
||||||
>>> tables[1].df
|
>>> tables[1].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -30,9 +30,9 @@ To process background lines, you can pass ``process_background=True``.
|
||||||
Plot geometry
|
Plot geometry
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
You can use the :meth:`plot_geometry() <camelot.plot_geometry>` method to plot various geometries that were detected by Camelot while processing the PDF page. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters.
|
You can use a :class:`table <camelot.core.Table>` object's :meth:`plot() <camelot.core.Table.plot>` method to plot various geometries that were detected by Camelot while processing the PDF page. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters.
|
||||||
|
|
||||||
The following geometries are available for plotting. You can pass them to the :meth:`plot_geometry() <camelot.plot_geometry>` method with the ``geometry_type`` keyword argument, which will then generate a `matplotlib <https://matplotlib.org/>`_ plot.
|
The following geometries are available for plotting. You can pass them to the :meth:`plot() <camelot.core.Table.plot>` method, which will then generate a `matplotlib <https://matplotlib.org/>`_ plot for the passed geometry.
|
||||||
|
|
||||||
- 'text'
|
- 'text'
|
||||||
- 'table'
|
- 'table'
|
||||||
|
|
@ -40,22 +40,26 @@ The following geometries are available for plotting. You can pass them to the :m
|
||||||
- 'line'
|
- 'line'
|
||||||
- 'joint'
|
- 'joint'
|
||||||
|
|
||||||
.. note:: The last three geometries can only be used with :ref:`Lattice <lattice>`, i.e. when ``mesh=True``.
|
.. note:: The last three geometries can only be used with :ref:`Lattice <lattice>`, i.e. when ``flavor='lattice'``.
|
||||||
|
|
||||||
Let's generate a plot for each geometry using this `PDF <../_static/pdf/foo.pdf>`__ as an example.
|
Let's generate a plot for each geometry using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out.
|
||||||
|
|
||||||
.. warning:: By default, :meth:`plot_geometry() <camelot.plot_geometry>` will use the first page of the PDF. Since this method is useful only for debugging, it makes sense to use it for one page at a time. If you pass a page range to this method, multiple plots will be generated one by one, a new one popping up as you close the previous one. To abort, you can use ``Ctrl + C``.
|
::
|
||||||
|
|
||||||
|
>>> tables = camelot.read_pdf('foo.pdf')
|
||||||
|
>>> tables
|
||||||
|
<TableList n=1>
|
||||||
|
|
||||||
.. _geometry_text:
|
.. _geometry_text:
|
||||||
|
|
||||||
text
|
text
|
||||||
^^^^
|
^^^^
|
||||||
|
|
||||||
Passing ``geometry_type=text`` creates a plot for all the text present on a PDF page.
|
Let's plot all the text present on the table's PDF page.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> camelot.plot_geometry('foo.pdf', geometry_type='text')
|
>>> tables[0].plot('text')
|
||||||
|
|
||||||
.. figure:: ../_static/png/geometry_text.png
|
.. figure:: ../_static/png/geometry_text.png
|
||||||
:height: 674
|
:height: 674
|
||||||
|
|
@ -64,20 +68,20 @@ Passing ``geometry_type=text`` creates a plot for all the text present on a PDF
|
||||||
:alt: A plot of all text on a PDF page
|
:alt: A plot of all text on a PDF page
|
||||||
:align: left
|
:align: left
|
||||||
|
|
||||||
This, as we shall later see, is very helpful with :ref:`Stream <stream>`, for noting table areas and column separators, in case Stream cannot guess them correctly.
|
This, as we shall later see, is very helpful with :ref:`Stream <stream>`, for noting table areas and column separators, in case Stream does not guess them correctly.
|
||||||
|
|
||||||
.. note:: As you can see in the image above, the *x-y* coordinates change as you move your mouse cursor, which can help you note coordinates.
|
.. note:: The *x-y* coordinates shown above change as you move your mouse cursor on the image, which can help you note coordinates.
|
||||||
|
|
||||||
.. _geometry_table:
|
.. _geometry_table:
|
||||||
|
|
||||||
table
|
table
|
||||||
^^^^^
|
^^^^^
|
||||||
|
|
||||||
Passing ``geometry_type=table`` creates a plot for tables detected on a PDF page. This geometry, along with contour, line and joint is useful for debugging and improving the parsing output, as we shall see later.
|
Let's plot the table (to see if it was detected correctly or not). This geometry type, along with contour, line and joint, is useful for debugging and improving the parsing output, in case the table wasn't detected correctly. More on that later.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='table')
|
>>> tables[0].plot('table')
|
||||||
|
|
||||||
.. figure:: ../_static/png/geometry_table.png
|
.. figure:: ../_static/png/geometry_table.png
|
||||||
:height: 674
|
:height: 674
|
||||||
|
|
@ -86,16 +90,18 @@ Passing ``geometry_type=table`` creates a plot for tables detected on a PDF page
|
||||||
:alt: A plot of all tables on a PDF page
|
:alt: A plot of all tables on a PDF page
|
||||||
:align: left
|
:align: left
|
||||||
|
|
||||||
|
The table is perfect!
|
||||||
|
|
||||||
.. _geometry_contour:
|
.. _geometry_contour:
|
||||||
|
|
||||||
contour
|
contour
|
||||||
^^^^^^^
|
^^^^^^^
|
||||||
|
|
||||||
Passing ``geometry_type=contour`` creates a plot for table boundaries detected on a PDF page.
|
Now, let's plot all table boundaries present on the table's PDF page.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='contour')
|
>>> tables[0].plot('contour')
|
||||||
|
|
||||||
.. figure:: ../_static/png/geometry_contour.png
|
.. figure:: ../_static/png/geometry_contour.png
|
||||||
:height: 674
|
:height: 674
|
||||||
|
|
@ -109,11 +115,11 @@ Passing ``geometry_type=contour`` creates a plot for table boundaries detected o
|
||||||
line
|
line
|
||||||
^^^^
|
^^^^
|
||||||
|
|
||||||
Passing ``geometry_type=line`` creates a plot for lines detected on a PDF page.
|
Cool, let's plot all line segments present on the table's PDF page.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> camelot.plot_geometry('foo.pdf', geometry_type='line')
|
>>> tables[0].plot('line')
|
||||||
|
|
||||||
.. figure:: ../_static/png/geometry_line.png
|
.. figure:: ../_static/png/geometry_line.png
|
||||||
:height: 674
|
:height: 674
|
||||||
|
|
@ -127,11 +133,11 @@ Passing ``geometry_type=line`` creates a plot for lines detected on a PDF page.
|
||||||
joint
|
joint
|
||||||
^^^^^
|
^^^^^
|
||||||
|
|
||||||
Passing ``geometry_type=joint`` creates a plot for line intersections detected on a PDF page.
|
Finally, let's plot all line intersections present on the table's PDF page.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='joint')
|
>>> tables[0].plot('joint')
|
||||||
|
|
||||||
.. figure:: ../_static/png/geometry_joint.png
|
.. figure:: ../_static/png/geometry_joint.png
|
||||||
:height: 674
|
:height: 674
|
||||||
|
|
@ -143,7 +149,7 @@ Passing ``geometry_type=joint`` creates a plot for line intersections detected o
|
||||||
Specify table areas
|
Specify table areas
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
Since :ref:`Stream <stream>` treats the whole page as a table, `for now`_, it's useful to specify table boundaries in cases such as this `PDF <../_static/pdf/table_areas.pdf>`__. You can :ref:`plot the text <geometry_text>` on this page and note the left-top and right-bottom coordinates of the table.
|
Since :ref:`Stream <stream>` treats the whole page as a table, `for now`_, it's useful to specify table boundaries in cases such as `these <../_static/pdf/table_areas.pdf>`__. You can :ref:`plot the text <geometry_text>` on this page and note the left-top and right-bottom coordinates of the table.
|
||||||
|
|
||||||
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.
|
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.
|
||||||
|
|
||||||
|
|
@ -151,7 +157,7 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('table_areas.pdf', table_areas=['316,499,566,337'])
|
>>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337'])
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -160,19 +166,19 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se
|
||||||
Specify column separators
|
Specify column separators
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
In cases like this `PDF <../_static/pdf/column_separators.pdf>`__, where the text is very close to each other, it is possible that Camelot may guess the column separators' coordinates incorrectly. To correct this, you can explicitly specify the *x* coordinate for each column separator by :ref:`plotting the text <geometry_text>` on the page.
|
In cases like `these <../_static/pdf/column_separators.pdf>`__, where pieces of text are very close to each other, it is possible that Camelot may guess the column separators' coordinates incorrectly. To correct this, you can explicitly specify the *x* coordinate for each column separator by :ref:`plotting the text <geometry_text>` on the page.
|
||||||
|
|
||||||
You can pass the column separators as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``columns`` keyword argument.
|
You can pass the column separators as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``columns`` keyword argument.
|
||||||
|
|
||||||
In case you passed a single column separators string list, and no table area is specified, the separators will be applied to the whole page. When a list of table areas is specified and there is a need to specify column separators as well, **the length of both lists should be equal**. Each table area will be mapped to each column separators' string using their indices.
|
In case you passed a single column separators string list, and no table area is specified, the separators will be applied to the whole page. When a list of table areas is specified and there is a need to specify column separators as well, **the length of both lists should be equal**. Each table area will be mapped to each column separators' string using their indices.
|
||||||
|
|
||||||
If you have specified two table areas, ``table_areas=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table (since you can see by looking at the table that Camelot will be able to get it perfectly!), you can pass an empty string for the second table in the column separators' list, like this, ``columns=['10,120,200,400', '']``.
|
For example, if you have specified two table areas, ``table_areas=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table, you can pass an empty string for the second table in the column separators' list, like this, ``columns=['10,120,200,400', '']``.
|
||||||
|
|
||||||
Let's get back to the *x* coordinates we got from :ref:`plotting text <geometry_text>` that exists on this `PDF <../_static/pdf/column_separators.pdf>`__, and get the table out!
|
Let's get back to the *x* coordinates we got from :ref:`plotting text <geometry_text>` that exists on this `PDF <../_static/pdf/column_separators.pdf>`__, and get the table out!
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('column_separators.pdf', columns=['72,95,209,327,442,529,566,606,683'])
|
>>> tables = camelot.read_pdf('column_separators.pdf', flavor='stream', columns=['72,95,209,327,442,529,566,606,683'])
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -182,7 +188,7 @@ Let's get back to the *x* coordinates we got from :ref:`plotting text <geometry_
|
||||||
"NUMBER TYPE DBA NAME","","","LICENSEE NAME","ADDRESS","CITY","ST","ZIP","PHONE NUMBER","EXPIRES"
|
"NUMBER TYPE DBA NAME","","","LICENSEE NAME","ADDRESS","CITY","ST","ZIP","PHONE NUMBER","EXPIRES"
|
||||||
"...","...","...","...","...","...","...","...","...","..."
|
"...","...","...","...","...","...","...","...","...","..."
|
||||||
|
|
||||||
Ah! Since `PDFMiner <https://euske.github.io/pdfminer/>`_ merged the strings, "NUMBER", "TYPE" and "DBA NAME", all of them were assigned to the same cell. Let's see how we can fix this in the next section.
|
Ah! Since `PDFMiner <https://euske.github.io/pdfminer/>`_ merged the strings "NUMBER", "TYPE" and "DBA NAME", all of them were assigned to the same cell. Let's see how we can fix this in the next section.
|
||||||
|
|
||||||
Split text along separators
|
Split text along separators
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
@ -191,7 +197,7 @@ To deal with cases like the output from the previous section, you can pass ``spl
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('column_separators.pdf', columns=['72,95,209,327,442,529,566,606,683'], split_text=True)
|
>>> tables = camelot.read_pdf('column_separators.pdf', flavor='stream', columns=['72,95,209,327,442,529,566,606,683'], split_text=True)
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -204,13 +210,13 @@ To deal with cases like the output from the previous section, you can pass ``spl
|
||||||
Flag superscripts and subscripts
|
Flag superscripts and subscripts
|
||||||
--------------------------------
|
--------------------------------
|
||||||
|
|
||||||
There might be cases where you want to differentiate between the text and superscripts and subscripts, like this `PDF <../_static/pdf/superscript.pdf>`_.
|
There might be cases where you want to differentiate between the text, and superscripts or subscripts, like this `PDF <../_static/pdf/superscript.pdf>`_.
|
||||||
|
|
||||||
.. figure:: ../_static/png/superscript.png
|
.. figure:: ../_static/png/superscript.png
|
||||||
:alt: A PDF with superscripts
|
:alt: A PDF with superscripts
|
||||||
:align: left
|
:align: left
|
||||||
|
|
||||||
In this case, the text that `other tools`_ return, will be ``24.912``. This is harmless as long as there is that decimal point involved. When it isn't there, you'll be left wondering why the results of your data analysis were 10x bigger!
|
In this case, the text that `other tools`_ return, will be ``24.912``. This is harmless as long as there is that decimal point involved. But when it isn't there, you'll be left wondering why the results of your data analysis were 10x bigger!
|
||||||
|
|
||||||
You can solve this by passing ``flag_size=True``, which will enclose the superscripts and subscripts with ``<s></s>``, based on font size, as shown below.
|
You can solve this by passing ``flag_size=True``, which will enclose the superscripts and subscripts with ``<s></s>``, based on font size, as shown below.
|
||||||
|
|
||||||
|
|
@ -218,7 +224,7 @@ You can solve this by passing ``flag_size=True``, which will enclose the supersc
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('superscript.pdf', flag_size=True)
|
>>> tables = camelot.read_pdf('superscript.pdf', flavor='stream', flag_size=True)
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -236,7 +242,7 @@ You can pass ``row_close_tol=<+int>`` to group the rows closer together, as show
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('group_rows.pdf')
|
>>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream')
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -250,7 +256,7 @@ You can pass ``row_close_tol=<+int>`` to group the rows closer together, as show
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('group_rows.pdf', row_close_tol=10)
|
>>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_close_tol=10)
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -266,11 +272,11 @@ Detect short lines
|
||||||
|
|
||||||
There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15.
|
There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions by a scaling factor called ``line_size_scaling``. By default, its value is 15.
|
||||||
|
|
||||||
As you can already guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected.
|
As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected.
|
||||||
|
|
||||||
.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines.
|
.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines.
|
||||||
|
|
||||||
Here's one `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the the headers don't get detected with the default value of 15.
|
Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the headers don't get detected with the default value of 15.
|
||||||
|
|
||||||
.. figure:: ../_static/png/short_lines.png
|
.. figure:: ../_static/png/short_lines.png
|
||||||
:alt: A PDF table with short lines
|
:alt: A PDF table with short lines
|
||||||
|
|
@ -280,7 +286,8 @@ Let's :ref:`plot the table <geometry_table>` for this PDF.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> camelot.plot_geometry('short_lines.pdf', mesh=True, geometry_type='table')
|
>>> tables = camelot.read_pdf('short_lines.pdf')
|
||||||
|
>>> tables[0].plot('table')
|
||||||
|
|
||||||
.. figure:: ../_static/png/short_lines_1.png
|
.. figure:: ../_static/png/short_lines_1.png
|
||||||
:alt: A plot of the PDF table with short lines
|
:alt: A plot of the PDF table with short lines
|
||||||
|
|
@ -290,17 +297,17 @@ Clearly, the smaller lines separating the headers, couldn't be detected. Let's t
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> camelot.plot_geometry('short_lines.pdf', mesh=True, geometry_type='table', line_size_scaling=40)
|
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40)
|
||||||
|
>>> tables[0].plot('table')
|
||||||
|
|
||||||
.. figure:: ../_static/png/short_lines_2.png
|
.. figure:: ../_static/png/short_lines_2.png
|
||||||
:alt: An improved plot of the PDF table with short lines
|
:alt: An improved plot of the PDF table with short lines
|
||||||
:align: left
|
:align: left
|
||||||
|
|
||||||
Voila! Camelot can now see those lines. Let's use this value in :meth:`read_pdf() <camelot.read_pdf>` and get our table.
|
Voila! Camelot can now see those lines. Let's get our table.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('short_lines.pdf', mesh=True, line_size_scaling=40)
|
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -332,7 +339,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('short_lines.pdf', mesh=True, line_size_scaling=40, shift_text=[''])
|
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=[''])
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -353,7 +360,7 @@ No surprises there, it did remain in place (observe the strings "2400" and "All
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('short_lines.pdf', mesh=True, line_size_scaling=40, shift_text=['r', 'b'])
|
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b'])
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -381,7 +388,7 @@ Let's try it out on this `PDF <../_static/pdf/copy_text.pdf>`__. First, let's ch
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('copy_text.pdf', mesh=True)
|
>>> tables = camelot.read_pdf('copy_text.pdf')
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -398,7 +405,7 @@ We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('copy_text.pdf', mesh=True, copy_text=['v'])
|
>>> tables = camelot.read_pdf('copy_text.pdf', copy_text=['v'])
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
|
||||||
|
|
@ -5,25 +5,21 @@ Command-line interface
|
||||||
|
|
||||||
Camelot comes with a command-line interface.
|
Camelot comes with a command-line interface.
|
||||||
|
|
||||||
You can print the help for the interface, by typing ``camelot --help`` in your favorite terminal program, as shown below.
|
You can print the help for the interface by typing ``camelot --help`` in your favorite terminal program, as shown below. Furthermore, you can print the help for each command by typing ``camelot <command> --help``. Try it out!
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
$ camelot --help
|
$ camelot --help
|
||||||
Usage: camelot [OPTIONS] FILEPATH
|
Usage: camelot [OPTIONS] COMMAND [ARGS]...
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
|
--version Show the version and exit.
|
||||||
-p, --pages TEXT Comma-separated page numbers to parse.
|
-p, --pages TEXT Comma-separated page numbers to parse.
|
||||||
Example: 1,3,4 or 1,4-end
|
Example: 1,3,4 or 1,4-end
|
||||||
-o, --output TEXT Output filepath.
|
-o, --output TEXT Output filepath.
|
||||||
-f, --format [csv|json|excel|html]
|
-f, --format [csv|json|excel|html]
|
||||||
Output file format.
|
Output file format.
|
||||||
-z, --zip Whether or not to create a ZIP archive.
|
-z, --zip Whether or not to create a ZIP archive.
|
||||||
-m, --mesh Whether or not to use Lattice method of
|
|
||||||
parsing. Stream is used by default.
|
|
||||||
-T, --table_area TEXT Table areas (x1,y1,x2,y2) to process.
|
|
||||||
x1, y1
|
|
||||||
-> left-top and x2, y2 -> right-bottom
|
|
||||||
-split, --split_text Whether or not to split text if it spans
|
-split, --split_text Whether or not to split text if it spans
|
||||||
across multiple cells.
|
across multiple cells.
|
||||||
-flag, --flag_size (inactive) Whether or not to flag text which
|
-flag, --flag_size (inactive) Whether or not to flag text which
|
||||||
|
|
@ -32,47 +28,8 @@ You can print the help for the interface, by typing ``camelot --help`` in your f
|
||||||
-M, --margins <FLOAT FLOAT FLOAT>...
|
-M, --margins <FLOAT FLOAT FLOAT>...
|
||||||
char_margin, line_margin, word_margin for
|
char_margin, line_margin, word_margin for
|
||||||
PDFMiner.
|
PDFMiner.
|
||||||
-C, --columns TEXT x-coordinates of column separators.
|
--help Show this message and exit.
|
||||||
-r, --row_close_tol INTEGER Rows will be formed by combining text
|
|
||||||
vertically within this tolerance.
|
Commands:
|
||||||
-c, --col_close_tol INTEGER Columns will be formed by combining text
|
lattice Use lines between text to parse table.
|
||||||
horizontally within this tolerance.
|
stream Use spaces between text to parse table.
|
||||||
-back, --process_background (with --mesh) Whether or not to process
|
|
||||||
lines that are in background.
|
|
||||||
-scale, --line_size_scaling INTEGER
|
|
||||||
(with --mesh) Factor by which the page
|
|
||||||
dimensions will be divided to get smallest
|
|
||||||
length of detected lines.
|
|
||||||
-copy, --copy_text [h|v] (with --mesh) Specify direction in which
|
|
||||||
text will be copied over in a spanning cell.
|
|
||||||
-shift, --shift_text [|l|r|t|b] (with --mesh) Specify direction in which
|
|
||||||
text in a spanning cell should flow.
|
|
||||||
-l, --line_close_tol INTEGER (with --mesh) Tolerance parameter used to
|
|
||||||
merge close vertical lines and close
|
|
||||||
horizontal lines.
|
|
||||||
-j, --joint_close_tol INTEGER (with --mesh) Tolerance parameter used to
|
|
||||||
decide whether the detected lines and points
|
|
||||||
lie close to each other.
|
|
||||||
-block, --threshold_blocksize INTEGER
|
|
||||||
(with --mesh) For adaptive thresholding,
|
|
||||||
size of a pixel neighborhood that is used to
|
|
||||||
calculate a threshold value for the pixel:
|
|
||||||
3, 5, 7, and so on.
|
|
||||||
-const, --threshold_constant INTEGER
|
|
||||||
(with --mesh) For adaptive thresholding,
|
|
||||||
constant subtracted from the mean or
|
|
||||||
weighted mean.
|
|
||||||
Normally, it is positive but
|
|
||||||
may be zero or negative as well.
|
|
||||||
-I, --iterations INTEGER (with --mesh) Number of times for
|
|
||||||
erosion/dilation is applied.
|
|
||||||
-G, --geometry_type [text|table|contour|joint|line]
|
|
||||||
Plot geometry found on pdf page for
|
|
||||||
debugging.
|
|
||||||
text: Plot text objects. (Useful to get
|
|
||||||
table_area and columns coordinates)
|
|
||||||
table: Plot parsed table.
|
|
||||||
contour (with --mesh): Plot detected rectangles.
|
|
||||||
joint (with --mesh): Plot detected line intersections.
|
|
||||||
line (with --mesh): Plot detected lines.
|
|
||||||
--help Show this message and exit.
|
|
||||||
|
|
@ -20,7 +20,7 @@ It is built on top of PDFMiner's functionality of grouping characters on a page
|
||||||
|
|
||||||
.. _margins: https://euske.github.io/pdfminer/#tools
|
.. _margins: https://euske.github.io/pdfminer/#tools
|
||||||
|
|
||||||
.. note:: By default, Stream treats the whole PDF page as a table. Automatic table detection for Stream is `in the works`_.
|
.. note:: By default, Stream treats the whole PDF page as a table, which isn't ideal when there are more than two tables on a page with different number of columns. Automatic table detection for Stream is `in the works`_.
|
||||||
|
|
||||||
.. _in the works: https://github.com/socialcopsdev/camelot/issues/102
|
.. _in the works: https://github.com/socialcopsdev/camelot/issues/102
|
||||||
|
|
||||||
|
|
@ -29,13 +29,13 @@ It is built on top of PDFMiner's functionality of grouping characters on a page
|
||||||
Lattice
|
Lattice
|
||||||
-------
|
-------
|
||||||
|
|
||||||
Lattice is more deterministic in nature, and does not rely on guesses. It can be used to parse tables that have demarcated lines between cells.
|
Lattice is more deterministic in nature, and does not rely on guesses. It can be used to parse tables that have demarcated lines between cells, and can automatically parse multiple tables present on a page.
|
||||||
|
|
||||||
It starts by converting the PDF page to an image using ghostscript and then processing it to get horizontal and vertical line segments by applying a set of morphological transformations (erosion and dilation) using OpenCV.
|
It starts by converting the PDF page to an image using ghostscript and then processing it to get horizontal and vertical line segments by applying a set of morphological transformations (erosion and dilation) using OpenCV.
|
||||||
|
|
||||||
Let's see how Lattice processes the `second page of this PDF`_, step-by-step.
|
Let's see how Lattice processes the second page of `this PDF`_, step-by-step.
|
||||||
|
|
||||||
.. _second page of this PDF: ../_static/pdf/us-030.pdf
|
.. _this PDF: ../_static/pdf/us-030.pdf
|
||||||
|
|
||||||
1. Line segments are detected.
|
1. Line segments are detected.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,16 +8,20 @@ This part of the documentation covers the installation of Camelot. First, you'll
|
||||||
.. _tk: https://packages.ubuntu.com/trusty/python-tk
|
.. _tk: https://packages.ubuntu.com/trusty/python-tk
|
||||||
.. _ghostscript: https://www.ghostscript.com/
|
.. _ghostscript: https://www.ghostscript.com/
|
||||||
|
|
||||||
These can be installed using your system's package manager. If you use Ubuntu, run the following:
|
These can be installed using your system's package manager. You can run the following based on your OS.
|
||||||
::
|
|
||||||
|
|
||||||
$ sudo apt install python-tk ghostscript
|
For Ubuntu::
|
||||||
|
|
||||||
|
$ apt install python-tk ghostscript
|
||||||
|
|
||||||
|
For macOS::
|
||||||
|
|
||||||
|
$ brew install tcl-tk ghostscript
|
||||||
|
|
||||||
$ pip install camelot-py
|
$ pip install camelot-py
|
||||||
------------------------
|
------------------------
|
||||||
|
|
||||||
After installing the dependencies, you can simply use pip to install Camelot:
|
After installing the dependencies, you can simply use pip to install Camelot::
|
||||||
::
|
|
||||||
|
|
||||||
$ pip install camelot-py
|
$ pip install camelot-py
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,9 +6,9 @@ Introduction
|
||||||
The Camelot Project
|
The Camelot Project
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
The Portable Document Format (PDF) was born out of `The Camelot Project`_ when a need was felt for "a universal to communicate documents across a wide variety of machine configurations, operating systems and communication networks". The goal was to make these documents viewable on any display and printable on any modern printers. The invention of the `PostScript`_ page description language, which enabled the creation of fixed-layout flat documents (with text, fonts, graphics, images encapsulated), solved the problem.
|
The Portable Document Format (PDF) was born out of `The Camelot Project`_ when a need was felt for "a universal way to communicate documents across a wide variety of machine configurations, operating systems and communication networks". The goal was to make these documents viewable on any display and printable on any modern printer. The invention of the `PostScript`_ page description language, which enabled the creation of *fixed-layout* flat documents (with text, fonts, graphics, images encapsulated), solved the problem.
|
||||||
|
|
||||||
At a very high level, PostScript defines instructions, such as, "place this character at this x,y coordinate on a plane". Spaces can be *simulated* by placing characters relatively far apart. Similarly, tables can be *simulated* by placing characters (and words) in two-dimensional grids. A PDF viewer just takes these instructions and draws everything for the user to view. Since it's just characters on a plane, there is no table data structure which can be directly extracted and used for analysis!
|
At a very high level, PostScript defines instructions, such as, "place this character at this x,y coordinate on a plane". Spaces can be *simulated* by placing characters relatively far apart. Extending from that, tables can be *simulated* by placing characters (which constitute words) in two-dimensional grids. A PDF viewer just takes these instructions and draws everything for the user to view. Since it's just characters on a plane, there is no table data structure which can be extracted and used for analysis!
|
||||||
|
|
||||||
Sadly, a lot of open data is given out as tables which are trapped inside PDF files.
|
Sadly, a lot of open data is given out as tables which are trapped inside PDF files.
|
||||||
|
|
||||||
|
|
@ -17,13 +17,14 @@ Sadly, a lot of open data is given out as tables which are trapped inside PDF fi
|
||||||
Why another PDF Table Parsing library?
|
Why another PDF Table Parsing library?
|
||||||
--------------------------------------
|
--------------------------------------
|
||||||
|
|
||||||
There are both open (`Tabula`_) and closed-source (`PDFTables`_, `smallpdf`_) tools that are used widely to extract tables from PDF files. They either give a nice output, or fail miserably. There is no in-between. This does not help most users, since everything in the real world, including PDF table extraction, is fuzzy. Which leads to creation of adhoc table extraction scripts for each different type of PDF that the user wants to parse.
|
There are both open (`Tabula`_, `pdf-table-extract`_) and closed-source (`smallpdf`_, `PDFTables`_) tools that are widely used to extract tables from PDF files. They either give a nice output, or fail miserably. There is no in-between. This is not helpful, since everything in the real world, including PDF table extraction, is fuzzy, leading to the creation of ad hoc table extraction scripts for each different type of PDF that the user wants to parse.
|
||||||
|
|
||||||
Camelot was created with the goal of offering its users complete control over table extraction. If the users are not able to get the desired output with the default configuration, they should be able to tweak the parameters and get the tables out!
|
Camelot was created with the goal of offering its users complete control over table extraction. If the users are not able to get the desired output with the default configuration, they should be able to tweak it and get the job done!
|
||||||
|
|
||||||
Here is a `comparison`_ of Camelot's output with outputs from other PDF parsing libraries and tools.
|
Here is a `comparison`_ of Camelot's output with outputs from other open-source PDF parsing libraries and tools.
|
||||||
|
|
||||||
.. _Tabula: http://tabula.technology/
|
.. _Tabula: http://tabula.technology/
|
||||||
|
.. _pdf-table-extract: https://github.com/ashima/pdf-table-extract
|
||||||
.. _PDFTables: https://pdftables.com/
|
.. _PDFTables: https://pdftables.com/
|
||||||
.. _Smallpdf: https://smallpdf.com
|
.. _Smallpdf: https://smallpdf.com
|
||||||
.. _comparison: https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools
|
.. _comparison: https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools
|
||||||
|
|
@ -31,7 +32,7 @@ Here is a `comparison`_ of Camelot's output with outputs from other PDF parsing
|
||||||
What's in a name?
|
What's in a name?
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
As you can already guess, this library is named after `The Camelot Project`_. The image on the left is taken from `Monty Python and the Holy Grail`_. In the movie, it is the castle "Camelot" where Arthur leads his men, the Knights of the Round Table, and then sets off elsewhere after deciding that it is "a silly place". Interestingly, the language in which this library is written was named after Monty Python.
|
As you can already guess, this library is named after `The Camelot Project`_. Fun fact, "Camelot" is the name of the castle in `Monty Python and the Holy Grail`_, where Arthur leads his men, the Knights of the Round Table, and then sets off elsewhere after deciding that it is "a silly place". Interestingly, the language in which this library is written (Python) was named after Monty Python.
|
||||||
|
|
||||||
.. _The Camelot Project: http://www.planetpdf.com/planetpdf/pdfs/warnock_camelot.pdf
|
.. _The Camelot Project: http://www.planetpdf.com/planetpdf/pdfs/warnock_camelot.pdf
|
||||||
.. _Monty Python and the Holy Grail: https://en.wikipedia.org/wiki/Monty_Python_and_the_Holy_Grail
|
.. _Monty Python and the Holy Grail: https://en.wikipedia.org/wiki/Monty_Python_and_the_Holy_Grail
|
||||||
|
|
|
||||||
|
|
@ -16,13 +16,13 @@ Begin by importing the Camelot module::
|
||||||
|
|
||||||
Now, let's try to read a PDF. You can check out the PDF used in this example, `here`_. Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice <lattice>` method here. To do that we will set the ``mesh`` keyword argument to ``True``.
|
Now, let's try to read a PDF. You can check out the PDF used in this example, `here`_. Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice <lattice>` method here. Lattice is the default, so no extra keyword argument is needed.
|
||||||
|
|
||||||
.. note:: :ref:`Stream <stream>` is used by default.
|
.. note:: :ref:`Lattice <lattice>` is used by default. You can use :ref:`Stream <stream>` with ``flavor='stream'``.
|
||||||
|
|
||||||
.. _here: ../_static/pdf/foo.pdf
|
.. _here: ../_static/pdf/foo.pdf
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('foo.pdf', mesh=True)
|
>>> tables = camelot.read_pdf('foo.pdf')
|
||||||
>>> tables
|
>>> tables
|
||||||
<TableList n=1>
|
<TableList n=1>
|
||||||
|
|
||||||
|
|
@ -47,7 +47,7 @@ Let's print the parsing report.
|
||||||
'page': 1
|
'page': 1
|
||||||
}
|
}
|
||||||
|
|
||||||
Woah! The accuracy is top-notch and whitespace is less, that means the table was parsed correctly (most probably). You can access the table as a pandas DataFrame by using the :class:`table <camelot.core.Table> object's` ``df`` property.
|
Woah! The accuracy is top-notch and there is little whitespace, which means the table was most probably parsed correctly. You can access the table as a pandas DataFrame by using the :class:`table <camelot.core.Table>` object's ``df`` property.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
|
|
@ -64,7 +64,7 @@ Looks good! You can be export the table as a CSV file using its :meth:`to_csv()
|
||||||
|
|
||||||
This will export the table as a CSV file at the path specified. In this case, it is ``foo.csv`` in the current directory.
|
This will export the table as a CSV file at the path specified. In this case, it is ``foo.csv`` in the current directory.
|
||||||
|
|
||||||
You can also export all tables at once, using the ``tables`` object's :meth:`export() <camelot.core.TableList.export>` method.
|
You can also export all tables at once, using the :class:`tables <camelot.core.TableList>` object's :meth:`export() <camelot.core.TableList.export>` method.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
|
|
@ -72,11 +72,11 @@ You can also export all tables at once, using the ``tables`` object's :meth:`exp
|
||||||
|
|
||||||
This will export all tables as CSV files at the path specified. Alternatively, you can use ``f='json'``, ``f='excel'`` or ``f='html'``.
|
This will export all tables as CSV files at the path specified. Alternatively, you can use ``f='json'``, ``f='excel'`` or ``f='html'``.
|
||||||
|
|
||||||
.. note:: The :meth:`export() <camelot.core.TableList.export>` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple files will be created. To avoid filling up your path with multiple files, you can use ``compress=True``, which will create a single ZIP archive at your path with all the exported files.
|
.. note:: The :meth:`export() <camelot.core.TableList.export>` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple CSV files will be created. To avoid filling up your path with multiple files, you can use ``compress=True``, which will create a single ZIP file at your path with all the CSV files.
|
||||||
|
|
||||||
.. note:: Camelot handles rotated PDF pages automatically. As an exercise, try to extract the table out of `this PDF file`_.
|
.. note:: Camelot handles rotated PDF pages automatically. As an exercise, try to extract the table out of `this PDF`_.
|
||||||
|
|
||||||
.. _this PDF file: ../_static/pdf/rotated.pdf
|
.. _this PDF: ../_static/pdf/rotated.pdf
|
||||||
|
|
||||||
Specify page numbers
|
Specify page numbers
|
||||||
--------------------
|
--------------------
|
||||||
|
|
|
||||||
2
setup.py
2
setup.py
|
|
@ -9,7 +9,7 @@ with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f:
|
||||||
exec(f.read(), about)
|
exec(f.read(), about)
|
||||||
|
|
||||||
# TODO: Move these to __version__.py
|
# TODO: Move these to __version__.py
|
||||||
NAME = 'camelot'
|
NAME = 'camelot-py'
|
||||||
VERSION = about['__version__']
|
VERSION = about['__version__']
|
||||||
DESCRIPTION = 'PDF Table Parsing for Humans'
|
DESCRIPTION = 'PDF Table Parsing for Humans'
|
||||||
with open('README.md') as f:
|
with open('README.md') as f:
|
||||||
|
|
|
||||||
|
|
@ -18,11 +18,11 @@ def test_stream_table_rotated():
|
||||||
df = pd.DataFrame(data_stream_table_rotated)
|
df = pd.DataFrame(data_stream_table_rotated)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "clockwise_table_2.pdf")
|
filename = os.path.join(testdir, "clockwise_table_2.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -30,7 +30,7 @@ def test_stream_table_area():
|
||||||
df = pd.DataFrame(data_stream_table_area_single)
|
df = pd.DataFrame(data_stream_table_area_single)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||||
tables = camelot.read_pdf(filename, table_area=["320,500,573,335"])
|
tables = camelot.read_pdf(filename, flavor="stream", table_area=["320,500,573,335"])
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -39,7 +39,7 @@ def test_stream_columns():
|
||||||
|
|
||||||
filename = os.path.join(testdir, "mexican_towns.pdf")
|
filename = os.path.join(testdir, "mexican_towns.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, columns=["67,180,230,425,475"], row_close_tol=10)
|
filename, flavor="stream", columns=["67,180,230,425,475"], row_close_tol=10)
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -48,7 +48,7 @@ def test_lattice():
|
||||||
|
|
||||||
filename = os.path.join(testdir,
|
filename = os.path.join(testdir,
|
||||||
"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf")
|
"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf")
|
||||||
tables = camelot.read_pdf(filename, pages="2", mesh=True)
|
tables = camelot.read_pdf(filename, pages="2")
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -56,11 +56,11 @@ def test_lattice_table_rotated():
|
||||||
df = pd.DataFrame(data_lattice_table_rotated)
|
df = pd.DataFrame(data_lattice_table_rotated)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "clockwise_table_1.pdf")
|
filename = os.path.join(testdir, "clockwise_table_1.pdf")
|
||||||
tables = camelot.read_pdf(filename, mesh=True)
|
tables = camelot.read_pdf(filename)
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
|
filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
|
||||||
tables = camelot.read_pdf(filename, mesh=True)
|
tables = camelot.read_pdf(filename)
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -68,7 +68,7 @@ def test_lattice_process_background():
|
||||||
df = pd.DataFrame(data_lattice_process_background)
|
df = pd.DataFrame(data_lattice_process_background)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "background_lines_1.pdf")
|
filename = os.path.join(testdir, "background_lines_1.pdf")
|
||||||
tables = camelot.read_pdf(filename, mesh=True, process_background=True)
|
tables = camelot.read_pdf(filename, process_background=True)
|
||||||
assert df.equals(tables[1].df)
|
assert df.equals(tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -76,5 +76,5 @@ def test_lattice_copy_text():
|
||||||
df = pd.DataFrame(data_lattice_copy_text)
|
df = pd.DataFrame(data_lattice_copy_text)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "row_span_1.pdf")
|
filename = os.path.join(testdir, "row_span_1.pdf")
|
||||||
tables = camelot.read_pdf(filename, mesh=True, line_size_scaling=60, copy_text="v")
|
tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
Loading…
Reference in New Issue