Add flavors

pull/2/head
Vinayak Mehta 2018-09-23 10:53:32 +05:30
parent 4a30c5a514
commit 3170a9689f
11 changed files with 207 additions and 305 deletions

View File

@ -1,4 +1,3 @@
from .__version__ import __version__ from .__version__ import __version__
from .io import read_pdf from .io import read_pdf
from .plotting import plot_geometry

View File

@ -5,18 +5,9 @@ import click
from . import __version__ from . import __version__
from .io import read_pdf from .io import read_pdf
from .plotting import plot_geometry
from .utils import validate_input, remove_extra from .utils import validate_input, remove_extra
class Mutex(click.Option):
    """Click option that validates the parsed options before handling.

    The ``mesh`` and ``geometry_type`` values found among the parsed
    options (``False`` when absent) are handed to ``validate_input``
    together with the options themselves.
    """

    def handle_parse_result(self, ctx, opts, args):
        """Run ``validate_input`` on ``opts``, then defer to click."""
        validate_input(
            opts,
            mesh=opts.get('mesh', False),
            geometry_type=opts.get('geometry_type', False))
        base = super(Mutex, self)
        return base.handle_parse_result(ctx, opts, args)
@click.command() @click.command()
@click.version_option(version=__version__) @click.version_option(version=__version__)
@click.option("-p", "--pages", default="1", help="Comma-separated page numbers" @click.option("-p", "--pages", default="1", help="Comma-separated page numbers"
@ -27,8 +18,6 @@ class Mutex(click.Option):
help="Output file format.") help="Output file format.")
@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP" @click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP"
" archive.") " archive.")
@click.option("-m", "--mesh", is_flag=True, help="Whether or not to"
" use Lattice method of parsing. Stream is used by default.")
@click.option("-T", "--table_area", default=[], multiple=True, @click.option("-T", "--table_area", default=[], multiple=True,
help="Table areas (x1,y1,x2,y2) to process.\n" help="Table areas (x1,y1,x2,y2) to process.\n"
" x1, y1 -> left-top and x2, y2 -> right-bottom") " x1, y1 -> left-top and x2, y2 -> right-bottom")
@ -39,12 +28,44 @@ class Mutex(click.Option):
" super/subscripts)") " super/subscripts)")
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1), @click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
help="char_margin, line_margin, word_margin for PDFMiner.") help="char_margin, line_margin, word_margin for PDFMiner.")
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex, @click.option("-G", "--geometry_type",
help="x-coordinates of column separators.") type=click.Choice(["text", "table", "contour", "joint", "line"]),
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be" help="Plot geometry found on pdf page for debugging.\n\n"
" formed by combining text vertically within this tolerance.") "text: Plot text objects. (Useful to get table_area and"
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will" " columns coordinates)\ntable: Plot parsed table.\n"
" be formed by combining text horizontally within this tolerance.") "contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line"
" intersections.\nline (with --mesh): Plot detected lines.")
@click.argument("filepath", type=click.Path(exists=True))
def cli(*args, **kwargs):
    """Read tables from a PDF, echo them and export them.

    Parameters
    ----------
    kwargs : dict
        Option values keyed by option name. The keys popped or indexed
        below must all be present, otherwise a KeyError is raised.

    Raises
    ------
    click.UsageError
        If no output filepath or no output format was given. Both
        checks run only after the tables have been read and echoed.
    """
    # NOTE(review): 'mesh', 'columns', 'copy_text' and 'shift_text' are
    # expected in kwargs -- confirm the matching options are still
    # declared on this command.
    pages = kwargs.pop("pages")
    destination = kwargs.pop("output")
    fmt = kwargs.pop("format")
    make_zip = kwargs.pop("zip")
    mesh = kwargs.pop("mesh")
    # Popped only so that it is not forwarded to read_pdf.
    kwargs.pop("geometry_type")
    filepath = kwargs.pop("filepath")

    # Multi-value options become lists; an empty one is passed as None.
    for option in ('table_area', 'columns', 'copy_text'):
        values = list(kwargs[option])
        kwargs[option] = values or None
    kwargs['shift_text'] = list(kwargs['shift_text'])

    kwargs = remove_extra(kwargs, mesh=mesh)
    tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs)
    click.echo(tables)

    if destination is None:
        raise click.UsageError("Please specify an output filepath using --output")
    if fmt is None:
        raise click.UsageError("Please specify an output format using --format")
    tables.export(destination, f=fmt, compress=make_zip)
@click.option("-T", "--table_area", default=[], multiple=True,
help="Table areas (x1,y1,x2,y2) to process.\n"
" x1, y1 -> left-top and x2, y2 -> right-bottom")
@click.option("-back", "--process_background", is_flag=True, cls=Mutex, @click.option("-back", "--process_background", is_flag=True, cls=Mutex,
help="(with --mesh) Whether or not to process lines that are in" help="(with --mesh) Whether or not to process lines that are in"
" background.") " background.")
@ -75,40 +96,18 @@ class Mutex(click.Option):
@click.option("-I", "--iterations", default=0, cls=Mutex, @click.option("-I", "--iterations", default=0, cls=Mutex,
help="(with --mesh) Number of times for erosion/dilation is" help="(with --mesh) Number of times for erosion/dilation is"
" applied.") " applied.")
@click.option("-G", "--geometry_type", def lattice(*args, **kwargs):
type=click.Choice(["text", "table", "contour", "joint", "line"]), pass
help="Plot geometry found on pdf page for debugging.\n\n"
"text: Plot text objects. (Useful to get table_area and"
" columns coordinates)\ntable: Plot parsed table.\n"
"contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line"
" intersections.\nline (with --mesh): Plot detected lines.")
@click.argument("filepath", type=click.Path(exists=True))
def cli(*args, **kwargs):
pages = kwargs.pop("pages")
output = kwargs.pop("output")
f = kwargs.pop("format")
compress = kwargs.pop("zip")
mesh = kwargs.pop("mesh")
geometry_type = kwargs.pop("geometry_type")
filepath = kwargs.pop("filepath")
table_area = list(kwargs['table_area'])
kwargs['table_area'] = None if not table_area else table_area
columns = list(kwargs['columns'])
kwargs['columns'] = None if not columns else columns
copy_text = list(kwargs['copy_text'])
kwargs['copy_text'] = None if not copy_text else copy_text
kwargs['shift_text'] = list(kwargs['shift_text'])
kwargs = remove_extra(kwargs, mesh=mesh) @click.option("-T", "--table_area", default=[], multiple=True,
if geometry_type is None: help="Table areas (x1,y1,x2,y2) to process.\n"
tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs) " x1, y1 -> left-top and x2, y2 -> right-bottom")
click.echo(tables) @click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
if output is None: help="x-coordinates of column separators.")
raise click.UsageError("Please specify an output filepath using --output") @click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be"
if f is None: " formed by combining text vertically within this tolerance.")
raise click.UsageError("Please specify an output format using --format") @click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will"
tables.export(output, f=f, compress=compress) " be formed by combining text horizontally within this tolerance.")
else: def stream(*args, **kwargs):
plot_geometry(filepath, pages=pages, mesh=mesh, pass
geometry_type=geometry_type, **kwargs)

View File

@ -6,6 +6,8 @@ import tempfile
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from .plotting import *
class Cell(object): class Cell(object):
"""Defines a cell in a table with coordinates relative to a """Defines a cell in a table with coordinates relative to a
@ -318,6 +320,32 @@ class Table(object):
cell.hspan = True cell.hspan = True
return self return self
def plot(self, geometry_type):
    """Plot geometry found on PDF page based on geometry_type
    specified, useful for debugging and playing with different
    parameters to get the best output.

    Parameters
    ----------
    geometry_type : str
        The geometry type for which a plot should be generated.
        Can be 'text', 'table', 'contour', 'joint', 'line'

    Raises
    ------
    NotImplementedError
        If ``self.flavor`` is 'stream' and geometry_type is one of
        'contour', 'joint' or 'line'.
    """
    if self.flavor == 'stream' and geometry_type in ('contour', 'joint', 'line'):
        # Fill the placeholder so the message names the rejected type
        # instead of showing a literal "{}".
        raise NotImplementedError(
            "{} cannot be plotted with flavor='stream'".format(geometry_type))

    # Any other geometry_type falls through without plotting anything.
    if geometry_type == 'text':
        plot_text(self._text)
    elif geometry_type == 'table':
        plot_table(self)
    elif geometry_type == 'contour':
        plot_contour(self._image)
    elif geometry_type == 'joint':
        plot_joint(self._image)
    elif geometry_type == 'line':
        plot_line(self._segments)
def to_csv(self, path, **kwargs): def to_csv(self, path, **kwargs):
"""Writes Table to a comma-separated values (csv) file. """Writes Table to a comma-separated values (csv) file.
@ -489,35 +517,3 @@ class TableList(object):
zipname = os.path.join(os.path.dirname(path), root) + '.zip' zipname = os.path.join(os.path.dirname(path), root) + '.zip'
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
z.write(filepath, os.path.basename(filepath)) z.write(filepath, os.path.basename(filepath))
class Geometry(object):
    """Holds text, image, segment and table geometry.

    ``text`` and ``tables`` start as empty lists, ``images`` and
    ``segments`` as empty tuples.
    """

    def __init__(self):
        self.text, self.tables = [], []
        self.images, self.segments = (), ()

    def __repr__(self):
        """Return the class name followed by the size of each attribute."""
        sizes = [len(getattr(self, attr))
                 for attr in ('text', 'images', 'segments', 'tables')]
        return '<{} text={} images={} segments={} tables={}>'.format(
            self.__class__.__name__, *sizes)
class GeometryList(object):
    """Collects the attributes of several geometry objects.

    Parameters
    ----------
    geometry : iterable
        Objects exposing ``text``, ``images``, ``segments`` and
        ``tables``. It is iterated once per attribute.
    """

    _FIELDS = ('text', 'images', 'segments', 'tables')

    def __init__(self, geometry):
        # One list per attribute, in the order the items were given.
        for field in self._FIELDS:
            setattr(self, field, [getattr(item, field) for item in geometry])

    def __repr__(self):
        """Return the class name followed by the length of each list."""
        sizes = [len(getattr(self, field)) for field in self._FIELDS]
        return '<{} text={} images={} segments={} tables={}>'.format(
            self.__class__.__name__, *sizes)

View File

@ -2,7 +2,7 @@ import os
from PyPDF2 import PdfFileReader, PdfFileWriter from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList, GeometryList from .core import TableList
from .parsers import Stream, Lattice from .parsers import Stream, Lattice
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects, from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
get_rotation) get_rotation)
@ -17,7 +17,7 @@ class PDFHandler(object):
---------- ----------
filename : str filename : str
Path to pdf file. Path to pdf file.
pages : str pages : str, optional (default: '1')
Comma-separated page numbers to parse. Comma-separated page numbers to parse.
Example: 1,3,4 or 1,4-end Example: 1,3,4 or 1,4-end
@ -35,7 +35,7 @@ class PDFHandler(object):
---------- ----------
filename : str filename : str
Path to pdf file. Path to pdf file.
pages : str pages : str, optional (default: '1')
Comma-separated page numbers to parse. Comma-separated page numbers to parse.
Example: 1,3,4 or 1,4-end Example: 1,3,4 or 1,4-end
@ -112,15 +112,15 @@ class PDFHandler(object):
with open(fpath, 'wb') as f: with open(fpath, 'wb') as f:
outfile.write(f) outfile.write(f)
def parse(self, mesh=False, **kwargs): def parse(self, flavor='lattice', **kwargs):
"""Extracts tables by calling parser.get_tables on all single """Extracts tables by calling parser.get_tables on all single
page pdfs. page pdfs.
Parameters Parameters
---------- ----------
mesh : bool (default: False) flavor : str (default: 'lattice')
Whether or not to use Lattice method of parsing. Stream The parsing method to use ('lattice' or 'stream').
is used by default. Lattice is used by default.
kwargs : dict kwargs : dict
See camelot.read_pdf kwargs. See camelot.read_pdf kwargs.
@ -134,15 +134,13 @@ class PDFHandler(object):
""" """
tables = [] tables = []
geometry = []
with TemporaryDirectory() as tempdir: with TemporaryDirectory() as tempdir:
for p in self.pages: for p in self.pages:
self._save_page(self.filename, p, tempdir) self._save_page(self.filename, p, tempdir)
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p)) pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
for p in self.pages] for p in self.pages]
parser = Stream(**kwargs) if not mesh else Lattice(**kwargs) parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
for p in pages: for p in pages:
t, g = parser.extract_tables(p) t = parser.extract_tables(p)
tables.extend(t) tables.extend(t)
geometry.append(g) return TableList(tables)
return TableList(tables), GeometryList(geometry)

View File

@ -2,22 +2,22 @@ from .handlers import PDFHandler
from .utils import validate_input, remove_extra from .utils import validate_input, remove_extra
def read_pdf(filepath, pages='1', mesh=False, **kwargs): def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
"""Read PDF and return parsed data tables. """Read PDF and return parsed data tables.
Note: kwargs annotated with ^ can only be used with mesh=False Note: kwargs annotated with ^ can only be used with flavor='stream'
and kwargs annotated with * can only be used with mesh=True. and kwargs annotated with * can only be used with flavor='lattice'.
Parameters Parameters
---------- ----------
filepath : str filepath : str
Path to pdf file. Path to pdf file.
pages : str pages : str, optional (default: '1')
Comma-separated page numbers to parse. Comma-separated page numbers to parse.
Example: 1,3,4 or 1,4-end Example: 1,3,4 or 1,4-end
mesh : bool (default: False) flavor : str (default: 'lattice')
Whether or not to use Lattice method of parsing. Stream The parsing method to use ('lattice' or 'stream').
is used by default. Lattice is used by default.
table_area : list, optional (default: None) table_area : list, optional (default: None)
List of table areas to process as strings of the form List of table areas to process as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and x1,y1,x2,y2 where (x1, y1) -> left-top and
@ -85,8 +85,8 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
tables : camelot.core.TableList tables : camelot.core.TableList
""" """
validate_input(kwargs, mesh=mesh) validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages) p = PDFHandler(filepath, pages)
kwargs = remove_extra(kwargs, mesh=mesh) kwargs = remove_extra(kwargs, flavor=flavor)
tables, __ = p.parse(mesh=mesh, **kwargs) tables, __ = p.parse(flavor=flavor, **kwargs)
return tables return tables

View File

@ -194,7 +194,8 @@ class Lattice(BaseParser):
stderr=subprocess.STDOUT) stderr=subprocess.STDOUT)
def _generate_table_bbox(self): def _generate_table_bbox(self):
self.image, self.threshold = adaptive_threshold(self.imagename, process_background=self.process_background, self.image, self.threshold = adaptive_threshold(
self.imagename, process_background=self.process_background,
blocksize=self.threshold_blocksize, c=self.threshold_constant) blocksize=self.threshold_blocksize, c=self.threshold_constant)
image_width = self.image.shape[1] image_width = self.image.shape[1]
image_height = self.image.shape[0] image_height = self.image.shape[0]
@ -297,11 +298,20 @@ class Lattice(BaseParser):
table.shape = table.df.shape table.shape = table.df.shape
whitespace = compute_whitespace(data) whitespace = compute_whitespace(data)
table.flavor = 'lattice'
table.accuracy = accuracy table.accuracy = accuracy
table.whitespace = whitespace table.whitespace = whitespace
table.order = table_idx + 1 table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace('page-', '')) table.page = int(os.path.basename(self.rootname).replace('page-', ''))
# for plotting
_text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text
table._image = (self.image, self.table_bbox_unscaled)
table._segments = (self.vertical_segments, self.horizontal_segments)
return table return table
def extract_tables(self, filename): def extract_tables(self, filename):
@ -311,7 +321,7 @@ class Lattice(BaseParser):
if not self.horizontal_text: if not self.horizontal_text:
logger.info("No tables found on {}".format( logger.info("No tables found on {}".format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))
return [], self.g return []
self._generate_image() self._generate_image()
self._generate_table_bbox() self._generate_table_bbox()
@ -324,13 +334,4 @@ class Lattice(BaseParser):
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
_tables.append(table) _tables.append(table)
if self.debug: return _tables
text = []
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
self.g.text = text
self.g.images = (self.image, self.table_bbox_unscaled)
self.g.segments = (self.vertical_segments, self.horizontal_segments)
self.g.tables = _tables
return _tables, self.g

View File

@ -333,11 +333,20 @@ class Stream(BaseParser):
table.shape = table.df.shape table.shape = table.df.shape
whitespace = compute_whitespace(data) whitespace = compute_whitespace(data)
table.flavor = 'stream'
table.accuracy = accuracy table.accuracy = accuracy
table.whitespace = whitespace table.whitespace = whitespace
table.order = table_idx + 1 table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace('page-', '')) table.page = int(os.path.basename(self.rootname).replace('page-', ''))
# for plotting
_text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text
table._image = None
table._segments = None
return table return table
def extract_tables(self, filename): def extract_tables(self, filename):
@ -347,7 +356,7 @@ class Stream(BaseParser):
if not self.horizontal_text: if not self.horizontal_text:
logger.info("No tables found on {}".format( logger.info("No tables found on {}".format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))
return [], self.g return []
self._generate_table_bbox() self._generate_table_bbox()
@ -359,11 +368,4 @@ class Stream(BaseParser):
table = self._generate_table(table_idx, cols, rows) table = self._generate_table(table_idx, cols, rows)
_tables.append(table) _tables.append(table)
if self.debug: return _tables
text = []
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
self.g.text = text
self.g.tables = _tables
return _tables, self.g

View File

@ -2,165 +2,72 @@ import cv2
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.patches as patches import matplotlib.patches as patches
from .handlers import PDFHandler
from .utils import validate_input, remove_extra def plot_text(text):
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
xs, ys = [], []
for t in text:
xs.extend([t[0], t[1]])
ys.extend([t[2], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1]
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
plt.show()
def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs): def plot_table(table):
"""Plot geometry found on pdf page based on type specified, for row in table.cells:
useful for debugging and playing with different parameters to get for cell in row:
the best output. if cell.left:
plt.plot([cell.lb[0], cell.lt[0]],
[cell.lb[1], cell.lt[1]])
if cell.right:
plt.plot([cell.rb[0], cell.rt[0]],
[cell.rb[1], cell.rt[1]])
if cell.top:
plt.plot([cell.lt[0], cell.rt[0]],
[cell.lt[1], cell.rt[1]])
if cell.bottom:
plt.plot([cell.lb[0], cell.rb[0]],
[cell.lb[1], cell.rb[1]])
plt.show()
Note: kwargs annotated with ^ can only be used with mesh=False
and kwargs annotated with * can only be used with mesh=True.
Parameters def plot_contour(image):
---------- img, table_bbox = image
filepath : str for t in table_bbox.keys():
Path to pdf file. cv2.rectangle(img, (t[0], t[1]),
pages : str (t[2], t[3]), (255, 0, 0), 20)
Comma-separated page numbers to parse. plt.imshow(img)
Example: 1,3,4 or 1,4-end plt.show()
mesh : bool (default: False)
Whether or not to use Lattice method of parsing. Stream
is used by default.
geometry_type : str, optional (default: None)
* 'text' : Plot text objects found on page. (Useful to get \
table_area and columns coordinates)
* 'table' : Plot parsed table.
* 'contour'* : Plot detected rectangles.
* 'joint'* : Plot detected line intersections.
* 'line'* : Plot detected lines.
table_area : list, optional (default: None)
List of table areas to process as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and
(x2, y2) -> right-bottom in pdf coordinate space.
columns^ : list, optional (default: None)
List of column x-coordinates as strings where the coordinates
are comma-separated.
split_text : bool, optional (default: False)
Whether or not to split a text line if it spans across
multiple cells.
flag_size : bool, optional (default: False)
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string. (Useful for
super and subscripts.)
row_close_tol^ : int, optional (default: 2)
Rows will be formed by combining text vertically
within this tolerance.
col_close_tol^ : int, optional (default: 0)
Columns will be formed by combining text horizontally
within this tolerance.
process_background* : bool, optional (default: False)
Whether or not to process lines that are in background.
line_size_scaling* : int, optional (default: 15)
Factor by which the page dimensions will be divided to get
smallest length of lines that should be detected.
The larger this value, smaller the detected lines. Making it
too large will lead to text being detected as lines.
copy_text* : list, optional (default: None)
{'h', 'v'}
Select one or more strings from above and pass them as a list
to specify the direction in which text should be copied over
when a cell spans multiple rows or columns.
shift_text* : list, optional (default: ['l', 't'])
{'l', 'r', 't', 'b'}
Select one or more strings from above and pass them as a list
to specify where the text in a spanning cell should flow.
line_close_tol* : int, optional (default: 2)
Tolerance parameter used to merge vertical and horizontal
detected lines which lie close to each other.
joint_close_tol* : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines
and points lie close to each other.
threshold_blocksize* : int, optional (default: 15)
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. def plot_joint(image):
threshold_constant* : int, optional (default: -2) img, table_bbox = image
Constant subtracted from the mean or weighted mean. x_coord = []
Normally, it is positive but may be zero or negative as well. y_coord = []
for k in table_bbox.keys():
for coord in table_bbox[k]:
x_coord.append(coord[0])
y_coord.append(coord[1])
max_x, max_y = max(x_coord), max(y_coord)
plt.plot(x_coord, y_coord, 'ro')
plt.axis([0, max_x + 100, max_y + 100, 0])
plt.imshow(img)
plt.show()
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
iterations* : int, optional (default: 0)
Number of times for erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. def plot_line(segments):
margins : tuple vertical, horizontal = segments
PDFMiner margins. (char_margin, line_margin, word_margin) for v in vertical:
plt.plot([v[0], v[2]], [v[1], v[3]])
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_. for h in horizontal:
plt.plot([h[0], h[2]], [h[1], h[3]])
""" plt.show()
validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)
p = PDFHandler(filepath, pages)
kwargs = remove_extra(kwargs, mesh=mesh)
debug = True if geometry_type is not None else False
kwargs.update({'debug': debug})
__, geometry = p.parse(mesh=mesh, **kwargs)
if geometry_type == 'text':
for text in geometry.text:
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
xs, ys = [], []
for t in text:
xs.extend([t[0], t[1]])
ys.extend([t[2], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1]
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
plt.show()
elif geometry_type == 'table':
for tables in geometry.tables:
for table in tables:
for row in table.cells:
for cell in row:
if cell.left:
plt.plot([cell.lb[0], cell.lt[0]],
[cell.lb[1], cell.lt[1]])
if cell.right:
plt.plot([cell.rb[0], cell.rt[0]],
[cell.rb[1], cell.rt[1]])
if cell.top:
plt.plot([cell.lt[0], cell.rt[0]],
[cell.lt[1], cell.rt[1]])
if cell.bottom:
plt.plot([cell.lb[0], cell.rb[0]],
[cell.lb[1], cell.rb[1]])
plt.show()
elif geometry_type == 'contour':
for img, table_bbox in geometry.images:
for t in table_bbox.keys():
cv2.rectangle(img, (t[0], t[1]),
(t[2], t[3]), (255, 0, 0), 20)
plt.imshow(img)
plt.show()
elif geometry_type == 'joint':
for img, table_bbox in geometry.images:
x_coord = []
y_coord = []
for k in table_bbox.keys():
for coord in table_bbox[k]:
x_coord.append(coord[0])
y_coord.append(coord[1])
max_x, max_y = max(x_coord), max(y_coord)
plt.plot(x_coord, y_coord, 'ro')
plt.axis([0, max_x + 100, max_y + 100, 0])
plt.imshow(img)
plt.show()
elif geometry_type == 'line':
for v_s, h_s in geometry.segments:
for v in v_s:
plt.plot([v[0], v[2]], [v[1], v[3]])
for h in h_s:
plt.plot([h[0], h[2]], [h[1], h[3]])
plt.show()

View File

@ -38,25 +38,25 @@ lattice_kwargs = [
] ]
def validate_input(kwargs, mesh=False, geometry_type=False): def validate_input(kwargs, flavor='lattice', geometry_type=False):
def check_intersection(parser_kwargs, input_kwargs, message_bool): def check_intersection(parser_kwargs, input_kwargs):
isec = set(parser_kwargs).intersection(set(input_kwargs.keys())) isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
if isec: if isec:
raise ValueError("{} can not be used with mesh set to {}".format( raise ValueError("{} cannot be used with flavor='{}'".format(
",".join(sorted(isec)), message_bool)) ",".join(sorted(isec)), flavor))
if mesh: if flavor == 'lattice':
check_intersection(stream_kwargs, kwargs, True) check_intersection(stream_kwargs, kwargs)
else: else:
check_intersection(lattice_kwargs, kwargs, False) check_intersection(lattice_kwargs, kwargs)
if geometry_type: if geometry_type:
if not mesh and geometry_type in ['contour', 'joint', 'line']: if flavor != 'lattice' and geometry_type in ['contour', 'joint', 'line']:
raise ValueError("Use geometry_type={} with mesh set to True".format( raise ValueError("Use geometry_type='{}' with flavor='lattice'".format(
geometry_type)) geometry_type))
def remove_extra(kwargs, mesh=False): def remove_extra(kwargs, flavor='lattice'):
if mesh: if flavor == 'lattice':
for key in kwargs.keys(): for key in kwargs.keys():
if key in stream_kwargs: if key in stream_kwargs:
kwargs.pop(key) kwargs.pop(key)

View File

@ -9,7 +9,7 @@ with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f:
exec(f.read(), about) exec(f.read(), about)
# TODO: Move these to __version__.py # TODO: Move these to __version__.py
NAME = 'camelot' NAME = 'camelot-py'
VERSION = about['__version__'] VERSION = about['__version__']
DESCRIPTION = 'PDF Table Parsing for Humans' DESCRIPTION = 'PDF Table Parsing for Humans'
with open('README.md') as f: with open('README.md') as f:

View File

@ -18,11 +18,11 @@ def test_stream_table_rotated():
df = pd.DataFrame(data_stream_table_rotated) df = pd.DataFrame(data_stream_table_rotated)
filename = os.path.join(testdir, "clockwise_table_2.pdf") filename = os.path.join(testdir, "clockwise_table_2.pdf")
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename, flavor="stream")
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
filename = os.path.join(testdir, "anticlockwise_table_2.pdf") filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename, flavor="stream")
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -30,7 +30,7 @@ def test_stream_table_area():
df = pd.DataFrame(data_stream_table_area_single) df = pd.DataFrame(data_stream_table_area_single)
filename = os.path.join(testdir, "tabula/us-007.pdf") filename = os.path.join(testdir, "tabula/us-007.pdf")
tables = camelot.read_pdf(filename, table_area=["320,500,573,335"]) tables = camelot.read_pdf(filename, flavor="stream", table_area=["320,500,573,335"])
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -39,7 +39,7 @@ def test_stream_columns():
filename = os.path.join(testdir, "mexican_towns.pdf") filename = os.path.join(testdir, "mexican_towns.pdf")
tables = camelot.read_pdf( tables = camelot.read_pdf(
filename, columns=["67,180,230,425,475"], row_close_tol=10) filename, flavor="stream", columns=["67,180,230,425,475"], row_close_tol=10)
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -48,7 +48,7 @@ def test_lattice():
filename = os.path.join(testdir, filename = os.path.join(testdir,
"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf") "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf")
tables = camelot.read_pdf(filename, pages="2", mesh=True) tables = camelot.read_pdf(filename, pages="2")
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -56,11 +56,11 @@ def test_lattice_table_rotated():
df = pd.DataFrame(data_lattice_table_rotated) df = pd.DataFrame(data_lattice_table_rotated)
filename = os.path.join(testdir, "clockwise_table_1.pdf") filename = os.path.join(testdir, "clockwise_table_1.pdf")
tables = camelot.read_pdf(filename, mesh=True) tables = camelot.read_pdf(filename)
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
filename = os.path.join(testdir, "anticlockwise_table_1.pdf") filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
tables = camelot.read_pdf(filename, mesh=True) tables = camelot.read_pdf(filename)
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -68,7 +68,7 @@ def test_lattice_process_background():
df = pd.DataFrame(data_lattice_process_background) df = pd.DataFrame(data_lattice_process_background)
filename = os.path.join(testdir, "background_lines_1.pdf") filename = os.path.join(testdir, "background_lines_1.pdf")
tables = camelot.read_pdf(filename, mesh=True, process_background=True) tables = camelot.read_pdf(filename, process_background=True)
assert df.equals(tables[1].df) assert df.equals(tables[1].df)
@ -76,5 +76,5 @@ def test_lattice_copy_text():
df = pd.DataFrame(data_lattice_copy_text) df = pd.DataFrame(data_lattice_copy_text)
filename = os.path.join(testdir, "row_span_1.pdf") filename = os.path.join(testdir, "row_span_1.pdf")
tables = camelot.read_pdf(filename, mesh=True, line_size_scaling=60, copy_text="v") tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
assert df.equals(tables[0].df) assert df.equals(tables[0].df)