From 3170a9689f851f500be9fc3202eaad8e469449ab Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Sun, 23 Sep 2018 10:53:32 +0530 Subject: [PATCH] Add flavors --- camelot/__init__.py | 3 +- camelot/cli.py | 103 +++++++++--------- camelot/core.py | 62 +++++------ camelot/handlers.py | 22 ++-- camelot/io.py | 20 ++-- camelot/parsers/lattice.py | 25 ++--- camelot/parsers/stream.py | 20 ++-- camelot/plotting.py | 215 +++++++++++-------------------------- camelot/utils.py | 22 ++-- setup.py | 2 +- tests/test_common.py | 18 ++-- 11 files changed, 207 insertions(+), 305 deletions(-) diff --git a/camelot/__init__.py b/camelot/__init__.py index 6e416e4..b762cea 100644 --- a/camelot/__init__.py +++ b/camelot/__init__.py @@ -1,4 +1,3 @@ from .__version__ import __version__ -from .io import read_pdf -from .plotting import plot_geometry \ No newline at end of file +from .io import read_pdf \ No newline at end of file diff --git a/camelot/cli.py b/camelot/cli.py index 98bb681..822bd44 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -5,18 +5,9 @@ import click from . import __version__ from .io import read_pdf -from .plotting import plot_geometry from .utils import validate_input, remove_extra -class Mutex(click.Option): - def handle_parse_result(self, ctx, opts, args): - mesh = opts.get('mesh', False) - geometry_type = opts.get('geometry_type', False) - validate_input(opts, mesh=mesh, geometry_type=geometry_type) - return super(Mutex, self).handle_parse_result(ctx, opts, args) - - @click.command() @click.version_option(version=__version__) @click.option("-p", "--pages", default="1", help="Comma-separated page numbers" @@ -27,8 +18,6 @@ class Mutex(click.Option): help="Output file format.") @click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP" " archive.") -@click.option("-m", "--mesh", is_flag=True, help="Whether or not to" - " use Lattice method of parsing. Stream is used by default.") @click.option("-T", "--table_area", default=[], multiple=True, help="Table areas (x1,y1,x2,y2) to process.\n" " x1, y1 -> left-top and x2, y2 -> right-bottom") @@ -39,12 +28,44 @@ class Mutex(click.Option): " super/subscripts)") @click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1), help="char_margin, line_margin, word_margin for PDFMiner.") -@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex, - help="x-coordinates of column separators.") -@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be" - " formed by combining text vertically within this tolerance.") -@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will" - " be formed by combining text horizontally within this tolerance.") +@click.option("-G", "--geometry_type", + type=click.Choice(["text", "table", "contour", "joint", "line"]), + help="Plot geometry found on pdf page for debugging.\n\n" + "text: Plot text objects. (Useful to get table_area and" + " columns coordinates)\ntable: Plot parsed table.\n" + "contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line" + " intersections.\nline (with --mesh): Plot detected lines.") +@click.argument("filepath", type=click.Path(exists=True)) +def cli(*args, **kwargs): + pages = kwargs.pop("pages") + output = kwargs.pop("output") + f = kwargs.pop("format") + compress = kwargs.pop("zip") + mesh = kwargs.pop("mesh") + geometry_type = kwargs.pop("geometry_type") + filepath = kwargs.pop("filepath") + + table_area = list(kwargs['table_area']) + kwargs['table_area'] = None if not table_area else table_area + columns = list(kwargs['columns']) + kwargs['columns'] = None if not columns else columns + copy_text = list(kwargs['copy_text']) + kwargs['copy_text'] = None if not copy_text else copy_text + kwargs['shift_text'] = list(kwargs['shift_text']) + + kwargs = remove_extra(kwargs, mesh=mesh) + tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs) + click.echo(tables) + if output is None: + raise click.UsageError("Please specify an output filepath using --output") + if f is None: + raise click.UsageError("Please specify an output format using --format") + tables.export(output, f=f, compress=compress) + + +@click.option("-T", "--table_area", default=[], multiple=True, + help="Table areas (x1,y1,x2,y2) to process.\n" + " x1, y1 -> left-top and x2, y2 -> right-bottom") @click.option("-back", "--process_background", is_flag=True, cls=Mutex, help="(with --mesh) Whether or not to process lines that are in" " background.") @@ -75,40 +96,18 @@ class Mutex(click.Option): @click.option("-I", "--iterations", default=0, cls=Mutex, help="(with --mesh) Number of times for erosion/dilation is" " applied.") -@click.option("-G", "--geometry_type", - type=click.Choice(["text", "table", "contour", "joint", "line"]), - help="Plot geometry found on pdf page for debugging.\n\n" - "text: Plot text objects. (Useful to get table_area and" - " columns coordinates)\ntable: Plot parsed table.\n" - "contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line" - " intersections.\nline (with --mesh): Plot detected lines.") -@click.argument("filepath", type=click.Path(exists=True)) -def cli(*args, **kwargs): - pages = kwargs.pop("pages") - output = kwargs.pop("output") - f = kwargs.pop("format") - compress = kwargs.pop("zip") - mesh = kwargs.pop("mesh") - geometry_type = kwargs.pop("geometry_type") - filepath = kwargs.pop("filepath") +def lattice(*args, **kwargs): + pass - table_area = list(kwargs['table_area']) - kwargs['table_area'] = None if not table_area else table_area - columns = list(kwargs['columns']) - kwargs['columns'] = None if not columns else columns - copy_text = list(kwargs['copy_text']) - kwargs['copy_text'] = None if not copy_text else copy_text - kwargs['shift_text'] = list(kwargs['shift_text']) - kwargs = remove_extra(kwargs, mesh=mesh) - if geometry_type is None: - tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs) - click.echo(tables) - if output is None: - raise click.UsageError("Please specify an output filepath using --output") - if f is None: - raise click.UsageError("Please specify an output format using --format") - tables.export(output, f=f, compress=compress) - else: - plot_geometry(filepath, pages=pages, mesh=mesh, - geometry_type=geometry_type, **kwargs) \ No newline at end of file +@click.option("-T", "--table_area", default=[], multiple=True, + help="Table areas (x1,y1,x2,y2) to process.\n" + " x1, y1 -> left-top and x2, y2 -> right-bottom") +@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex, + help="x-coordinates of column separators.") +@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be" + " formed by combining text vertically within this tolerance.") +@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will" + " be formed by combining text horizontally within this tolerance.") +def stream(*args, **kwargs): + pass \ No newline at end of file diff --git a/camelot/core.py b/camelot/core.py index 22b7442..3813e60 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -6,6 +6,8 @@ import tempfile import numpy as np import pandas as pd +from .plotting import * + class Cell(object): """Defines a cell in a table with coordinates relative to a @@ -318,6 +320,32 @@ class Table(object): cell.hspan = True return self + def plot(self, geometry_type): + """Plot geometry found on PDF page based on geometry_type + specified, useful for debugging and playing with different + parameters to get the best output. + + Parameters + ---------- + geometry_type : str + The geometry type for which a plot should be generated. + Can be 'text', 'table', 'contour', 'joint', 'line' + + """ + if self.flavor == 'stream' and geometry_type in ['contour', 'joint', 'line']: + raise NotImplementedError("{} cannot be plotted with flavor='stream'") + + if geometry_type == 'text': + plot_text(self._text) + elif geometry_type == 'table': + plot_table(self) + elif geometry_type == 'contour': + plot_contour(self._image) + elif geometry_type == 'joint': + plot_joint(self._image) + elif geometry_type == 'line': + plot_line(self._segments) + def to_csv(self, path, **kwargs): """Writes Table to a comma-separated values (csv) file. @@ -488,36 +516,4 @@ class TableList(object): if compress: zipname = os.path.join(os.path.dirname(path), root) + '.zip' with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: - z.write(filepath, os.path.basename(filepath)) - - -class Geometry(object): - def __init__(self): - self.text = [] - self.images = () - self.segments = () - self.tables = [] - - def __repr__(self): - return '<{} text={} images={} segments={} tables={}>'.format( - self.__class__.__name__, - len(self.text), - len(self.images), - len(self.segments), - len(self.tables)) - - -class GeometryList(object): - def __init__(self, geometry): - self.text = [g.text for g in geometry] - self.images = [g.images for g in geometry] - self.segments = [g.segments for g in geometry] - self.tables = [g.tables for g in geometry] - - def __repr__(self): - return '<{} text={} images={} segments={} tables={}>'.format( - self.__class__.__name__, - len(self.text), - len(self.images), - len(self.segments), - len(self.tables)) \ No newline at end of file + z.write(filepath, os.path.basename(filepath)) \ No newline at end of file diff --git a/camelot/handlers.py b/camelot/handlers.py index 59b31c3..0ea9785 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -2,7 +2,7 @@ import os from PyPDF2 import PdfFileReader, PdfFileWriter -from .core import TableList, GeometryList +from .core import TableList from .parsers import Stream, Lattice from .utils import (TemporaryDirectory, get_page_layout, get_text_objects, get_rotation) @@ -17,7 +17,7 @@ class PDFHandler(object): ---------- filename : str Path to pdf file. - pages : str + pages : str, optional (default: '1') Comma-separated page numbers to parse. Example: 1,3,4 or 1,4-end @@ -35,7 +35,7 @@ class PDFHandler(object): ---------- filename : str Path to pdf file. - pages : str + pages : str, optional (default: '1') Comma-separated page numbers to parse. Example: 1,3,4 or 1,4-end @@ -112,15 +112,15 @@ class PDFHandler(object): with open(fpath, 'wb') as f: outfile.write(f) - def parse(self, mesh=False, **kwargs): + def parse(self, flavor='lattice', **kwargs): """Extracts tables by calling parser.get_tables on all single page pdfs. Parameters ---------- - mesh : bool (default: False) - Whether or not to use Lattice method of parsing. Stream - is used by default. + flavor : str (default: 'lattice') + The parsing method to use ('lattice' or 'stream'). + Lattice is used by default. kwargs : dict See camelot.read_pdf kwargs. @@ -134,15 +134,13 @@ class PDFHandler(object): """ tables = [] - geometry = [] with TemporaryDirectory() as tempdir: for p in self.pages: self._save_page(self.filename, p, tempdir) pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p)) for p in self.pages] - parser = Stream(**kwargs) if not mesh else Lattice(**kwargs) + parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs) for p in pages: - t, g = parser.extract_tables(p) + t = parser.extract_tables(p) tables.extend(t) - geometry.append(g) - return TableList(tables), GeometryList(geometry) \ No newline at end of file + return TableList(tables) \ No newline at end of file diff --git a/camelot/io.py b/camelot/io.py index 328b107..f581735 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -2,22 +2,22 @@ from .handlers import PDFHandler from .utils import validate_input, remove_extra -def read_pdf(filepath, pages='1', mesh=False, **kwargs): +def read_pdf(filepath, pages='1', flavor='lattice', **kwargs): """Read PDF and return parsed data tables. - Note: kwargs annotated with ^ can only be used with mesh=False - and kwargs annotated with * can only be used with mesh=True. + Note: kwargs annotated with ^ can only be used with flavor='stream' + and kwargs annotated with * can only be used with flavor='lattice'. Parameters ---------- filepath : str Path to pdf file. - pages : str + pages : str, optional (default: '1') Comma-separated page numbers to parse. Example: 1,3,4 or 1,4-end - mesh : bool (default: False) - Whether or not to use Lattice method of parsing. Stream - is used by default. + flavor : str (default: 'lattice') + The parsing method to use ('lattice' or 'stream'). + Lattice is used by default. table_area : list, optional (default: None) List of table areas to process as strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and @@ -85,8 +85,8 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs): tables : camelot.core.TableList """ - validate_input(kwargs, mesh=mesh) + validate_input(kwargs, flavor=flavor) p = PDFHandler(filepath, pages) - kwargs = remove_extra(kwargs, mesh=mesh) - tables, __ = p.parse(mesh=mesh, **kwargs) + kwargs = remove_extra(kwargs, flavor=flavor) + tables, __ = p.parse(flavor=flavor, **kwargs) return tables \ No newline at end of file diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 9e569ab..5de6faa 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -194,7 +194,8 @@ class Lattice(BaseParser): stderr=subprocess.STDOUT) def _generate_table_bbox(self): - self.image, self.threshold = adaptive_threshold(self.imagename, process_background=self.process_background, + self.image, self.threshold = adaptive_threshold( + self.imagename, process_background=self.process_background, blocksize=self.threshold_blocksize, c=self.threshold_constant) image_width = self.image.shape[1] image_height = self.image.shape[0] @@ -297,11 +298,20 @@ class Lattice(BaseParser): table.shape = table.df.shape whitespace = compute_whitespace(data) + table.flavor = 'lattice' table.accuracy = accuracy table.whitespace = whitespace table.order = table_idx + 1 table.page = int(os.path.basename(self.rootname).replace('page-', '')) + # for plotting + _text = [] + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) + table._text = _text + table._image = (self.image, self.table_bbox_unscaled) + table._segments = (self.vertical_segments, self.horizontal_segments) + return table def extract_tables(self, filename): @@ -311,7 +321,7 @@ class Lattice(BaseParser): if not self.horizontal_text: logger.info("No tables found on {}".format( os.path.basename(self.rootname))) - return [], self.g + return [] self._generate_image() self._generate_table_bbox() @@ -324,13 +334,4 @@ class Lattice(BaseParser): table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) _tables.append(table) - if self.debug: - text = [] - text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) - text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) - self.g.text = text - self.g.images = (self.image, self.table_bbox_unscaled) - self.g.segments = (self.vertical_segments, self.horizontal_segments) - self.g.tables = _tables - - return _tables, self.g \ No newline at end of file + return _tables \ No newline at end of file diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 6d29a05..b3acf38 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -333,11 +333,20 @@ class Stream(BaseParser): table.shape = table.df.shape whitespace = compute_whitespace(data) + table.flavor = 'stream' table.accuracy = accuracy table.whitespace = whitespace table.order = table_idx + 1 table.page = int(os.path.basename(self.rootname).replace('page-', '')) + # for plotting + _text = [] + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) + table._text = _text + table._image = None + table._segments = None + return table def extract_tables(self, filename): @@ -347,7 +356,7 @@ class Stream(BaseParser): if not self.horizontal_text: logger.info("No tables found on {}".format( os.path.basename(self.rootname))) - return [], self.g + return [] self._generate_table_bbox() @@ -359,11 +368,4 @@ class Stream(BaseParser): table = self._generate_table(table_idx, cols, rows) _tables.append(table) - if self.debug: - text = [] - text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) - text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) - self.g.text = text - self.g.tables = _tables - - return _tables, self.g \ No newline at end of file + return _tables \ No newline at end of file diff --git a/camelot/plotting.py b/camelot/plotting.py index 7a94b53..9c06887 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -2,165 +2,72 @@ import cv2 import matplotlib.pyplot as plt import matplotlib.patches as patches -from .handlers import PDFHandler -from .utils import validate_input, remove_extra + +def plot_text(text): + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + xs, ys = [], [] + for t in text: + xs.extend([t[0], t[1]]) + ys.extend([t[2], t[3]]) + ax.add_patch( + patches.Rectangle( + (t[0], t[1]), + t[2] - t[0], + t[3] - t[1] + ) + ) + ax.set_xlim(min(xs) - 10, max(xs) + 10) + ax.set_ylim(min(ys) - 10, max(ys) + 10) + plt.show() -def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs): - """Plot geometry found on pdf page based on type specified, - useful for debugging and playing with different parameters to get - the best output. +def plot_table(table): + for row in table.cells: + for cell in row: + if cell.left: + plt.plot([cell.lb[0], cell.lt[0]], + [cell.lb[1], cell.lt[1]]) + if cell.right: + plt.plot([cell.rb[0], cell.rt[0]], + [cell.rb[1], cell.rt[1]]) + if cell.top: + plt.plot([cell.lt[0], cell.rt[0]], + [cell.lt[1], cell.rt[1]]) + if cell.bottom: + plt.plot([cell.lb[0], cell.rb[0]], + [cell.lb[1], cell.rb[1]]) + plt.show() - Note: kwargs annotated with ^ can only be used with mesh=False - and kwargs annotated with * can only be used with mesh=True. - Parameters - ---------- - filepath : str - Path to pdf file. - pages : str - Comma-separated page numbers to parse. - Example: 1,3,4 or 1,4-end - mesh : bool (default: False) - Whether or not to use Lattice method of parsing. Stream - is used by default. - geometry_type : str, optional (default: None) - * 'text' : Plot text objects found on page. (Useful to get \ - table_area and columns coordinates) - * 'table' : Plot parsed table. - * 'contour'* : Plot detected rectangles. - * 'joint'* : Plot detected line intersections. - * 'line'* : Plot detected lines. - table_area : list, optional (default: None) - List of table areas to process as strings of the form - x1,y1,x2,y2 where (x1, y1) -> left-top and - (x2, y2) -> right-bottom in pdf coordinate space. - columns^ : list, optional (default: None) - List of column x-coordinates as strings where the coordinates - are comma-separated. - split_text : bool, optional (default: False) - Whether or not to split a text line if it spans across - multiple cells. - flag_size : bool, optional (default: False) - Whether or not to highlight a substring using - if its size is different from rest of the string. (Useful for - super and subscripts.) - row_close_tol^ : int, optional (default: 2) - Rows will be formed by combining text vertically - within this tolerance. - col_close_tol^ : int, optional (default: 0) - Columns will be formed by combining text horizontally - within this tolerance. - process_background* : bool, optional (default: False) - Whether or not to process lines that are in background. - line_size_scaling* : int, optional (default: 15) - Factor by which the page dimensions will be divided to get - smallest length of lines that should be detected. +def plot_contour(image): + img, table_bbox = image + for t in table_bbox.keys(): + cv2.rectangle(img, (t[0], t[1]), + (t[2], t[3]), (255, 0, 0), 20) + plt.imshow(img) + plt.show() - The larger this value, smaller the detected lines. Making it - too large will lead to text being detected as lines. - copy_text* : list, optional (default: None) - {'h', 'v'} - Select one or more strings from above and pass them as a list - to specify the direction in which text should be copied over - when a cell spans multiple rows or columns. - shift_text* : list, optional (default: ['l', 't']) - {'l', 'r', 't', 'b'} - Select one or more strings from above and pass them as a list - to specify where the text in a spanning cell should flow. - line_close_tol* : int, optional (default: 2) - Tolerance parameter used to merge vertical and horizontal - detected lines which lie close to each other. - joint_close_tol* : int, optional (default: 2) - Tolerance parameter used to decide whether the detected lines - and points lie close to each other. - threshold_blocksize* : int, optional (default: 15) - Size of a pixel neighborhood that is used to calculate a - threshold value for the pixel: 3, 5, 7, and so on. - For more information, refer `OpenCV's adaptiveThreshold `_. - threshold_constant* : int, optional (default: -2) - Constant subtracted from the mean or weighted mean. - Normally, it is positive but may be zero or negative as well. +def plot_joint(image): + img, table_bbox = image + x_coord = [] + y_coord = [] + for k in table_bbox.keys(): + for coord in table_bbox[k]: + x_coord.append(coord[0]) + y_coord.append(coord[1]) + max_x, max_y = max(x_coord), max(y_coord) + plt.plot(x_coord, y_coord, 'ro') + plt.axis([0, max_x + 100, max_y + 100, 0]) + plt.imshow(img) + plt.show() - For more information, refer `OpenCV's adaptiveThreshold `_. - iterations* : int, optional (default: 0) - Number of times for erosion/dilation is applied. - For more information, refer `OpenCV's dilate `_. - margins : tuple - PDFMiner margins. (char_margin, line_margin, word_margin) - - For more information, refer `PDFMiner docs `_. - - """ - validate_input(kwargs, mesh=mesh, geometry_type=geometry_type) - p = PDFHandler(filepath, pages) - kwargs = remove_extra(kwargs, mesh=mesh) - debug = True if geometry_type is not None else False - kwargs.update({'debug': debug}) - __, geometry = p.parse(mesh=mesh, **kwargs) - - if geometry_type == 'text': - for text in geometry.text: - fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') - xs, ys = [], [] - for t in text: - xs.extend([t[0], t[1]]) - ys.extend([t[2], t[3]]) - ax.add_patch( - patches.Rectangle( - (t[0], t[1]), - t[2] - t[0], - t[3] - t[1] - ) - ) - ax.set_xlim(min(xs) - 10, max(xs) + 10) - ax.set_ylim(min(ys) - 10, max(ys) + 10) - plt.show() - elif geometry_type == 'table': - for tables in geometry.tables: - for table in tables: - for row in table.cells: - for cell in row: - if cell.left: - plt.plot([cell.lb[0], cell.lt[0]], - [cell.lb[1], cell.lt[1]]) - if cell.right: - plt.plot([cell.rb[0], cell.rt[0]], - [cell.rb[1], cell.rt[1]]) - if cell.top: - plt.plot([cell.lt[0], cell.rt[0]], - [cell.lt[1], cell.rt[1]]) - if cell.bottom: - plt.plot([cell.lb[0], cell.rb[0]], - [cell.lb[1], cell.rb[1]]) - plt.show() - elif geometry_type == 'contour': - for img, table_bbox in geometry.images: - for t in table_bbox.keys(): - cv2.rectangle(img, (t[0], t[1]), - (t[2], t[3]), (255, 0, 0), 20) - plt.imshow(img) - plt.show() - elif geometry_type == 'joint': - for img, table_bbox in geometry.images: - x_coord = [] - y_coord = [] - for k in table_bbox.keys(): - for coord in table_bbox[k]: - x_coord.append(coord[0]) - y_coord.append(coord[1]) - max_x, max_y = max(x_coord), max(y_coord) - plt.plot(x_coord, y_coord, 'ro') - plt.axis([0, max_x + 100, max_y + 100, 0]) - plt.imshow(img) - plt.show() - elif geometry_type == 'line': - for v_s, h_s in geometry.segments: - for v in v_s: - plt.plot([v[0], v[2]], [v[1], v[3]]) - for h in h_s: - plt.plot([h[0], h[2]], [h[1], h[3]]) - plt.show() \ No newline at end of file +def plot_line(segments): + vertical, horizontal = segments + for v in vertical: + plt.plot([v[0], v[2]], [v[1], v[3]]) + for h in horizontal: + plt.plot([h[0], h[2]], [h[1], h[3]]) + plt.show() \ No newline at end of file diff --git a/camelot/utils.py b/camelot/utils.py index c0f4a59..156373e 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -38,25 +38,25 @@ lattice_kwargs = [ ] -def validate_input(kwargs, mesh=False, geometry_type=False): - def check_intersection(parser_kwargs, input_kwargs, message_bool): +def validate_input(kwargs, flavor='lattice', geometry_type=False): + def check_intersection(parser_kwargs, input_kwargs): isec = set(parser_kwargs).intersection(set(input_kwargs.keys())) if isec: - raise ValueError("{} can not be used with mesh set to {}".format( - ",".join(sorted(isec)), message_bool)) + raise ValueError("{} cannot be used with flavor='{}'".format( + ",".join(sorted(isec)), flavor)) - if mesh: - check_intersection(stream_kwargs, kwargs, True) + if flavor == 'lattice': + check_intersection(stream_kwargs, kwargs) else: - check_intersection(lattice_kwargs, kwargs, False) + check_intersection(lattice_kwargs, kwargs) if geometry_type: - if not mesh and geometry_type in ['contour', 'joint', 'line']: - raise ValueError("Use geometry_type={} with mesh set to True".format( + if flavor != 'lattice' and geometry_type in ['contour', 'joint', 'line']: + raise ValueError("Use geometry_type='{}' with flavor='lattice'".format( geometry_type)) -def remove_extra(kwargs, mesh=False): - if mesh: +def remove_extra(kwargs, flavor='lattice'): + if flavor == 'lattice': for key in kwargs.keys(): if key in stream_kwargs: kwargs.pop(key) diff --git a/setup.py b/setup.py index d37bcf2..00d6e8f 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f: exec(f.read(), about) # TODO: Move these to __version__.py -NAME = 'camelot' +NAME = 'camelot-py' VERSION = about['__version__'] DESCRIPTION = 'PDF Table Parsing for Humans' with open('README.md') as f: diff --git a/tests/test_common.py b/tests/test_common.py index 52f966a..065a9e2 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -18,11 +18,11 @@ def test_stream_table_rotated(): df = pd.DataFrame(data_stream_table_rotated) filename = os.path.join(testdir, "clockwise_table_2.pdf") - tables = camelot.read_pdf(filename) + tables = camelot.read_pdf(filename, flavor="stream") assert df.equals(tables[0].df) filename = os.path.join(testdir, "anticlockwise_table_2.pdf") - tables = camelot.read_pdf(filename) + tables = camelot.read_pdf(filename, flavor="stream") assert df.equals(tables[0].df) @@ -30,7 +30,7 @@ def test_stream_table_area(): df = pd.DataFrame(data_stream_table_area_single) filename = os.path.join(testdir, "tabula/us-007.pdf") - tables = camelot.read_pdf(filename, table_area=["320,500,573,335"]) + tables = camelot.read_pdf(filename, flavor="stream", table_area=["320,500,573,335"]) assert df.equals(tables[0].df) @@ -39,7 +39,7 @@ def test_stream_columns(): filename = os.path.join(testdir, "mexican_towns.pdf") tables = camelot.read_pdf( - filename, columns=["67,180,230,425,475"], row_close_tol=10) + filename, flavor="stream", columns=["67,180,230,425,475"], row_close_tol=10) assert df.equals(tables[0].df) @@ -48,7 +48,7 @@ def test_lattice(): filename = os.path.join(testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf") - tables = camelot.read_pdf(filename, pages="2", mesh=True) + tables = camelot.read_pdf(filename, pages="2") assert df.equals(tables[0].df) @@ -56,11 +56,11 @@ def test_lattice_table_rotated(): df = pd.DataFrame(data_lattice_table_rotated) filename = os.path.join(testdir, "clockwise_table_1.pdf") - tables = camelot.read_pdf(filename, mesh=True) + tables = camelot.read_pdf(filename) assert df.equals(tables[0].df) filename = os.path.join(testdir, "anticlockwise_table_1.pdf") - tables = camelot.read_pdf(filename, mesh=True) + tables = camelot.read_pdf(filename) assert df.equals(tables[0].df) @@ -68,7 +68,7 @@ def test_lattice_process_background(): df = pd.DataFrame(data_lattice_process_background) filename = os.path.join(testdir, "background_lines_1.pdf") - tables = camelot.read_pdf(filename, mesh=True, process_background=True) + tables = camelot.read_pdf(filename, process_background=True) assert df.equals(tables[1].df) @@ -76,5 +76,5 @@ def test_lattice_copy_text(): df = pd.DataFrame(data_lattice_copy_text) filename = os.path.join(testdir, "row_span_1.pdf") - tables = camelot.read_pdf(filename, mesh=True, line_size_scaling=60, copy_text="v") + tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v") assert df.equals(tables[0].df) \ No newline at end of file