diff --git a/camelot/__init__.py b/camelot/__init__.py index 72f362e..f2c7e32 100644 --- a/camelot/__init__.py +++ b/camelot/__init__.py @@ -1,5 +1,19 @@ # -*- coding: utf-8 -*- +import logging + + +# set up logging +logger = logging.getLogger('camelot') + +format_string = '%(asctime)s - %(levelname)s - %(message)s' +formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S') +handler = logging.StreamHandler() +handler.setFormatter(formatter) + +logger.addHandler(handler) + + from .__version__ import __version__ from .io import read_pdf \ No newline at end of file diff --git a/camelot/cli.py b/camelot/cli.py index af09b24..2c187ba 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- -from pprint import pprint +import logging + +logger = logging.getLogger('camelot') +logger.setLevel(logging.INFO) import click @@ -38,7 +41,7 @@ pass_config = click.make_pass_decorator(Config) def cli(ctx, *args, **kwargs): """Camelot: PDF Table Extraction for Humans""" ctx.obj = Config() - for key, value in kwargs.iteritems(): + for key, value in kwargs.items(): ctx.obj.set_config(key, value) diff --git a/camelot/core.py b/camelot/core.py index 14421f7..2dfe445 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -447,18 +447,6 @@ class TableList(object): def __getitem__(self, idx): return self._tables[idx] - def __iter__(self): - self._n = 0 - return self - - def next(self): - if self._n < len(self): - r = self._tables[self._n] - self._n += 1 - return r - else: - raise StopIteration - @staticmethod def _format_func(table, f): return getattr(table, 'to_{}'.format(f)) diff --git a/camelot/handlers.py b/camelot/handlers.py index 40f4074..c557584 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -27,7 +27,7 @@ class PDFHandler(object): def __init__(self, filename, pages='1'): self.filename = filename if not self.filename.endswith('.pdf'): - raise TypeError("File format not supported.") + raise NotImplementedError("File format not supported") self.pages = self._get_pages(self.filename, pages) def _get_pages(self, filename, pages): diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 9303752..834146d 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -4,6 +4,7 @@ from __future__ import division import os import copy import logging +import warnings import subprocess import numpy as np @@ -13,12 +14,12 @@ from .base import BaseParser from ..core import Table from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, merge_close_lines, get_table_index, compute_accuracy, - compute_whitespace, setup_logging) + compute_whitespace) from ..image_processing import (adaptive_threshold, find_lines, find_table_contours, find_table_joints) -logger = setup_logging(__name__) +logger = logging.getLogger('camelot') class Lattice(BaseParser): @@ -305,11 +306,11 @@ class Lattice(BaseParser): return table def extract_tables(self, filename): - logger.info('Processing {}'.format(os.path.basename(filename))) self._generate_layout(filename) + logger.info('Processing {}'.format(os.path.basename(self.rootname))) if not self.horizontal_text: - logger.info("No tables found on {}".format( + warnings.warn("No tables found on {}".format( os.path.basename(self.rootname))) return [] diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index b1e2983..d78743a 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -3,6 +3,7 @@ from __future__ import division import os import logging +import warnings import numpy as np import pandas as pd @@ -10,10 +11,10 @@ import pandas as pd from .base import BaseParser from ..core import Table from ..utils import (text_in_bbox, get_table_index, compute_accuracy, - compute_whitespace, setup_logging) + compute_whitespace) -logger = setup_logging(__name__) +logger = logging.getLogger('camelot') class Stream(BaseParser): @@ -287,7 +288,7 @@ class Stream(BaseParser): else: ncols = max(set(elements), key=elements.count) if ncols == 1: - logger.info("No tables found on {}".format( + warnings.warn("No tables found on {}".format( os.path.basename(self.rootname))) cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol) @@ -344,11 +345,11 @@ class Stream(BaseParser): return table def extract_tables(self, filename): - logger.info('Processing {}'.format(os.path.basename(filename))) self._generate_layout(filename) + logger.info('Processing {}'.format(os.path.basename(self.rootname))) if not self.horizontal_text: - logger.info("No tables found on {}".format( + warnings.warn("No tables found on {}".format( os.path.basename(self.rootname))) return [] diff --git a/camelot/utils.py b/camelot/utils.py index 8bcbb87..3f30f88 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1,8 +1,8 @@ from __future__ import division import os import shutil -import logging import tempfile +import warnings from itertools import groupby from operator import itemgetter @@ -38,7 +38,7 @@ lattice_kwargs = [ ] -def validate_input(kwargs, flavor='lattice', geometry_type=False): +def validate_input(kwargs, flavor='lattice'): def check_intersection(parser_kwargs, input_kwargs): isec = set(parser_kwargs).intersection(set(input_kwargs.keys())) if isec: @@ -49,10 +49,6 @@ def validate_input(kwargs, flavor='lattice', geometry_type=False): check_intersection(stream_kwargs, kwargs) else: check_intersection(lattice_kwargs, kwargs) - if geometry_type: - if flavor != 'lattice' and geometry_type in ['contour', 'joint', 'line']: - raise ValueError("Use geometry_type='{}' with flavor='lattice'".format( - geometry_type)) def remove_extra(kwargs, flavor='lattice'): @@ -77,35 +73,6 @@ class TemporaryDirectory(object): shutil.rmtree(self.name) -def setup_logging(name): - """Sets up a logger with StreamHandler. - - Parameters - ---------- - name : str - - Returns - ------- - logger : logging.Logger - - """ - logger = logging.getLogger(name) - - format_string = '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s' - formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S') - - handler = logging.StreamHandler() - handler.setLevel(logging.INFO) - handler.setFormatter(formatter) - - logger.addHandler(handler) - - return logger - - -logger = setup_logging(__name__) - - def translate(x1, x2): """Translates x2 by x1. @@ -140,35 +107,6 @@ def scale(x, s): return x -def rotate(x1, y1, x2, y2, angle): - """Rotates point x2, y2 about point x1, y1 by angle. - - Parameters - ---------- - x1 : float - y1 : float - x2 : float - y2 : float - angle : float - Angle in radians. - - Returns - ------- - xnew : float - ynew : float - - """ - s = np.sin(angle) - c = np.cos(angle) - x2 = translate(-x1, x2) - y2 = translate(-y1, y2) - xnew = c * x2 - s * y2 - ynew = s * x2 + c * y2 - xnew = translate(x1, xnew) - ynew = translate(y1, ynew) - return xnew, ynew - - def scale_pdf(k, factors): """Translates and scales pdf coordinate space to image coordinate space. @@ -345,33 +283,6 @@ def text_in_bbox(bbox, text): return t_bbox -def remove_close_lines(ar, line_close_tol=2): - """Removes lines which are within a tolerance, based on their x or - y axis projections. - - Parameters - ---------- - ar : list - line_close_tol : int, optional (default: 2) - - Returns - ------- - ret : list - - """ - ret = [] - for a in ar: - if not ret: - ret.append(a) - else: - temp = ret[-1] - if np.isclose(temp, a, atol=line_close_tol): - pass - else: - ret.append(a) - return ret - - def merge_close_lines(ar, line_close_tol=2): """Merges lines which are within a tolerance by calculating a moving mean, based on their x or y axis projections. @@ -564,7 +475,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False): text = t.get_text().strip('\n') text_range = (t.x0, t.x1) col_range = (table.cols[0][0], table.cols[-1][1]) - logger.info("{} {} does not lie in column range {}".format( + warnings.warn("{} {} does not lie in column range {}".format( text, text_range, col_range)) r_idx = r c_idx = lt_col_overlap.index(max(lt_col_overlap)) @@ -648,27 +559,6 @@ def compute_whitespace(d): return whitespace -def remove_empty(d): - """Removes empty rows and columns from a two-dimensional list. - - Parameters - ---------- - d : list - - Returns - ------- - d : list - - """ - for i, row in enumerate(d): - if row == [''] * len(row): - d.pop(i) - d = zip(*d) - d = [list(row) for row in d if any(row)] - d = zip(*d) - return d - - def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1, detect_vertical=True, all_texts=True): """Returns a PDFMiner LTPage object and page dimension of a single @@ -755,17 +645,14 @@ def get_text_objects(layout, ltype="char", t=None): def merge_tuples(tuples): """Merges a list of overlapping tuples. - - Parameters + Parameters ---------- tuples : list List of tuples where a tuple is a single axis coordinate pair. - - Yields + Yields ------ tuple - - """ + """ merged = list(tuples[0]) for s, e in tuples: if s <= merged[1]: diff --git a/tests/files/blank.pdf b/tests/files/blank.pdf new file mode 100755 index 0000000..99540f1 Binary files /dev/null and b/tests/files/blank.pdf differ diff --git a/tests/files/foo.csv b/tests/files/foo.csv new file mode 100644 index 0000000..44a7d95 --- /dev/null +++ b/tests/files/foo.csv @@ -0,0 +1,2 @@ +"a","b" +"1","2" diff --git a/tests/files/foo.pdf b/tests/files/foo.pdf new file mode 100644 index 0000000..742e018 Binary files /dev/null and b/tests/files/foo.pdf differ diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100755 index 0000000..7c68785 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/tests/test_errors.py b/tests/test_errors.py new file mode 100755 index 0000000..c8a1f98 --- /dev/null +++ b/tests/test_errors.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- + +import os +import warnings + +import pytest + +import camelot + + +testdir = os.path.dirname(os.path.abspath(__file__)) +testdir = os.path.join(testdir, "files") +filename = os.path.join(testdir, 'foo.pdf') + + +def test_unknown_flavor(): + message = ("Unknown flavor specified." + " Use either 'lattice' or 'stream'") + with pytest.raises(NotImplementedError, message=message): + tables = camelot.read_pdf(filename, flavor='chocolate') + + +def test_input_kwargs(): + message = "columns cannot be used with flavor='lattice'" + with pytest.raises(ValueError, message=message): + tables = camelot.read_pdf(filename, columns=['10,20,30,40']) + + +def test_unsupported_format(): + message = 'File format not supported' + filename = os.path.join(testdir, 'foo.csv') + with pytest.raises(NotImplementedError, message=message): + tables = camelot.read_pdf(filename) + + +def test_stream_equal_length(): + message = ("Length of table_area and columns" + " should be equal") + with pytest.raises(ValueError, message=message): + tables = camelot.read_pdf(filename, flavor='stream', + table_area=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40']) + + +def test_no_tables_found(): + filename = os.path.join(testdir, 'blank.pdf') + # TODO: use pytest.warns + with warnings.catch_warnings(): + warnings.simplefilter('error') + try: + tables = camelot.read_pdf(filename) + except Exception as e: + assert type(e).__name__ == 'UserWarning' + assert str(e) == 'No tables found on page-1' \ No newline at end of file diff --git a/tests/test_plotting.py b/tests/test_plotting.py new file mode 100755 index 0000000..7c68785 --- /dev/null +++ b/tests/test_plotting.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- \ No newline at end of file