[MRG] Add error/warning tests (#113)

* Add unknown flavor test

* Add input kwargs test

* Remove unused utils

* Add unsupported format test

* Add stream unequal tables-columns length test

* Add python3 compat

* Add no tables found test

* Convert util info log to warning
pull/2/head
Vinayak Mehta 2018-10-02 19:28:42 +05:30 committed by GitHub
parent f1bf4309ec
commit c5bde5e2ad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 94 additions and 143 deletions

View File

@ -1,5 +1,19 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import logging
# Set up the package-wide 'camelot' logger. Other modules obtain the same
# logger object via logging.getLogger('camelot') instead of building their own.
logger = logging.getLogger('camelot')
# Record layout: ISO-like timestamp, level name, then the message text.
format_string = '%(asctime)s - %(levelname)s - %(message)s'
formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')
# StreamHandler() without an argument writes to sys.stderr.
handler = logging.StreamHandler()
handler.setFormatter(formatter)
# No level is set here; the effective level is left to whoever configures
# the 'camelot' logger (or to the logging defaults).
logger.addHandler(handler)
from .__version__ import __version__ from .__version__ import __version__
from .io import read_pdf from .io import read_pdf

View File

@ -1,6 +1,9 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from pprint import pprint import logging
logger = logging.getLogger('camelot')
logger.setLevel(logging.INFO)
import click import click
@ -38,7 +41,7 @@ pass_config = click.make_pass_decorator(Config)
def cli(ctx, *args, **kwargs): def cli(ctx, *args, **kwargs):
"""Camelot: PDF Table Extraction for Humans""" """Camelot: PDF Table Extraction for Humans"""
ctx.obj = Config() ctx.obj = Config()
for key, value in kwargs.iteritems(): for key, value in kwargs.items():
ctx.obj.set_config(key, value) ctx.obj.set_config(key, value)

View File

@ -447,18 +447,6 @@ class TableList(object):
def __getitem__(self, idx): def __getitem__(self, idx):
return self._tables[idx] return self._tables[idx]
def __iter__(self):
    """Reset the iteration cursor and return this object as its own iterator.

    The cursor lives on the instance (``self._n``), so two loops over the
    same object at the same time share their position.
    """
    self._n = 0
    return self

def next(self):
    """Return the table at the cursor and advance it.

    Raises
    ------
    StopIteration
        When the cursor has moved past the last table.

    """
    if self._n >= len(self):
        raise StopIteration
    table = self._tables[self._n]
    self._n += 1
    return table

# Python 3 looks up ``__next__``; Python 2 looks up ``next``. Exposing both
# names keeps ``for table in tables`` working on either interpreter.
__next__ = next
@staticmethod @staticmethod
def _format_func(table, f): def _format_func(table, f):
return getattr(table, 'to_{}'.format(f)) return getattr(table, 'to_{}'.format(f))

View File

@ -27,7 +27,7 @@ class PDFHandler(object):
def __init__(self, filename, pages='1'): def __init__(self, filename, pages='1'):
self.filename = filename self.filename = filename
if not self.filename.endswith('.pdf'): if not self.filename.endswith('.pdf'):
raise TypeError("File format not supported.") raise NotImplementedError("File format not supported")
self.pages = self._get_pages(self.filename, pages) self.pages = self._get_pages(self.filename, pages)
def _get_pages(self, filename, pages): def _get_pages(self, filename, pages):

View File

@ -4,6 +4,7 @@ from __future__ import division
import os import os
import copy import copy
import logging import logging
import warnings
import subprocess import subprocess
import numpy as np import numpy as np
@ -13,12 +14,12 @@ from .base import BaseParser
from ..core import Table from ..core import Table
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
merge_close_lines, get_table_index, compute_accuracy, merge_close_lines, get_table_index, compute_accuracy,
compute_whitespace, setup_logging) compute_whitespace)
from ..image_processing import (adaptive_threshold, find_lines, from ..image_processing import (adaptive_threshold, find_lines,
find_table_contours, find_table_joints) find_table_contours, find_table_joints)
logger = setup_logging(__name__) logger = logging.getLogger('camelot')
class Lattice(BaseParser): class Lattice(BaseParser):
@ -305,11 +306,11 @@ class Lattice(BaseParser):
return table return table
def extract_tables(self, filename): def extract_tables(self, filename):
logger.info('Processing {}'.format(os.path.basename(filename)))
self._generate_layout(filename) self._generate_layout(filename)
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
if not self.horizontal_text: if not self.horizontal_text:
logger.info("No tables found on {}".format( warnings.warn("No tables found on {}".format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))
return [] return []

View File

@ -3,6 +3,7 @@
from __future__ import division from __future__ import division
import os import os
import logging import logging
import warnings
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -10,10 +11,10 @@ import pandas as pd
from .base import BaseParser from .base import BaseParser
from ..core import Table from ..core import Table
from ..utils import (text_in_bbox, get_table_index, compute_accuracy, from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
compute_whitespace, setup_logging) compute_whitespace)
logger = setup_logging(__name__) logger = logging.getLogger('camelot')
class Stream(BaseParser): class Stream(BaseParser):
@ -287,7 +288,7 @@ class Stream(BaseParser):
else: else:
ncols = max(set(elements), key=elements.count) ncols = max(set(elements), key=elements.count)
if ncols == 1: if ncols == 1:
logger.info("No tables found on {}".format( warnings.warn("No tables found on {}".format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol) cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol)
@ -344,11 +345,11 @@ class Stream(BaseParser):
return table return table
def extract_tables(self, filename): def extract_tables(self, filename):
logger.info('Processing {}'.format(os.path.basename(filename)))
self._generate_layout(filename) self._generate_layout(filename)
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
if not self.horizontal_text: if not self.horizontal_text:
logger.info("No tables found on {}".format( warnings.warn("No tables found on {}".format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))
return [] return []

View File

@ -1,8 +1,8 @@
from __future__ import division from __future__ import division
import os import os
import shutil import shutil
import logging
import tempfile import tempfile
import warnings
from itertools import groupby from itertools import groupby
from operator import itemgetter from operator import itemgetter
@ -38,7 +38,7 @@ lattice_kwargs = [
] ]
def validate_input(kwargs, flavor='lattice', geometry_type=False): def validate_input(kwargs, flavor='lattice'):
def check_intersection(parser_kwargs, input_kwargs): def check_intersection(parser_kwargs, input_kwargs):
isec = set(parser_kwargs).intersection(set(input_kwargs.keys())) isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
if isec: if isec:
@ -49,10 +49,6 @@ def validate_input(kwargs, flavor='lattice', geometry_type=False):
check_intersection(stream_kwargs, kwargs) check_intersection(stream_kwargs, kwargs)
else: else:
check_intersection(lattice_kwargs, kwargs) check_intersection(lattice_kwargs, kwargs)
if geometry_type:
if flavor != 'lattice' and geometry_type in ['contour', 'joint', 'line']:
raise ValueError("Use geometry_type='{}' with flavor='lattice'".format(
geometry_type))
def remove_extra(kwargs, flavor='lattice'): def remove_extra(kwargs, flavor='lattice'):
@ -77,35 +73,6 @@ class TemporaryDirectory(object):
shutil.rmtree(self.name) shutil.rmtree(self.name)
def setup_logging(name):
    """Return the logger called ``name`` with a stream handler attached.

    A handler is attached only when the logger has none yet, so calling
    this function repeatedly with the same name does not make every
    record print more than once.

    Parameters
    ----------
    name : str
        Name passed to ``logging.getLogger``.

    Returns
    -------
    logger : logging.Logger

    """
    logger = logging.getLogger(name)
    if logger.handlers:
        # Already configured (by an earlier call or by the application).
        return logger

    format_string = '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s'
    formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')
    handler = logging.StreamHandler()
    handler.setLevel(logging.INFO)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    return logger
logger = setup_logging(__name__)
def translate(x1, x2): def translate(x1, x2):
"""Translates x2 by x1. """Translates x2 by x1.
@ -140,35 +107,6 @@ def scale(x, s):
return x return x
def rotate(x1, y1, x2, y2, angle):
    """Rotate the point (x2, y2) around the pivot (x1, y1).

    Parameters
    ----------
    x1 : float
    y1 : float
    x2 : float
    y2 : float
    angle : float
        Angle in radians.

    Returns
    -------
    xnew : float
    ynew : float

    """
    sin_a, cos_a = np.sin(angle), np.cos(angle)
    # Express the point relative to the pivot before rotating.
    dx = translate(-x1, x2)
    dy = translate(-y1, y2)
    rotated_x = cos_a * dx - sin_a * dy
    rotated_y = sin_a * dx + cos_a * dy
    # Move the rotated offsets back by the pivot coordinates.
    return translate(x1, rotated_x), translate(y1, rotated_y)
def scale_pdf(k, factors): def scale_pdf(k, factors):
"""Translates and scales pdf coordinate space to image """Translates and scales pdf coordinate space to image
coordinate space. coordinate space.
@ -345,33 +283,6 @@ def text_in_bbox(bbox, text):
return t_bbox return t_bbox
def remove_close_lines(ar, line_close_tol=2):
    """Drop every line that lies within a tolerance of the last kept
    line, based on their x or y axis projections.

    Parameters
    ----------
    ar : list
    line_close_tol : int, optional (default: 2)

    Returns
    -------
    ret : list

    """
    kept = []
    for value in ar:
        # Compare against the most recently kept entry only.
        if kept and np.isclose(kept[-1], value, atol=line_close_tol):
            continue
        kept.append(value)
    return kept
def merge_close_lines(ar, line_close_tol=2): def merge_close_lines(ar, line_close_tol=2):
"""Merges lines which are within a tolerance by calculating a """Merges lines which are within a tolerance by calculating a
moving mean, based on their x or y axis projections. moving mean, based on their x or y axis projections.
@ -564,7 +475,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
text = t.get_text().strip('\n') text = t.get_text().strip('\n')
text_range = (t.x0, t.x1) text_range = (t.x0, t.x1)
col_range = (table.cols[0][0], table.cols[-1][1]) col_range = (table.cols[0][0], table.cols[-1][1])
logger.info("{} {} does not lie in column range {}".format( warnings.warn("{} {} does not lie in column range {}".format(
text, text_range, col_range)) text, text_range, col_range))
r_idx = r r_idx = r
c_idx = lt_col_overlap.index(max(lt_col_overlap)) c_idx = lt_col_overlap.index(max(lt_col_overlap))
@ -648,27 +559,6 @@ def compute_whitespace(d):
return whitespace return whitespace
def remove_empty(d):
    """Removes empty rows and columns from a two-dimensional list.

    A row is empty when it is a list holding only empty strings; a column
    is empty when none of its cells is truthy. The input is left
    unmodified.

    Parameters
    ----------
    d : list

    Returns
    -------
    d : list
        Remaining rows, each as a tuple.

    """
    # Build a new list instead of popping while iterating: popping shifts
    # the following row into the current index, so the second of two
    # consecutive empty rows used to survive.
    rows = [row for row in d if row != [''] * len(row)]
    columns = [list(col) for col in zip(*rows) if any(col)]
    # Materialise the result; on Python 3 a bare zip() is a one-shot iterator.
    return list(zip(*columns))
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1, def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
detect_vertical=True, all_texts=True): detect_vertical=True, all_texts=True):
"""Returns a PDFMiner LTPage object and page dimension of a single """Returns a PDFMiner LTPage object and page dimension of a single
@ -755,16 +645,13 @@ def get_text_objects(layout, ltype="char", t=None):
def merge_tuples(tuples): def merge_tuples(tuples):
"""Merges a list of overlapping tuples. """Merges a list of overlapping tuples.
Parameters Parameters
---------- ----------
tuples : list tuples : list
List of tuples where a tuple is a single axis coordinate pair. List of tuples where a tuple is a single axis coordinate pair.
Yields Yields
------ ------
tuple tuple
""" """
merged = list(tuples[0]) merged = list(tuples[0])
for s, e in tuples: for s, e in tuples:

Binary file not shown.

View File

@ -0,0 +1,2 @@
"a","b"
"1","2"
1 a b
2 1 2

BIN
tests/files/foo.pdf 100644

Binary file not shown.

View File

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

View File

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
import os
import warnings
import pytest
import camelot
testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files")
filename = os.path.join(testdir, 'foo.pdf')
def test_unknown_flavor():
    """An unrecognised ``flavor`` must raise NotImplementedError."""
    message = ("Unknown flavor specified."
               " Use either 'lattice' or 'stream'")
    # ``match`` checks the raised message (regex search). ``message`` only
    # set the failure text shown when nothing was raised, and pytest 5
    # removed it.
    with pytest.raises(NotImplementedError, match=message):
        camelot.read_pdf(filename, flavor='chocolate')
def test_input_kwargs():
    """Passing ``columns`` without ``flavor='stream'`` must raise ValueError."""
    message = "columns cannot be used with flavor='lattice'"
    # ``match`` asserts on the exception text; the former ``message``
    # keyword never inspected the raised exception.
    with pytest.raises(ValueError, match=message):
        camelot.read_pdf(filename, columns=['10,20,30,40'])
def test_unsupported_format():
    """A path that does not end in ``.pdf`` must raise NotImplementedError."""
    message = 'File format not supported'
    # Local override of the module-level ``filename`` fixture path.
    filename = os.path.join(testdir, 'foo.csv')
    # ``match`` asserts on the exception text; the former ``message``
    # keyword never inspected the raised exception.
    with pytest.raises(NotImplementedError, match=message):
        camelot.read_pdf(filename)
def test_stream_equal_length():
    """Unequal numbers of table areas and column lists must raise ValueError."""
    message = ("Length of table_area and columns"
               " should be equal")
    # ``match`` asserts on the exception text; the former ``message``
    # keyword never inspected the raised exception.
    with pytest.raises(ValueError, match=message):
        camelot.read_pdf(
            filename, flavor='stream',
            table_area=['10,20,30,40'],
            columns=['10,20,30,40', '10,20,30,40'])
def test_no_tables_found():
    """Reading ``blank.pdf`` must emit a 'No tables found' UserWarning."""
    filename = os.path.join(testdir, 'blank.pdf')
    with warnings.catch_warnings():
        # Turn warnings into exceptions so the first one aborts read_pdf.
        warnings.simplefilter('error')
        # pytest.raises fails the test when nothing is raised; the former
        # try/except passed silently in that case.
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(filename)
    assert str(excinfo.value) == 'No tables found on page-1'

View File

@ -0,0 +1 @@
# -*- coding: utf-8 -*-