[MRG] Add error/warning tests (#113)
* Add unknown flavor test
* Add input kwargs test
* Remove unused utils
* Add unsupported format test
* Add stream unequal tables-columns length test
* Add python3 compat
* Add no tables found test
* Convert util info log to warning

pull/2/head
parent
f1bf4309ec
commit
c5bde5e2ad
|
|
@ -1,5 +1,19 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
# set up logging
|
||||||
|
logger = logging.getLogger('camelot')
|
||||||
|
|
||||||
|
format_string = '%(asctime)s - %(levelname)s - %(message)s'
|
||||||
|
formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')
|
||||||
|
handler = logging.StreamHandler()
|
||||||
|
handler.setFormatter(formatter)
|
||||||
|
|
||||||
|
logger.addHandler(handler)
|
||||||
|
|
||||||
|
|
||||||
from .__version__ import __version__
|
from .__version__ import __version__
|
||||||
|
|
||||||
from .io import read_pdf
|
from .io import read_pdf
|
||||||
|
|
@ -1,6 +1,9 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from pprint import pprint
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger('camelot')
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
import click
|
import click
|
||||||
|
|
||||||
|
|
@ -38,7 +41,7 @@ pass_config = click.make_pass_decorator(Config)
|
||||||
def cli(ctx, *args, **kwargs):
|
def cli(ctx, *args, **kwargs):
|
||||||
"""Camelot: PDF Table Extraction for Humans"""
|
"""Camelot: PDF Table Extraction for Humans"""
|
||||||
ctx.obj = Config()
|
ctx.obj = Config()
|
||||||
for key, value in kwargs.iteritems():
|
for key, value in kwargs.items():
|
||||||
ctx.obj.set_config(key, value)
|
ctx.obj.set_config(key, value)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -447,18 +447,6 @@ class TableList(object):
|
||||||
def __getitem__(self, idx):
|
def __getitem__(self, idx):
|
||||||
return self._tables[idx]
|
return self._tables[idx]
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
self._n = 0
|
|
||||||
return self
|
|
||||||
|
|
||||||
def next(self):
|
|
||||||
if self._n < len(self):
|
|
||||||
r = self._tables[self._n]
|
|
||||||
self._n += 1
|
|
||||||
return r
|
|
||||||
else:
|
|
||||||
raise StopIteration
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _format_func(table, f):
|
def _format_func(table, f):
|
||||||
return getattr(table, 'to_{}'.format(f))
|
return getattr(table, 'to_{}'.format(f))
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,7 @@ class PDFHandler(object):
|
||||||
def __init__(self, filename, pages='1'):
|
def __init__(self, filename, pages='1'):
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
if not self.filename.endswith('.pdf'):
|
if not self.filename.endswith('.pdf'):
|
||||||
raise TypeError("File format not supported.")
|
raise NotImplementedError("File format not supported")
|
||||||
self.pages = self._get_pages(self.filename, pages)
|
self.pages = self._get_pages(self.filename, pages)
|
||||||
|
|
||||||
def _get_pages(self, filename, pages):
|
def _get_pages(self, filename, pages):
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ from __future__ import division
|
||||||
import os
|
import os
|
||||||
import copy
|
import copy
|
||||||
import logging
|
import logging
|
||||||
|
import warnings
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
@ -13,12 +14,12 @@ from .base import BaseParser
|
||||||
from ..core import Table
|
from ..core import Table
|
||||||
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
|
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
|
||||||
merge_close_lines, get_table_index, compute_accuracy,
|
merge_close_lines, get_table_index, compute_accuracy,
|
||||||
compute_whitespace, setup_logging)
|
compute_whitespace)
|
||||||
from ..image_processing import (adaptive_threshold, find_lines,
|
from ..image_processing import (adaptive_threshold, find_lines,
|
||||||
find_table_contours, find_table_joints)
|
find_table_contours, find_table_joints)
|
||||||
|
|
||||||
|
|
||||||
logger = setup_logging(__name__)
|
logger = logging.getLogger('camelot')
|
||||||
|
|
||||||
|
|
||||||
class Lattice(BaseParser):
|
class Lattice(BaseParser):
|
||||||
|
|
@ -305,11 +306,11 @@ class Lattice(BaseParser):
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename):
|
def extract_tables(self, filename):
|
||||||
logger.info('Processing {}'.format(os.path.basename(filename)))
|
|
||||||
self._generate_layout(filename)
|
self._generate_layout(filename)
|
||||||
|
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
logger.info("No tables found on {}".format(
|
warnings.warn("No tables found on {}".format(
|
||||||
os.path.basename(self.rootname)))
|
os.path.basename(self.rootname)))
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
import warnings
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
@ -10,10 +11,10 @@ import pandas as pd
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..core import Table
|
from ..core import Table
|
||||||
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
||||||
compute_whitespace, setup_logging)
|
compute_whitespace)
|
||||||
|
|
||||||
|
|
||||||
logger = setup_logging(__name__)
|
logger = logging.getLogger('camelot')
|
||||||
|
|
||||||
|
|
||||||
class Stream(BaseParser):
|
class Stream(BaseParser):
|
||||||
|
|
@ -287,7 +288,7 @@ class Stream(BaseParser):
|
||||||
else:
|
else:
|
||||||
ncols = max(set(elements), key=elements.count)
|
ncols = max(set(elements), key=elements.count)
|
||||||
if ncols == 1:
|
if ncols == 1:
|
||||||
logger.info("No tables found on {}".format(
|
warnings.warn("No tables found on {}".format(
|
||||||
os.path.basename(self.rootname)))
|
os.path.basename(self.rootname)))
|
||||||
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
|
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
|
||||||
cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol)
|
cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol)
|
||||||
|
|
@ -344,11 +345,11 @@ class Stream(BaseParser):
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename):
|
def extract_tables(self, filename):
|
||||||
logger.info('Processing {}'.format(os.path.basename(filename)))
|
|
||||||
self._generate_layout(filename)
|
self._generate_layout(filename)
|
||||||
|
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
logger.info("No tables found on {}".format(
|
warnings.warn("No tables found on {}".format(
|
||||||
os.path.basename(self.rootname)))
|
os.path.basename(self.rootname)))
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
|
||||||
119
camelot/utils.py
119
camelot/utils.py
|
|
@ -1,8 +1,8 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import logging
|
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import warnings
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
|
||||||
|
|
@ -38,7 +38,7 @@ lattice_kwargs = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def validate_input(kwargs, flavor='lattice', geometry_type=False):
|
def validate_input(kwargs, flavor='lattice'):
|
||||||
def check_intersection(parser_kwargs, input_kwargs):
|
def check_intersection(parser_kwargs, input_kwargs):
|
||||||
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
||||||
if isec:
|
if isec:
|
||||||
|
|
@ -49,10 +49,6 @@ def validate_input(kwargs, flavor='lattice', geometry_type=False):
|
||||||
check_intersection(stream_kwargs, kwargs)
|
check_intersection(stream_kwargs, kwargs)
|
||||||
else:
|
else:
|
||||||
check_intersection(lattice_kwargs, kwargs)
|
check_intersection(lattice_kwargs, kwargs)
|
||||||
if geometry_type:
|
|
||||||
if flavor != 'lattice' and geometry_type in ['contour', 'joint', 'line']:
|
|
||||||
raise ValueError("Use geometry_type='{}' with flavor='lattice'".format(
|
|
||||||
geometry_type))
|
|
||||||
|
|
||||||
|
|
||||||
def remove_extra(kwargs, flavor='lattice'):
|
def remove_extra(kwargs, flavor='lattice'):
|
||||||
|
|
@ -77,35 +73,6 @@ class TemporaryDirectory(object):
|
||||||
shutil.rmtree(self.name)
|
shutil.rmtree(self.name)
|
||||||
|
|
||||||
|
|
||||||
def setup_logging(name):
|
|
||||||
"""Sets up a logger with StreamHandler.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
name : str
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
logger : logging.Logger
|
|
||||||
|
|
||||||
"""
|
|
||||||
logger = logging.getLogger(name)
|
|
||||||
|
|
||||||
format_string = '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s'
|
|
||||||
formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')
|
|
||||||
|
|
||||||
handler = logging.StreamHandler()
|
|
||||||
handler.setLevel(logging.INFO)
|
|
||||||
handler.setFormatter(formatter)
|
|
||||||
|
|
||||||
logger.addHandler(handler)
|
|
||||||
|
|
||||||
return logger
|
|
||||||
|
|
||||||
|
|
||||||
logger = setup_logging(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def translate(x1, x2):
|
def translate(x1, x2):
|
||||||
"""Translates x2 by x1.
|
"""Translates x2 by x1.
|
||||||
|
|
||||||
|
|
@ -140,35 +107,6 @@ def scale(x, s):
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def rotate(x1, y1, x2, y2, angle):
|
|
||||||
"""Rotates point x2, y2 about point x1, y1 by angle.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
x1 : float
|
|
||||||
y1 : float
|
|
||||||
x2 : float
|
|
||||||
y2 : float
|
|
||||||
angle : float
|
|
||||||
Angle in radians.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
xnew : float
|
|
||||||
ynew : float
|
|
||||||
|
|
||||||
"""
|
|
||||||
s = np.sin(angle)
|
|
||||||
c = np.cos(angle)
|
|
||||||
x2 = translate(-x1, x2)
|
|
||||||
y2 = translate(-y1, y2)
|
|
||||||
xnew = c * x2 - s * y2
|
|
||||||
ynew = s * x2 + c * y2
|
|
||||||
xnew = translate(x1, xnew)
|
|
||||||
ynew = translate(y1, ynew)
|
|
||||||
return xnew, ynew
|
|
||||||
|
|
||||||
|
|
||||||
def scale_pdf(k, factors):
|
def scale_pdf(k, factors):
|
||||||
"""Translates and scales pdf coordinate space to image
|
"""Translates and scales pdf coordinate space to image
|
||||||
coordinate space.
|
coordinate space.
|
||||||
|
|
@ -345,33 +283,6 @@ def text_in_bbox(bbox, text):
|
||||||
return t_bbox
|
return t_bbox
|
||||||
|
|
||||||
|
|
||||||
def remove_close_lines(ar, line_close_tol=2):
|
|
||||||
"""Removes lines which are within a tolerance, based on their x or
|
|
||||||
y axis projections.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
ar : list
|
|
||||||
line_close_tol : int, optional (default: 2)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
ret : list
|
|
||||||
|
|
||||||
"""
|
|
||||||
ret = []
|
|
||||||
for a in ar:
|
|
||||||
if not ret:
|
|
||||||
ret.append(a)
|
|
||||||
else:
|
|
||||||
temp = ret[-1]
|
|
||||||
if np.isclose(temp, a, atol=line_close_tol):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
ret.append(a)
|
|
||||||
return ret
|
|
||||||
|
|
||||||
|
|
||||||
def merge_close_lines(ar, line_close_tol=2):
|
def merge_close_lines(ar, line_close_tol=2):
|
||||||
"""Merges lines which are within a tolerance by calculating a
|
"""Merges lines which are within a tolerance by calculating a
|
||||||
moving mean, based on their x or y axis projections.
|
moving mean, based on their x or y axis projections.
|
||||||
|
|
@ -564,7 +475,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
|
||||||
text = t.get_text().strip('\n')
|
text = t.get_text().strip('\n')
|
||||||
text_range = (t.x0, t.x1)
|
text_range = (t.x0, t.x1)
|
||||||
col_range = (table.cols[0][0], table.cols[-1][1])
|
col_range = (table.cols[0][0], table.cols[-1][1])
|
||||||
logger.info("{} {} does not lie in column range {}".format(
|
warnings.warn("{} {} does not lie in column range {}".format(
|
||||||
text, text_range, col_range))
|
text, text_range, col_range))
|
||||||
r_idx = r
|
r_idx = r
|
||||||
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
||||||
|
|
@ -648,27 +559,6 @@ def compute_whitespace(d):
|
||||||
return whitespace
|
return whitespace
|
||||||
|
|
||||||
|
|
||||||
def remove_empty(d):
|
|
||||||
"""Removes empty rows and columns from a two-dimensional list.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
d : list
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
d : list
|
|
||||||
|
|
||||||
"""
|
|
||||||
for i, row in enumerate(d):
|
|
||||||
if row == [''] * len(row):
|
|
||||||
d.pop(i)
|
|
||||||
d = zip(*d)
|
|
||||||
d = [list(row) for row in d if any(row)]
|
|
||||||
d = zip(*d)
|
|
||||||
return d
|
|
||||||
|
|
||||||
|
|
||||||
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||||
detect_vertical=True, all_texts=True):
|
detect_vertical=True, all_texts=True):
|
||||||
"""Returns a PDFMiner LTPage object and page dimension of a single
|
"""Returns a PDFMiner LTPage object and page dimension of a single
|
||||||
|
|
@ -755,16 +645,13 @@ def get_text_objects(layout, ltype="char", t=None):
|
||||||
|
|
||||||
def merge_tuples(tuples):
|
def merge_tuples(tuples):
|
||||||
"""Merges a list of overlapping tuples.
|
"""Merges a list of overlapping tuples.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
tuples : list
|
tuples : list
|
||||||
List of tuples where a tuple is a single axis coordinate pair.
|
List of tuples where a tuple is a single axis coordinate pair.
|
||||||
|
|
||||||
Yields
|
Yields
|
||||||
------
|
------
|
||||||
tuple
|
tuple
|
||||||
|
|
||||||
"""
|
"""
|
||||||
merged = list(tuples[0])
|
merged = list(tuples[0])
|
||||||
for s, e in tuples:
|
for s, e in tuples:
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -0,0 +1,2 @@
|
||||||
|
"a","b"
|
||||||
|
"1","2"
|
||||||
|
Binary file not shown.
|
|
@ -0,0 +1 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
@ -0,0 +1,53 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import camelot
|
||||||
|
|
||||||
|
|
||||||
|
testdir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
testdir = os.path.join(testdir, "files")
|
||||||
|
filename = os.path.join(testdir, 'foo.pdf')
|
||||||
|
|
||||||
|
|
||||||
|
def test_unknown_flavor():
|
||||||
|
message = ("Unknown flavor specified."
|
||||||
|
" Use either 'lattice' or 'stream'")
|
||||||
|
with pytest.raises(NotImplementedError, message=message):
|
||||||
|
tables = camelot.read_pdf(filename, flavor='chocolate')
|
||||||
|
|
||||||
|
|
||||||
|
def test_input_kwargs():
|
||||||
|
message = "columns cannot be used with flavor='lattice'"
|
||||||
|
with pytest.raises(ValueError, message=message):
|
||||||
|
tables = camelot.read_pdf(filename, columns=['10,20,30,40'])
|
||||||
|
|
||||||
|
|
||||||
|
def test_unsupported_format():
|
||||||
|
message = 'File format not supported'
|
||||||
|
filename = os.path.join(testdir, 'foo.csv')
|
||||||
|
with pytest.raises(NotImplementedError, message=message):
|
||||||
|
tables = camelot.read_pdf(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_equal_length():
|
||||||
|
message = ("Length of table_area and columns"
|
||||||
|
" should be equal")
|
||||||
|
with pytest.raises(ValueError, message=message):
|
||||||
|
tables = camelot.read_pdf(filename, flavor='stream',
|
||||||
|
table_area=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_tables_found():
|
||||||
|
filename = os.path.join(testdir, 'blank.pdf')
|
||||||
|
# TODO: use pytest.warns
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter('error')
|
||||||
|
try:
|
||||||
|
tables = camelot.read_pdf(filename)
|
||||||
|
except Exception as e:
|
||||||
|
assert type(e).__name__ == 'UserWarning'
|
||||||
|
assert str(e) == 'No tables found on page-1'
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
Loading…
Reference in New Issue