Lint and address PDFMiner version impact on tests

pull/127/head
Francois Huet 2020-04-06 12:47:23 -07:00
parent f0b2cffb17
commit f54e1563e1
5 changed files with 197 additions and 86 deletions

View File

@@ -17,6 +17,7 @@ TEXTEDGE_REQUIRED_ELEMENTS = 4
# maximum number of columns over which a header can spread
MAX_COL_SPREAD_IN_HEADER = 3


class TextEdge(object):
    """Defines a text edge's coordinates relative to a left-bottom
    origin (PDF coordinate space).
@@ -64,7 +65,8 @@ class TextEdge(object):
        the is_valid attribute.
        """
        if np.isclose(self.y0, y0, atol=edge_tol):
            self.x = (self.intersections * self.x + x) / \
                float(self.intersections + 1)
            self.y0 = y0
            self.intersections += 1
            # a textedge is valid only if it extends uninterrupted
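The wrapped expression in this hunk is an incremental mean: each textline whose y0 lands within edge_tol nudges the edge's x to the average of every coordinate seen so far, without storing them all. A minimal standalone sketch of the same update, with illustrative names rather than camelot's API:

    # Incremental mean: equivalent to averaging all observed values,
    # but keeps only the current mean and a counter. Illustrative only.
    def update_running_mean(mean, count, new_value):
        return (count * mean + new_value) / float(count + 1), count + 1

    mean, count = 100.0, 1
    mean, count = update_running_mean(mean, count, 102.0)  # mean == 101.0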
@@ -140,26 +142,38 @@ class TextEdges(object):
        """
        intersections_sum = {
            "left": sum(
                te.intersections for te in self._textedges["left"]
                if te.is_valid
            ),
            "right": sum(
                te.intersections for te in self._textedges["right"]
                if te.is_valid
            ),
            "middle": sum(
                te.intersections for te in self._textedges["middle"]
                if te.is_valid
            ),
        }
        # TODO: naive
        # get vertical textedges that intersect maximum number of
        # times with horizontal textlines
        relevant_align = max(
            intersections_sum.items(),
            key=itemgetter(1)
        )[0]
        return list(filter(
            lambda te: te.is_valid,
            self._textedges[relevant_align])
        )

    @staticmethod
    def _expand_area_for_header(area, textlines, col_anchors,
                                average_row_height):
        """The core algorithm is based on fairly strict alignment of text.
        It works ok for the table body, but might fail on tables' headers
        since they tend to be in a different font, alignment (e.g. vertical),
        etc.
        The section below tries to identify whether what's above the bbox
        identified so far has the characteristics of a table header:
        Close to the top of the body, with cells that fit within the bounds
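The alignment selection earlier in this hunk, a max over dict items keyed on the value, picks whichever of "left"/"right"/"middle" accumulated the most textline intersections. A self-contained sketch with made-up counts:

    from operator import itemgetter

    # Hypothetical intersection counts per alignment
    intersections_sum = {"left": 12, "right": 5, "middle": 9}

    # max ranks the (key, value) pairs by value; [0] recovers the key
    relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
    assert relevant_align == "left"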
@@ -174,10 +188,12 @@ class TextEdges(object):
            crossed by an element covering left to right.
            """
            indexLeft = 0
            while indexLeft < len(col_anchors) \
                    and col_anchors[indexLeft] < left:
                indexLeft += 1
            indexRight = indexLeft
            while indexRight < len(col_anchors) \
                    and col_anchors[indexRight] < right:
                indexRight += 1
            return indexRight - indexLeft
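Assuming col_anchors is sorted, the two loops count how many anchors fall in [left, right), i.e. how many columns the element spans. A worked example under that assumption (bisect_left gives the same answer as each loop):

    from bisect import bisect_left

    col_anchors = [50, 120, 190, 260, 330]  # illustrative x positions
    left, right = 115, 270
    spread = bisect_left(col_anchors, right) - bisect_left(col_anchors, left)
    assert spread == 3  # anchors 120, 190 and 260 are crossed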
@@ -193,14 +209,14 @@ class TextEdges(object):
            # higher than the table, directly within its bounds
            if te.y0 > top and te.x0 > left and te.x1 < right:
                all_above.append(te)
                if closest_above is None or closest_above.y0 > te.y0:
                    closest_above = te
        if closest_above and \
                closest_above.y0 < top + average_row_height:
            # b/ We have a candidate cell that is within the correct
            # vertical band, and directly above the table. Starting from
            # this anchor, we list all the textlines within the same row.
            tls_in_new_row = []
            top = closest_above.y1
            pushed_up = True
@@ -222,18 +238,20 @@ class TextEdges(object):
                        top = te.y1
                        pushed_up = True
            # Get the x-ranges for all the textlines, and merge the
            # x-ranges that overlap
            zones = zones + \
                list(map(lambda tl: [tl.x0, tl.x1], tls_in_new_row))
            zones.sort(key=lambda z: z[0])  # Sort by left coordinate
            # Starting from the right, if two zones overlap horizontally,
            # merge them
            merged_something = True
            while merged_something:
                merged_something = False
                for i in range(len(zones) - 1, 0, -1):
                    zone_right = zones[i]
                    zone_left = zones[i-1]
                    if zone_left[1] >= zone_right[0]:
                        zone_left[1] = max(zone_right[1], zone_left[1])
                        zones.pop(i)
                        merged_something = True
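The loop above is the classic merge of overlapping intervals on a list sorted by left coordinate. A left-to-right variant of the same idea, as a standalone sketch:

    # Merge overlapping [x0, x1] zones; input must be sorted by x0.
    def merge_zones(zones):
        merged = [list(zones[0])]
        for x0, x1 in zones[1:]:
            if x0 <= merged[-1][1]:  # overlaps the previous zone
                merged[-1][1] = max(merged[-1][1], x1)
            else:
                merged.append([x0, x1])
        return merged

    zones = sorted([[10, 40], [35, 60], [80, 90]])
    assert merge_zones(zones) == [[10, 60], [80, 90]]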
@@ -248,8 +266,8 @@ class TextEdges(object):
                )
            )
            if max_spread <= MAX_COL_SPREAD_IN_HEADER:
                # Combined, the elements we've identified don't cross more
                # than the authorized number of columns.
                # We're trying to avoid
                # 0: <BAD: Added header spans too broad>
                # 1: <A1> <B1> <C1> <D1> <E1>
@@ -257,7 +275,8 @@ class TextEdges(object):
                # if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS:
                new_area = (left, bottom, right, top)
                # At this stage we've identified a plausible row (or the
                # beginning of one).
                keep_searching = True
        return new_area
@@ -318,8 +337,8 @@ class TextEdges(object):
            )
            table_areas[updated_area] = None
        # Apply a heuristic to salvage headers whose formatting might be off
        # compared to the rest of the table.
        average_textline_height = sum_textline_height / \
            float(len(textlines))
@@ -398,7 +417,10 @@ class Cell(object):
    def __repr__(self):
        return "<Cell x1={} y1={} x2={} y2={}>".format(
            round(self.x1, 2),
            round(self.y1, 2),
            round(self.x2, 2),
            round(self.y2, 2)
        )

    @property
@@ -448,7 +470,9 @@ class Table(object):
    def __init__(self, cols, rows):
        self.cols = cols
        self.rows = rows
        self.cells = [
            [Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows
        ]
        self.df = None
        self.shape = (0, 0)
        self.accuracy = 0
@@ -685,7 +709,8 @@ class Table(object):
            Output filepath.
        """
        kw = {"encoding": "utf-8", "index": False, "header": False,
              "quoting": 1}
        kw.update(kwargs)
        self.df.to_csv(path, **kw)
@@ -798,7 +823,8 @@ class TableList(object):
        ext = kwargs.get("ext")
        for table in self._tables:
            filename = os.path.join(
                "{}-page-{}-table-{}{}".format(root, table.page, table.order,
                                               ext)
            )
            filepath = os.path.join(dirname, filename)
            to_format = self._format_func(table, f)
@@ -813,7 +839,10 @@ class TableList(object):
        with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
            for table in self._tables:
                filename = os.path.join(
                    "{}-page-{}-table-{}{}".format(root,
                                                   table.page,
                                                   table.order,
                                                   ext)
                )
                filepath = os.path.join(dirname, filename)
                z.write(filepath, os.path.basename(filepath))
@@ -848,7 +877,8 @@ class TableList(object):
        writer = pd.ExcelWriter(filepath)
        for table in self._tables:
            sheet_name = "page-{}-table-{}".format(table.page, table.order)
            table.df.to_excel(writer, sheet_name=sheet_name,
                              encoding="utf-8")
        writer.save()
        if compress:
            zipname = os.path.join(os.path.dirname(path), root) + ".zip"

View File

@@ -10,7 +10,8 @@ import pandas as pd
from .base import BaseParser
from ..core import TextEdges, Table
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
                     compute_whitespace)

logger = logging.getLogger("camelot")
@@ -124,8 +125,8 @@ class Stream(BaseParser):
            temp = []
            for t in text:
                # is checking for upright necessary?
                # if t.get_text().strip() and all([obj.upright for obj in t._objs
                # if type(obj) is LTChar]):
                if t.get_text().strip():
                    if not np.isclose(row_y, t.y0, atol=row_tol):
                        rows.append(sorted(temp, key=lambda t: t.x0))
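Row grouping here relies on np.isclose with an absolute tolerance: two textlines belong to the same row when their y0 values differ by at most row_tol. Illustratively, with made-up coordinates:

    import numpy as np

    row_tol = 2
    row_y, t_y0 = 700.0, 698.5  # hypothetical baselines
    assert np.isclose(row_y, t_y0, atol=row_tol)       # same row
    assert not np.isclose(row_y, 690.0, atol=row_tol)  # starts a new row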
@@ -170,7 +171,8 @@ class Stream(BaseParser):
                    merged.append(higher)
            elif column_tol < 0:
                if higher[0] <= lower[1]:
                    if np.isclose(higher[0], lower[1],
                                  atol=abs(column_tol)):
                        merged.append(higher)
                    else:
                        upper_bound = max(lower[1], higher[1])
@@ -200,8 +202,8 @@ class Stream(BaseParser):
        """
        row_boundaries = [
            [
                max(t.y1 for t in r),
                min(t.y0 for t in r)
            ]
            for r in rows_grouped
        ]
@@ -236,7 +238,9 @@ class Stream(BaseParser):
            text = Stream._group_rows(text, row_tol=row_tol)
            elements = [len(r) for r in text]
            new_cols = [
                (t.x0, t.x1)
                for r in text if len(r) == max(elements)
                for t in r
            ]
            cols.extend(Stream._merge_columns(sorted(new_cols)))
        return cols
@@ -268,7 +272,8 @@ class Stream(BaseParser):
    def _validate_columns(self):
        if self.table_areas is not None and self.columns is not None:
            if len(self.table_areas) != len(self.columns):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")

    def _nurminen_table_detection(self, textlines):
        """A general implementation of the table detection algorithm
@@ -290,7 +295,7 @@ class Stream(BaseParser):
        # guess table areas using textlines and relevant edges
        table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
        # treat whole page as table area if no table areas found
        if not table_bbox:
            table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
        return table_bbox
@@ -339,7 +344,8 @@ class Stream(BaseParser):
        self.t_bbox = t_bbox
        text_x_min, text_y_min, text_x_max, text_y_max = \
            self._text_bbox(self.t_bbox)
        rows_grouped = self._group_rows(t_bbox_all, row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
        elements = [len(r) for r in rows_grouped]
@@ -365,14 +371,19 @@ class Stream(BaseParser):
                # see if the list contains elements, if yes, then use
                # the mode after removing 1s
                elements = list(filter(lambda x: x != 1, elements))
                if elements:
                    ncols = max(set(elements), key=elements.count)
                else:
                    warnings.warn(
                        "No tables found in table area {}"
                        .format(table_idx + 1)
                    )
            cols = [
                (t.x0, t.x1) for r in rows_grouped if len(r) == ncols
                for t in r
            ]
            cols = self._merge_columns(sorted(cols),
                                       column_tol=self.column_tol)
            inner_text = []
            for i in range(1, len(cols)):
                left = cols[i - 1][1]
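The ncols choice in this hunk uses max(set(elements), key=elements.count), a compact way to take the mode of the row lengths; ties resolve arbitrarily by set iteration order. A tiny worked example:

    elements = [3, 5, 5, 5, 1, 5, 3]  # hypothetical cells-per-row counts
    elements = list(filter(lambda x: x != 1, elements))  # drop 1-cell rows
    ncols = max(set(elements), key=elements.count)
    assert ncols == 5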
@@ -442,20 +453,24 @@ class Stream(BaseParser):
        return table

    def extract_tables(self, filename, suppress_stdout=False,
                       layout_kwargs={}):
        self._generate_layout(filename, layout_kwargs)
        if not suppress_stdout:
            logger.info("Processing {}".format(
                os.path.basename(self.rootname)))
        if not self.horizontal_text:
            if self.images:
                warnings.warn(
                    "{} is image-based, camelot only works on"
                    " text-based pages.".format(
                        os.path.basename(self.rootname))
                )
            else:
                warnings.warn(
                    "No tables found on {}".format(
                        os.path.basename(self.rootname))
                )
            return []

View File

@@ -2742,21 +2742,28 @@ data_stream_vertical_headers = [
     '', 'Daniel G. Gauthier', 'Craig M. Clemens', 'Craig Johnston',
     'Carolyn Brummund', 'Adam Brege', 'David Bielusiak', ''],
    ['Alcona', '963', '439', '55', '26', '47', '164', '173', '111', '', '268',
     '', '272', '275', '269', '', '271', '', '224', '76', '', '', '', '',
     ''],
    ['Caledonia', '923', '393', '40', '23', '45', '158', '150', '103', '',
     '244', '', '247', '254', '255', '', '244', '', '139', '143', '', '',
     '', '', ''],
    ['Curtis', '1026', '349', '30', '30', '25', '102', '95', '84', '', '159',
     '', '164', '162', '161', '', '157', '', '', '', '', '', '', '', ''],
    ['Greenbush', '1212', '423', '56', '26', '40', '126', '104', '131', '',
     '208', '', '213', '214', '215', '', '208', '', '', '', '', '208', '',
     '', ''],
    ['Gustin', '611', '180', '22', '35', '17', '55', '73', '45', '', '108',
     '', '104', '111', '111', '', '109', '', '', '', '', '', '81', '42',
     ''],
    ['Harrisville', '1142', '430', '45', '90', '29', '101', '155', '94', '',
     '226', '', '226', '232', '244', '', '226', '', '', '', '232', '', '',
     '', ''],
    ['Hawes', '884', '293', '38', '36', '27', '109', '121', '84', '', '192',
     '', '195', '195', '193', '', '184', '', '', '', '', '', '118', '87',
     ''],
    ['Haynes', '626', '275', '31', '20', '32', '104', '121', '53', '', '163',
     '', '163', '173', '161', '', '152', '', '', '', '76', '', '69', '31',
     ''],
    ['Mikado', '781', '208', '19', '39', '17', '81', '90', '63', '', '149',
     '', '149', '145', '147', '', '143', '', '', '', '', '113', '', '', ''],
    ['Millen', '353', '139', '7', '16', '13', '38', '49', '19', '', '62',
@@ -2764,7 +2771,9 @@ data_stream_vertical_headers = [
    ['Mitchell', '327', '96', '12', '17', '7', '29', '41', '17', '', '57',
     '', '55', '57', '60', '', '56', '', '', '', '', '', '', '', ''],
    ['City Harrisville', '389', '171', '16', '15', '18', '35', '49', '31', '',
     '78', '', '80', '82', '81', '', '77', '', '', '', '73', '', '', '',
     ''],
    ['Totals', '9237', '3396', '371', '373', '317', '1102', '1221', '835', '0',
     '1914', '0', '1934', '1967', '1963', '0', '1889', '0', '363', '219',
     '381', '321', '268', '160', '0']
]

View File

@@ -2,6 +2,8 @@
import os

import pytest

import pandas as pd
from pandas.testing import assert_frame_equal
@@ -11,12 +13,30 @@ from camelot.__version__ import generate_version
from .data import *

import pdfminer

# The version of PDFMiner has an impact on some of the tests. Unfortunately,
# we can't enforce usage of a recent version of PDFMiner without dropping
# support for Python 2.
# To check the version of pdfminer.six installed:
# pip freeze | grep pdfminer.six
# To force upgrade:
# pip install --upgrade --force-reinstall pdfminer.six
# To force usage of a Python 2 compatible version:
# pip install "pdfminer.six==20191110"
# This condition can be removed in favor of a version requirement bump for
# pdfminer.six once support for Python 2 is dropped.
LEGACY_PDF_MINER = pdfminer.__version__ < "20200402"

testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files")


def test_parsing_report():
    parsing_report = {
        "accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1
    }
    filename = os.path.join(testdir, "foo.pdf")
    tables = camelot.read_pdf(filename)
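The LEGACY_PDF_MINER flag above leans on pdfminer.six's zero-padded YYYYMMDD version strings, for which plain lexicographic comparison happens to match chronological order. A quick check of that assumption (the dates below are illustrative):

    # Equal-width date-stamped versions sort correctly as strings
    assert "20191110" < "20200402"        # legacy release
    assert not ("20200517" < "20200402")  # newer release

    # The flag then gates individual tests, as in the hunks below:
    #   @pytest.mark.skipif(LEGACY_PDF_MINER, reason="...")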
@@ -64,6 +84,8 @@ def test_stream_table_rotated():
    assert_frame_equal(df, result_without_first_row)


@pytest.mark.skipif(LEGACY_PDF_MINER,
                    reason="depends on a recent version of PDFMiner")
def test_stream_two_tables():
    df1 = pd.DataFrame(data_stream_two_tables_1)
    df2 = pd.DataFrame(data_stream_two_tables_2)
@@ -106,6 +128,8 @@ def test_stream_columns():
    assert_frame_equal(df, tables[0].df)


@pytest.mark.skipif(LEGACY_PDF_MINER,
                    reason="depends on a recent version of PDFMiner")
def test_stream_split_text():
    df = pd.DataFrame(data_stream_split_text)
@@ -143,6 +167,8 @@ def test_stream_edge_tol():
    assert_frame_equal(df, tables[0].df)


@pytest.mark.skipif(LEGACY_PDF_MINER,
                    reason="depends on a recent version of PDFMiner")
def test_stream_layout_kwargs():
    df = pd.DataFrame(data_stream_layout_kwargs)
@@ -248,7 +274,8 @@ def test_repr():
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
        repr(tables[0].cells[0][0])
        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )
@@ -258,21 +285,24 @@ def test_pages():
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
        repr(tables[0].cells[0][0])
        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )

    tables = camelot.read_pdf(url, pages="1-end")
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
        repr(tables[0].cells[0][0])
        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )

    tables = camelot.read_pdf(url, pages="all")
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
        repr(tables[0].cells[0][0])
        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )
@@ -282,7 +312,8 @@ def test_url():
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
        repr(tables[0].cells[0][0])
        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )
@@ -302,7 +333,12 @@ def test_table_order():
        return t

    table_list = TableList(
        [
            _make_table(2, 1),
            _make_table(1, 1),
            _make_table(3, 4),
            _make_table(1, 2)
        ]
    )

    assert [(t.page, t.order) for t in sorted(table_list)] == [

View File

@@ -4,13 +4,30 @@ import os
import pytest

import pdfminer

import camelot

# The version of PDFMiner has an impact on some of the tests. Unfortunately,
# we can't enforce usage of a recent version of PDFMiner without dropping
# support for Python 2.
# To check the version of pdfminer.six installed:
# pip freeze | grep pdfminer.six
# To force upgrade:
# pip install --upgrade --force-reinstall pdfminer.six
# To force usage of a Python 2 compatible version:
# pip install "pdfminer.six==20191110"
# This condition can be removed in favor of a version requirement bump for
# pdfminer.six once support for Python 2 is dropped.
LEGACY_PDF_MINER = pdfminer.__version__ < "20200402"

testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files")


@pytest.mark.skipif(LEGACY_PDF_MINER,
                    reason="depends on a recent version of PDFMiner")
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True)
def test_text_plot():
@@ -35,6 +52,8 @@ def test_lattice_contour_plot():
    return camelot.plot(tables[0], kind='contour')


@pytest.mark.skipif(LEGACY_PDF_MINER,
                    reason="depends on a recent version of PDFMiner")
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True)
def test_stream_contour_plot():
@@ -59,6 +78,8 @@ def test_joint_plot():
    return camelot.plot(tables[0], kind='joint')


@pytest.mark.skipif(LEGACY_PDF_MINER,
                    reason="depends on a recent version of PDFMiner")
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True)
def test_textedge_plot():