Lint and address PDFMiner version impact on tests

parent f0b2cffb17
commit f54e1563e1
camelot/core.py

@@ -17,6 +17,7 @@ TEXTEDGE_REQUIRED_ELEMENTS = 4
 # maximum number of columns over which a header can spread
 MAX_COL_SPREAD_IN_HEADER = 3
 
+
 class TextEdge(object):
     """Defines a text edge coordinates relative to a left-bottom
     origin. (PDF coordinate space)
@@ -64,7 +65,8 @@ class TextEdge(object):
         the is_valid attribute.
         """
         if np.isclose(self.y0, y0, atol=edge_tol):
-            self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
+            self.x = (self.intersections * self.x + x) / \
+                float(self.intersections + 1)
             self.y0 = y0
             self.intersections += 1
             # a textedge is valid only if it extends uninterrupted
@@ -140,26 +142,38 @@ class TextEdges(object):
         """
         intersections_sum = {
             "left": sum(
-                te.intersections for te in self._textedges["left"] if te.is_valid
+                te.intersections for te in self._textedges["left"]
+                if te.is_valid
             ),
             "right": sum(
-                te.intersections for te in self._textedges["right"] if te.is_valid
+                te.intersections for te in self._textedges["right"]
+                if te.is_valid
             ),
             "middle": sum(
-                te.intersections for te in self._textedges["middle"] if te.is_valid
+                te.intersections for te in self._textedges["middle"]
+                if te.is_valid
             ),
         }
 
         # TODO: naive
         # get vertical textedges that intersect maximum number of
        # times with horizontal textlines
-        relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
-        return list(filter(lambda te: te.is_valid, self._textedges[relevant_align]))
+        relevant_align = max(
+            intersections_sum.items(),
+            key=itemgetter(1)
+        )[0]
+        return list(filter(
+            lambda te: te.is_valid,
+            self._textedges[relevant_align])
+        )
 
-    def _expand_area_for_header(self, area, textlines, col_anchors, average_row_height):
-        """The core algorithm is based on fairly strict alignment of text. It works
-        ok for the table body, but might fail on tables' headers since they
-        tend to be in a different font, alignment (e.g. vertical), etc.
+    @staticmethod
+    def _expand_area_for_header(area, textlines, col_anchors,
+                                average_row_height):
+        """The core algorithm is based on fairly strict alignment of text.
+        It works ok for the table body, but might fail on tables' headers
+        since they tend to be in a different font, alignment (e.g. vertical),
+        etc.
         The section below tries to identify whether what's above the bbox
         identified so far has the characteristics of a table header:
         Close to the top of the body, with cells that fit within the bounds
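Note on the alignment selection reflowed above: it is just a max over a counter dict keyed on intersection counts. A minimal standalone sketch, with invented counts:

```python
from operator import itemgetter

# Toy intersection counts per alignment, mirroring the shape of
# intersections_sum in the hunk above (values are made up).
intersections_sum = {"left": 12, "right": 3, "middle": 7}

# max() over dict items, keyed on the count, returns the
# (alignment, count) pair with the highest count; [0] keeps the name.
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
print(relevant_align)  # -> "left"
```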
@@ -174,10 +188,12 @@ class TextEdges(object):
             crossed by an element covering left to right.
             """
             indexLeft = 0
-            while indexLeft < len(col_anchors) and col_anchors[indexLeft] < left:
+            while indexLeft < len(col_anchors) \
+                    and col_anchors[indexLeft] < left:
                 indexLeft += 1
             indexRight = indexLeft
-            while indexRight < len(col_anchors) and col_anchors[indexRight] < right:
+            while indexRight < len(col_anchors) \
+                    and col_anchors[indexRight] < right:
                 indexRight += 1
 
             return indexRight - indexLeft
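The two-pointer loops above count how many column anchors a horizontal element spans. A self-contained sketch of the same logic (function name and example values are illustrative, not from the source):

```python
def count_anchors_crossed(col_anchors, left, right):
    """Count how many column anchors lie between left and right, i.e.
    how many column boundaries a horizontal element spans. Mirrors the
    two while loops in the hunk above."""
    i = 0
    while i < len(col_anchors) and col_anchors[i] < left:
        i += 1
    j = i
    while j < len(col_anchors) and col_anchors[j] < right:
        j += 1
    return j - i

# A header cell spanning x in [95, 210] over anchors at 0/100/200/300
# crosses two anchors, so its spread is 2 columns.
print(count_anchors_crossed([0, 100, 200, 300], 95, 210))  # -> 2
```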
@@ -193,14 +209,14 @@ class TextEdges(object):
             # higher than the table, directly within its bounds
             if te.y0 > top and te.x0 > left and te.x1 < right:
                 all_above.append(te)
-                if closest_above == None or closest_above.y0 > te.y0:
+                if closest_above is None or closest_above.y0 > te.y0:
                     closest_above = te
 
         if closest_above and \
                 closest_above.y0 < top + average_row_height:
-            # b/ We have a candidate cell that is within the correct vertical band,
-            # and directly above the table. Starting from this anchor, we list
-            # all the textlines within the same row.
+            # b/ We have a candidate cell that is within the correct
+            # vertical band, and directly above the table. Starting from
+            # this anchor, we list all the textlines within the same row.
             tls_in_new_row = []
             top = closest_above.y1
             pushed_up = True
@@ -222,18 +238,20 @@ class TextEdges(object):
                         top = te.y1
                         pushed_up = True
 
-            # Get the x-ranges for all the textlines, and merge the x-ranges that overlap
+            # Get the x-ranges for all the textlines, and merge the
+            # x-ranges that overlap
             zones = zones + \
                 list(map(lambda tl: [tl.x0, tl.x1], tls_in_new_row))
             zones.sort(key=lambda z: z[0])  # Sort by left coordinate
-            # Starting from the right, if two zones overlap horizontally, merge them
+            # Starting from the right, if two zones overlap horizontally,
+            # merge them
             merged_something = True
             while merged_something:
                 merged_something = False
                 for i in range(len(zones) - 1, 0, -1):
                     zone_right = zones[i]
                     zone_left = zones[i-1]
-                    if (zone_left[1] >= zone_right[0]):
+                    if zone_left[1] >= zone_right[0]:
                         zone_left[1] = max(zone_right[1], zone_left[1])
                         zones.pop(i)
                         merged_something = True
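The zone-merging loop above is a plain interval merge over mutable [x0, x1] pairs. A minimal sketch of the same fixed-point strategy, runnable in isolation:

```python
def merge_zones(zones):
    """Merge overlapping [x0, x1] intervals in place, scanning right to
    left until nothing merges, the same strategy as the hunk above."""
    zones.sort(key=lambda z: z[0])  # sort by left coordinate
    merged_something = True
    while merged_something:
        merged_something = False
        for i in range(len(zones) - 1, 0, -1):
            left, right = zones[i - 1], zones[i]
            if left[1] >= right[0]:  # horizontal overlap
                left[1] = max(left[1], right[1])
                zones.pop(i)
                merged_something = True
    return zones

print(merge_zones([[0, 5], [4, 9], [12, 15]]))  # -> [[0, 9], [12, 15]]
```

A single left-to-right pass over the sorted list would also suffice; the repeated right-to-left scan simply re-checks until a fixed point, which is what the diff implements.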
@@ -248,8 +266,8 @@ class TextEdges(object):
                 )
             )
             if max_spread <= MAX_COL_SPREAD_IN_HEADER:
-                # Combined, the elements we've identified don't cross more than the
-                # authorized number of columns.
+                # Combined, the elements we've identified don't cross more
+                # than the authorized number of columns.
                 # We're trying to avoid
                 # 0: <BAD: Added header spans too broad>
                 # 1: <A1> <B1> <C1> <D1> <E1>
@@ -257,7 +275,8 @@ class TextEdges(object):
                 # if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS:
                 new_area = (left, bottom, right, top)
 
-            # At this stage we've identified a plausible row (or beginning of one).
+            # At this stage we've identified a plausible row (or the
+            # beginning of one).
             keep_searching = True
 
         return new_area
@@ -318,8 +337,8 @@ class TextEdges(object):
             )
             table_areas[updated_area] = None
 
-        # Apply a heuristic to salvage headers which formatting might be off compared to
-        # the rest of the table.
+        # Apply a heuristic to salvage headers which formatting might be off
+        # compared to the rest of the table.
         average_textline_height = sum_textline_height / \
             float(len(textlines))
@@ -398,7 +417,10 @@ class Cell(object):
 
     def __repr__(self):
         return "<Cell x1={} y1={} x2={} y2={}>".format(
-            round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2)
+            round(self.x1, 2),
+            round(self.y1, 2),
+            round(self.x2, 2),
+            round(self.y2, 2)
         )
 
     @property
@@ -448,7 +470,9 @@ class Table(object):
     def __init__(self, cols, rows):
         self.cols = cols
         self.rows = rows
-        self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
+        self.cells = [
+            [Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows
+        ]
         self.df = None
         self.shape = (0, 0)
         self.accuracy = 0
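The nested comprehension reflowed in Table.__init__ builds the cell grid from boundary pairs: cols are (x_left, x_right) pairs and rows are (y_top, y_bottom) pairs, so `Cell(c[0], r[1], c[1], r[0])` passes coordinates as (x1, y1, x2, y2) with y1 the lower bound. A toy sketch with a stand-in Cell (the real class carries much more state):

```python
# Minimal stand-in for camelot's Cell, just to show the grid layout.
class Cell(object):
    def __init__(self, x1, y1, x2, y2):
        self.x1, self.y1, self.x2, self.y2 = x1, y1, x2, y2

cols = [(0, 50), (50, 100)]      # (x_left, x_right) per column
rows = [(200, 150), (150, 100)]  # (y_top, y_bottom) per row, top first

# Same construction as Table.__init__ above: for each row boundary pair
# r and column boundary pair c, the cell spans (c[0], r[1]) to (c[1], r[0]).
cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
print(cells[0][0].x1, cells[0][0].y1, cells[0][0].x2, cells[0][0].y2)
# -> 0 150 50 200
```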
@@ -685,7 +709,8 @@ class Table(object):
             Output filepath.
 
         """
-        kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1}
+        kw = {"encoding": "utf-8", "index": False, "header": False,
+              "quoting": 1}
         kw.update(kwargs)
         self.df.to_csv(path, **kw)
 
@@ -798,7 +823,8 @@ class TableList(object):
         ext = kwargs.get("ext")
         for table in self._tables:
             filename = os.path.join(
-                "{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
+                "{}-page-{}-table-{}{}".format(root, table.page, table.order,
+                                               ext)
             )
             filepath = os.path.join(dirname, filename)
             to_format = self._format_func(table, f)
@@ -813,7 +839,10 @@ class TableList(object):
         with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
             for table in self._tables:
                 filename = os.path.join(
-                    "{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
+                    "{}-page-{}-table-{}{}".format(root,
+                                                   table.page,
+                                                   table.order,
+                                                   ext)
                 )
                 filepath = os.path.join(dirname, filename)
                 z.write(filepath, os.path.basename(filepath))
@@ -848,7 +877,8 @@ class TableList(object):
         writer = pd.ExcelWriter(filepath)
         for table in self._tables:
             sheet_name = "page-{}-table-{}".format(table.page, table.order)
-            table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
+            table.df.to_excel(writer, sheet_name=sheet_name,
+                              encoding="utf-8")
         writer.save()
         if compress:
             zipname = os.path.join(os.path.dirname(path), root) + ".zip"
camelot/parsers/stream.py

@@ -10,7 +10,8 @@ import pandas as pd
 
 from .base import BaseParser
 from ..core import TextEdges, Table
-from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace
+from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
+                     compute_whitespace)
 
 
 logger = logging.getLogger("camelot")
@@ -124,8 +125,8 @@ class Stream(BaseParser):
         temp = []
         for t in text:
             # is checking for upright necessary?
-            # if t.get_text().strip() and all([obj.upright for obj in t._objs if
-            # type(obj) is LTChar]):
+            # if t.get_text().strip() and all([obj.upright for obj in t._objs
+            # if type(obj) is LTChar]):
             if t.get_text().strip():
                 if not np.isclose(row_y, t.y0, atol=row_tol):
                     rows.append(sorted(temp, key=lambda t: t.x0))
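The `np.isclose(row_y, t.y0, atol=row_tol)` test in the hunk above is what folds individual textlines into rows. A simplified, self-contained sketch of the idea, operating on bare y-coordinates instead of pdfminer textlines:

```python
import numpy as np

def group_rows(y_coords, row_tol=2):
    """Group y-coordinates into rows: a value starts a new row when it
    is not within row_tol of the row's anchor (the y that opened the
    row). Simplified from Stream._group_rows, which walks textlines
    from the top of the page down."""
    rows, current, row_y = [], [], None
    for y in sorted(y_coords, reverse=True):  # top of page first
        if row_y is None or not np.isclose(row_y, y, atol=row_tol):
            if current:
                rows.append(current)
            current = []
            row_y = y  # anchor the new row, as the parser does
        current.append(y)
    if current:
        rows.append(current)
    return rows

print(group_rows([700.0, 699.5, 650.2, 650.0, 600.1]))
# -> [[700.0, 699.5], [650.2, 650.0], [600.1]]
```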
@@ -170,7 +171,8 @@ class Stream(BaseParser):
                            merged.append(higher)
                elif column_tol < 0:
                    if higher[0] <= lower[1]:
-                        if np.isclose(higher[0], lower[1], atol=abs(column_tol)):
+                        if np.isclose(higher[0], lower[1],
+                                      atol=abs(column_tol)):
                            merged.append(higher)
                        else:
                            upper_bound = max(lower[1], higher[1])
@@ -200,8 +202,8 @@ class Stream(BaseParser):
         """
         row_boundaries = [
             [
-                max([t.y1 for t in r]),
-                min([t.y0 for t in r])
+                max(t.y1 for t in r),
+                min(t.y0 for t in r)
             ]
             for r in rows_grouped
         ]
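The negative-`column_tol` branch reflowed above has non-obvious semantics: when two column x-ranges overlap, edges within `abs(column_tol)` of each other keep the columns separate, while a larger overlap merges them. A hedged sketch of that one branch in isolation (function name and values are illustrative):

```python
import numpy as np

def negative_tol_action(lower, higher, column_tol):
    """Negative-column_tol branch of Stream._merge_columns, isolated:
    overlapping ranges whose edges are within abs(column_tol) stay
    separate; a deeper overlap merges them. Sketch of one branch only;
    the full method also handles column_tol >= 0."""
    assert column_tol < 0
    if higher[0] <= lower[1]:
        if np.isclose(higher[0], lower[1], atol=abs(column_tol)):
            return "keep separate"
        return "merge"
    return "keep separate"

print(negative_tol_action((0, 50.0), (49.5, 90.0), -2))  # keep separate
print(negative_tol_action((0, 50.0), (30.0, 90.0), -2))  # merge
```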
@@ -236,7 +238,9 @@ class Stream(BaseParser):
             text = Stream._group_rows(text, row_tol=row_tol)
             elements = [len(r) for r in text]
             new_cols = [
-                (t.x0, t.x1) for r in text if len(r) == max(elements) for t in r
+                (t.x0, t.x1)
+                for r in text if len(r) == max(elements)
+                for t in r
             ]
             cols.extend(Stream._merge_columns(sorted(new_cols)))
         return cols
@@ -268,7 +272,8 @@ class Stream(BaseParser):
     def _validate_columns(self):
         if self.table_areas is not None and self.columns is not None:
             if len(self.table_areas) != len(self.columns):
-                raise ValueError("Length of table_areas and columns" " should be equal")
+                raise ValueError("Length of table_areas and columns"
+                                 " should be equal")
 
     def _nurminen_table_detection(self, textlines):
         """A general implementation of the table detection algorithm
@@ -290,7 +295,7 @@ class Stream(BaseParser):
         # guess table areas using textlines and relevant edges
         table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
         # treat whole page as table area if no table areas found
-        if not len(table_bbox):
+        if not table_bbox:
             table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
 
         return table_bbox
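The `if not len(table_bbox):` to `if not table_bbox:` change is a standard lint fix: empty containers are falsy, so the two tests are equivalent. A quick check:

```python
table_bbox = {}
# An empty dict is falsy, so `not table_bbox` is True exactly when
# `not len(table_bbox)` is True; same for any sized container.
assert (not len(table_bbox)) == (not table_bbox)
table_bbox = {(0, 0, 612, 792): None}
assert (not len(table_bbox)) == (not table_bbox)
```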
@@ -339,7 +344,8 @@ class Stream(BaseParser):
 
         self.t_bbox = t_bbox
 
-        text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
+        text_x_min, text_y_min, text_x_max, text_y_max = \
+            self._text_bbox(self.t_bbox)
         rows_grouped = self._group_rows(t_bbox_all, row_tol=self.row_tol)
         rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
         elements = [len(r) for r in rows_grouped]
@@ -365,14 +371,19 @@ class Stream(BaseParser):
                 # see if the list contains elements, if yes, then use
                 # the mode after removing 1s
                 elements = list(filter(lambda x: x != 1, elements))
-                if len(elements):
+                if elements:
                     ncols = max(set(elements), key=elements.count)
                 else:
                     warnings.warn(
-                        "No tables found in table area {}".format(table_idx + 1)
+                        "No tables found in table area {}"
+                        .format(table_idx + 1)
                     )
-            cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
-            cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
+            cols = [
+                (t.x0, t.x1) for r in rows_grouped if len(r) == ncols
+                for t in r
+            ]
+            cols = self._merge_columns(sorted(cols),
+                                       column_tol=self.column_tol)
             inner_text = []
             for i in range(1, len(cols)):
                 left = cols[i - 1][1]
@@ -442,20 +453,24 @@ class Stream(BaseParser):
 
         return table
 
-    def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
+    def extract_tables(self, filename, suppress_stdout=False,
+                       layout_kwargs={}):
         self._generate_layout(filename, layout_kwargs)
         if not suppress_stdout:
-            logger.info("Processing {}".format(os.path.basename(self.rootname)))
+            logger.info("Processing {}".format(
+                os.path.basename(self.rootname)))
 
         if not self.horizontal_text:
             if self.images:
                 warnings.warn(
                     "{} is image-based, camelot only works on"
-                    " text-based pages.".format(os.path.basename(self.rootname))
+                    " text-based pages.".format(
+                        os.path.basename(self.rootname))
                 )
             else:
                 warnings.warn(
-                    "No tables found on {}".format(os.path.basename(self.rootname))
+                    "No tables found on {}".format(
+                        os.path.basename(self.rootname))
                 )
             return []
tests/data.py

@@ -2742,21 +2742,28 @@ data_stream_vertical_headers = [
     '', 'Daniel G. Gauthier', 'Craig M. Clemens', 'Craig Johnston',
     'Carolyn Brummund', 'Adam Brege', 'David Bielusiak', ''],
    ['Alcona', '963', '439', '55', '26', '47', '164', '173', '111', '', '268',
-     '', '272', '275', '269', '', '271', '', '224', '76', '', '', '', '', ''],
-    ['Caledonia', '923', '393', '40', '23', '45', '158', '150', '103', '', '244',
-     '', '247', '254', '255', '', '244', '', '139', '143', '', '', '', '', ''],
+     '', '272', '275', '269', '', '271', '', '224', '76', '', '', '', '',
+     ''],
+    ['Caledonia', '923', '393', '40', '23', '45', '158', '150', '103', '',
+     '244', '', '247', '254', '255', '', '244', '', '139', '143', '', '',
+     '', '', ''],
    ['Curtis', '1026', '349', '30', '30', '25', '102', '95', '84', '', '159',
     '', '164', '162', '161', '', '157', '', '', '', '', '', '', '', ''],
-    ['Greenbush', '1212', '423', '56', '26', '40', '126', '104', '131', '', '208',
-     '', '213', '214', '215', '', '208', '', '', '', '', '208', '', '', ''],
+    ['Greenbush', '1212', '423', '56', '26', '40', '126', '104', '131', '',
+     '208', '', '213', '214', '215', '', '208', '', '', '', '', '208', '',
+     '', ''],
    ['Gustin', '611', '180', '22', '35', '17', '55', '73', '45', '', '108',
-     '', '104', '111', '111', '', '109', '', '', '', '', '', '81', '42', ''],
-    ['Harrisville', '1142', '430', '45', '90', '29', '101', '155', '94', '', '226',
-     '', '226', '232', '244', '', '226', '', '', '', '232', '', '', '', ''],
+     '', '104', '111', '111', '', '109', '', '', '', '', '', '81', '42',
+     ''],
+    ['Harrisville', '1142', '430', '45', '90', '29', '101', '155', '94', '',
+     '226', '', '226', '232', '244', '', '226', '', '', '', '232', '', '',
+     '', ''],
    ['Hawes', '884', '293', '38', '36', '27', '109', '121', '84', '', '192',
-     '', '195', '195', '193', '', '184', '', '', '', '', '', '118', '87', ''],
+     '', '195', '195', '193', '', '184', '', '', '', '', '', '118', '87',
+     ''],
    ['Haynes', '626', '275', '31', '20', '32', '104', '121', '53', '', '163',
-     '', '163', '173', '161', '', '152', '', '', '', '76', '', '69', '31', ''],
+     '', '163', '173', '161', '', '152', '', '', '', '76', '', '69', '31',
+     ''],
    ['Mikado', '781', '208', '19', '39', '17', '81', '90', '63', '', '149',
     '', '149', '145', '147', '', '143', '', '', '', '', '113', '', '', ''],
    ['Millen', '353', '139', '7', '16', '13', '38', '49', '19', '', '62',
@@ -2764,7 +2771,9 @@ data_stream_vertical_headers = [
    ['Mitchell', '327', '96', '12', '17', '7', '29', '41', '17', '', '57',
     '', '55', '57', '60', '', '56', '', '', '', '', '', '', '', ''],
    ['City Harrisville', '389', '171', '16', '15', '18', '35', '49', '31', '',
-     '78', '', '80', '82', '81', '', '77', '', '', '', '73', '', '', '', ''],
-    ['Totals', '9237', '3396', '371', '373', '317', '1102', '1221', '835', '0', '1914', '0',
-     '1934', '1967', '1963', '0', '1889', '0', '363', '219', '381', '321', '268', '160', '0']
+     '78', '', '80', '82', '81', '', '77', '', '', '', '73', '', '', '',
+     ''],
+    ['Totals', '9237', '3396', '371', '373', '317', '1102', '1221', '835', '0',
+     '1914', '0', '1934', '1967', '1963', '0', '1889', '0', '363', '219',
+     '381', '321', '268', '160', '0']
 ]
tests/test_common.py

@@ -2,6 +2,8 @@
 
 import os
 
+import pytest
+
 import pandas as pd
 from pandas.testing import assert_frame_equal
 
@@ -11,12 +13,30 @@ from camelot.__version__ import generate_version
 
 from .data import *
 
+import pdfminer
+
+# The version of PDFMiner has an impact on some of the tests. Unfortunately,
+# we can't enforce usage of a recent version of PDFMiner without dropping
+# support for Python 2.
+# To check the version of pdfminer.six installed:
+# pip freeze | grep pdfminer.six
+# To force upgrade:
+# pip install --upgrade --force-reinstall pdfminer.six
+# To force usage of a Python 2 compatible version:
+# pip install "pdfminer.six==20191110"
+# This condition can be removed in favor of a version requirement bump for
+# pdfminer.six once support for Python 2 is dropped.
+
+LEGACY_PDF_MINER = pdfminer.__version__ < "20200402"
+
 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")
 
 
 def test_parsing_report():
-    parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
+    parsing_report = {
+        "accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1
+    }
 
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
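The guard introduced above compares version strings. That is safe here because pdfminer.six uses fixed-width, date-stamped versions (YYYYMMDD), for which lexicographic order matches chronological order. A quick check of that assumption:

```python
# pdfminer.six versions are date-stamped (YYYYMMDD). For equal-width
# digit strings, lexicographic comparison matches numeric comparison,
# which is why the plain string test is safe for this scheme.
assert "20191110" < "20200402"        # old Python 2 compatible release
assert not ("20200726" < "20200402")  # a later release is not "legacy"
# Caveat: this would NOT hold for schemes like "9.0" vs "10.0"; a proper
# version parser (e.g. the packaging library) would be needed there.
```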
@@ -64,6 +84,8 @@ def test_stream_table_rotated():
     assert_frame_equal(df, result_without_first_row)
 
 
+@pytest.mark.skipif(LEGACY_PDF_MINER,
+                    reason="depends on a recent version of PDFMiner")
 def test_stream_two_tables():
     df1 = pd.DataFrame(data_stream_two_tables_1)
     df2 = pd.DataFrame(data_stream_two_tables_2)
@@ -106,6 +128,8 @@ def test_stream_columns():
     assert_frame_equal(df, tables[0].df)
 
 
+@pytest.mark.skipif(LEGACY_PDF_MINER,
+                    reason="depends on a recent version of PDFMiner")
 def test_stream_split_text():
     df = pd.DataFrame(data_stream_split_text)
@@ -143,6 +167,8 @@ def test_stream_edge_tol():
     assert_frame_equal(df, tables[0].df)
 
 
+@pytest.mark.skipif(LEGACY_PDF_MINER,
+                    reason="depends on a recent version of PDFMiner")
 def test_stream_layout_kwargs():
     df = pd.DataFrame(data_stream_layout_kwargs)
@@ -248,7 +274,8 @@ def test_repr():
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0])
+        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )
 
 
@@ -258,21 +285,24 @@ def test_pages():
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0])
+        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )
 
     tables = camelot.read_pdf(url, pages="1-end")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0])
+        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )
 
     tables = camelot.read_pdf(url, pages="all")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0])
+        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )
 
 
@@ -282,7 +312,8 @@ def test_url():
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0])
+        == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )
@@ -302,7 +333,12 @@ def test_table_order():
         return t
 
     table_list = TableList(
-        [_make_table(2, 1), _make_table(1, 1), _make_table(3, 4), _make_table(1, 2)]
+        [
+            _make_table(2, 1),
+            _make_table(1, 1),
+            _make_table(3, 4),
+            _make_table(1, 2)
+        ]
     )
 
     assert [(t.page, t.order) for t in sorted(table_list)] == [
tests/test_plotting.py

@@ -4,13 +4,30 @@ import os
 
 import pytest
 
+import pdfminer
+
 import camelot
 
+# The version of PDFMiner has an impact on some of the tests. Unfortunately,
+# we can't enforce usage of a recent version of PDFMiner without dropping
+# support for Python 2.
+# To check the version of pdfminer.six installed:
+# pip freeze | grep pdfminer.six
+# To force upgrade:
+# pip install --upgrade --force-reinstall pdfminer.six
+# To force usage of a Python 2 compatible version:
+# pip install "pdfminer.six==20191110"
+# This condition can be removed in favor of a version requirement bump for
+# pdfminer.six once support for Python 2 is dropped.
+
+LEGACY_PDF_MINER = pdfminer.__version__ < "20200402"
+
 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")
 
 
+@pytest.mark.skipif(LEGACY_PDF_MINER,
+                    reason="depends on a recent version of PDFMiner")
 @pytest.mark.mpl_image_compare(
     baseline_dir="files/baseline_plots", remove_text=True)
 def test_text_plot():
@@ -35,6 +52,8 @@ def test_lattice_contour_plot():
     return camelot.plot(tables[0], kind='contour')
 
 
+@pytest.mark.skipif(LEGACY_PDF_MINER,
+                    reason="depends on a recent version of PDFMiner")
 @pytest.mark.mpl_image_compare(
     baseline_dir="files/baseline_plots", remove_text=True)
 def test_stream_contour_plot():
@@ -59,6 +78,8 @@ def test_joint_plot():
     return camelot.plot(tables[0], kind='joint')
 
 
+@pytest.mark.skipif(LEGACY_PDF_MINER,
+                    reason="depends on a recent version of PDFMiner")
 @pytest.mark.mpl_image_compare(
     baseline_dir="files/baseline_plots", remove_text=True)
 def test_textedge_plot():
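The comment block and the LEGACY_PDF_MINER flag are now duplicated in test_common.py and test_plotting.py. If more modules need the guard, a standard pytest layout would define it once in a shared helper; a hedged sketch (the module name is hypothetical, not part of this commit):

```python
# tests/_compat.py -- hypothetical shared helper, not part of this commit.
# Would let each test module reuse a single definition:
#     from ._compat import LEGACY_PDF_MINER
import pdfminer

# pdfminer.six date-based versions compare correctly as strings.
LEGACY_PDF_MINER = pdfminer.__version__ < "20200402"
```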