Moved duplicated common code to base objects

* Move table initialization common areas to BaseParser
* Stop relying on intermediate file name for source page index
* Create table comparison utility function to help in debugging
* Render the PDF page to an image and show it as the background of stream mode plots
* Fix pylint errors
pull/127/head
Frh 2020-04-10 16:02:00 -07:00
parent dff9f5cd82
commit 467c4a3de0
17 changed files with 402 additions and 153 deletions

View File

@ -10,6 +10,11 @@ from operator import itemgetter
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from .utils import (
compute_accuracy,
compute_whitespace,
)
# minimum number of vertical textline intersections for a textedge # minimum number of vertical textline intersections for a textedge
# to be considered valid # to be considered valid
@ -479,6 +484,9 @@ class Table(object):
self.whitespace = 0 self.whitespace = 0
self.order = None self.order = None
self.page = None self.page = None
self.flavor = None # Flavor of the parser that generated the table
self.pdf_size = None # Dimensions of the original PDF page
self.debug_info = None # Field holding debug data
def __repr__(self): def __repr__(self):
return "<{} shape={}>".format(self.__class__.__name__, self.shape) return "<{} shape={}>".format(self.__class__.__name__, self.shape)
@ -513,6 +521,17 @@ class Table(object):
} }
return report return report
def fill_data(self, parser):
    """Finalise the table once its cells have been populated.

    Builds the pandas DataFrame and the summary fields derived from
    ``self.data``, and records which parser produced the table.

    Parameters
    ----------
    parser : object
        Parser that generated this table. Its ``id``, ``debug_info``,
        ``pdf_width`` and ``pdf_height`` attributes are read.

    """
    # Provenance: flavor of the parser and the debug data it holds.
    self.flavor = parser.id
    self.debug_info = parser.debug_info

    # Read the cell contents once and derive everything else from them.
    cell_text = self.data
    frame = pd.DataFrame(cell_text)
    self.df = frame
    self.shape = frame.shape
    self.whitespace = compute_whitespace(cell_text)

    # Dimensions of the original PDF page, stored as (width, height).
    page_width, page_height = parser.pdf_width, parser.pdf_height
    self.pdf_size = (page_width, page_height)
def set_all_edges(self): def set_all_edges(self):
"""Sets all table edges to True. """Sets all table edges to True.
""" """
@ -747,6 +766,7 @@ class Table(object):
"encoding": "utf-8", "encoding": "utf-8",
} }
kw.update(kwargs) kw.update(kwargs)
# pylint: disable=abstract-class-instantiated
writer = pd.ExcelWriter(path) writer = pd.ExcelWriter(path)
self.df.to_excel(writer, **kw) self.df.to_excel(writer, **kw)
writer.save() writer.save()
@ -874,6 +894,7 @@ class TableList(object):
self._compress_dir(**kwargs) self._compress_dir(**kwargs)
elif f == "excel": elif f == "excel":
filepath = os.path.join(dirname, basename) filepath = os.path.join(dirname, basename)
# pylint: disable=abstract-class-instantiated
writer = pd.ExcelWriter(filepath) writer = pd.ExcelWriter(filepath)
for table in self._tables: for table in self._tables:
sheet_name = "page-{}-table-{}".format(table.page, table.order) sheet_name = "page-{}-table-{}".format(table.page, table.order)

View File

@ -101,26 +101,32 @@ class PDFHandler(object):
temp : str temp : str
Tmp directory. Tmp directory.
Returns
-------
fpath : str
The path of the single page PDF created.
""" """
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
with open(filepath, "rb") as fileobj: with open(filepath, "rb") as fileobj:
infile = PdfFileReader(fileobj, strict=False) infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted: if infile.isEncrypted:
infile.decrypt(self.password) infile.decrypt(self.password)
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
froot, fext = os.path.splitext(fpath) froot, fext = os.path.splitext(fpath)
p = infile.getPage(page - 1) p = infile.getPage(page - 1)
outfile = PdfFileWriter() outfile = PdfFileWriter()
outfile.addPage(p) outfile.addPage(p)
with open(fpath, "wb") as f: with open(fpath, "wb") as f:
outfile.write(f) outfile.write(f)
layout, dim = get_page_layout(fpath) layout, __ = get_page_layout(fpath)
# fix rotated PDF # fix rotated PDF
chars = get_text_objects(layout, ltype="char") chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text") horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text") vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text) rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "": if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) fpath_new = "".join(
[froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new) os.rename(fpath, fpath_new)
infile = PdfFileReader(open(fpath_new, "rb"), strict=False) infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
if infile.isEncrypted: if infile.isEncrypted:
@ -134,9 +140,11 @@ class PDFHandler(object):
outfile.addPage(p) outfile.addPage(p)
with open(fpath, "wb") as f: with open(fpath, "wb") as f:
outfile.write(f) outfile.write(f)
return fpath
def parse( def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs self, flavor="lattice", suppress_stdout=False,
layout_kwargs={}, **kwargs
): ):
"""Extracts tables by calling parser.get_tables on all single """Extracts tables by calling parser.get_tables on all single
page PDFs. page PDFs.
@ -149,7 +157,7 @@ class PDFHandler(object):
suppress_stdout : str (default: False) suppress_stdout : str (default: False)
Suppress logs and warnings. Suppress logs and warnings.
layout_kwargs : dict, optional (default: {}) layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs. A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs. # noqa
kwargs : dict kwargs : dict
See camelot.read_pdf kwargs. See camelot.read_pdf kwargs.
@ -161,15 +169,22 @@ class PDFHandler(object):
""" """
tables = [] tables = []
with TemporaryDirectory() as tempdir: with TemporaryDirectory() as tempdir:
for p in self.pages: parser = \
self._save_page(self.filepath, p, tempdir) Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
pages = [
os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages # For each of the pages we need to parse, generate a single page
] # .pdf in a temporary folder.
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs) for page_idx in self.pages:
for p in pages: single_page_pdf_file = self._save_page(
self.filepath,
page_idx,
tempdir
)
t = parser.extract_tables( t = parser.extract_tables(
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs single_page_pdf_file,
page_idx,
suppress_stdout=suppress_stdout,
layout_kwargs=layout_kwargs
) )
tables.extend(t) tables.extend(t)
return TableList(sorted(tables)) return TableList(sorted(tables))

View File

@ -2,11 +2,13 @@
from __future__ import division from __future__ import division
import cv2 from cv2 import cv2
import numpy as np import numpy as np
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): def adaptive_threshold(
imagename, process_background=False, blocksize=15, c=-2
):
"""Thresholds an image using OpenCV's adaptiveThreshold. """Thresholds an image using OpenCV's adaptiveThreshold.
Parameters Parameters
@ -19,12 +21,12 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
Size of a pixel neighborhood that is used to calculate a Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on. threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
c : int, optional (default: -2) c : int, optional (default: -2)
Constant subtracted from the mean or weighted mean. Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well. Normally, it is positive but may be zero or negative as well.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
Returns Returns
------- -------
@ -39,7 +41,9 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
if process_background: if process_background:
threshold = cv2.adaptiveThreshold( threshold = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c gray, 255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY, blocksize, c
) )
else: else:
threshold = cv2.adaptiveThreshold( threshold = cv2.adaptiveThreshold(
@ -54,7 +58,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
def find_lines( def find_lines(
threshold, regions=None, direction="horizontal", line_scale=15, iterations=0 threshold, regions=None, direction="horizontal",
line_scale=15, iterations=0
): ):
"""Finds horizontal and vertical lines by applying morphological """Finds horizontal and vertical lines by applying morphological
transformations on an image. transformations on an image.
@ -78,7 +83,7 @@ def find_lines(
iterations : int, optional (default: 0) iterations : int, optional (default: 0)
Number of times for erosion/dilation is applied. Number of times for erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. # noqa
Returns Returns
------- -------
@ -100,13 +105,14 @@ def find_lines(
size = threshold.shape[1] // line_scale size = threshold.shape[1] // line_scale
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
elif direction is None: elif direction is None:
raise ValueError("Specify direction as either 'vertical' or 'horizontal'") raise ValueError("Specify direction as either 'vertical' "
"or 'horizontal'")
if regions is not None: if regions is not None:
region_mask = np.zeros(threshold.shape) region_mask = np.zeros(threshold.shape)
for region in regions: for region in regions:
x, y, w, h = region x, y, w, h = region
region_mask[y : y + h, x : x + w] = 1 region_mask[y: y + h, x: x + w] = 1
threshold = np.multiply(threshold, region_mask) threshold = np.multiply(threshold, region_mask)
threshold = cv2.erode(threshold, el) threshold = cv2.erode(threshold, el)
@ -115,12 +121,16 @@ def find_lines(
try: try:
_, contours, _ = cv2.findContours( _, contours, _ = cv2.findContours(
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE threshold.astype(np.uint8),
cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE
) )
except ValueError: except ValueError:
# for opencv backward compatibility # for opencv backward compatibility
contours, _ = cv2.findContours( contours, _ = cv2.findContours(
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE threshold.astype(np.uint8),
cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE
) )
for c in contours: for c in contours:
@ -202,7 +212,7 @@ def find_joints(contours, vertical, horizontal):
tables = {} tables = {}
for c in contours: for c in contours:
x, y, w, h = c x, y, w, h = c
roi = joints[y : y + h, x : x + w] roi = joints[y: y + h, x: x + w]
try: try:
__, jc, __ = cv2.findContours( __, jc, __ = cv2.findContours(
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE

View File

@ -2,19 +2,94 @@
import os import os
from ..utils import get_page_layout, get_text_objects from ..utils import (
get_page_layout,
get_text_objects
)
from ..core import Table
from ..image_processing import (
adaptive_threshold,
find_lines,
find_contours,
find_joints
)
# Pylint can't detect contents of cv2
from cv2 import imread # pylint: disable=no-name-in-module
class BaseParser(object): class BaseParser(object):
"""Defines a base parser. """Defines a base parser.
""" """
def __init__(self, parser_id):
self.imagename = None
self.pdf_image = None
self.id = parser_id
def _generate_layout(self, filename, layout_kwargs): # For plotting details of parsing algorithms
self.debug_info = {}
def _generate_layout(self, filename, page_idx, layout_kwargs):
self.filename = filename self.filename = filename
self.layout_kwargs = layout_kwargs self.layout_kwargs = layout_kwargs
self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs) self.layout, self.dimensions = get_page_layout(
filename,
**layout_kwargs
)
self.images = get_text_objects(self.layout, ltype="image") self.images = get_text_objects(self.layout, ltype="image")
self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text") self.horizontal_text = get_text_objects(
self.vertical_text = get_text_objects(self.layout, ltype="vertical_text") self.layout,
ltype="horizontal_text"
)
self.vertical_text = get_text_objects(
self.layout,
ltype="vertical_text"
)
self.pdf_width, self.pdf_height = self.dimensions self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename) self.rootname, __ = os.path.splitext(self.filename)
self.page = page_idx
def generate_image(self):
    """Make sure ``self.pdf_image`` holds the image of the current page.

    The page is rendered to ``<rootname>.png`` through
    ``_generate_image_file`` and loaded with ``imread``. The loaded
    image is cached, but only for as long as the parser stays on the
    same page: one parser instance is reused for every page of a
    document, so a cache keyed on "not None" alone would keep serving
    the first page's image.
    """
    expected_name = "".join([self.rootname, ".png"])
    if self.pdf_image is not None and self.imagename == expected_name:
        # Image already loaded for this very page.
        return
    if self.imagename != expected_name:
        # The cached file name belongs to another page (or is unset):
        # clear it so that the image file is rendered again.
        self.imagename = None
    self._generate_image_file()
    self.pdf_image = imread(self.imagename)
def _generate_image_file(self):
    """Render the current single-page PDF to a PNG file with Ghostscript.

    The image is written to ``<rootname>.png`` (``png16m`` device,
    ``-r300``) and its path is stored in ``self.imagename``. Nothing is
    done when ``self.imagename`` already names the image of the page
    being parsed.
    """
    imagename = "".join([self.rootname, ".png"])
    # One parser instance is reused for every page of a document, so the
    # cached name is only valid if it matches the current page's file.
    if self.imagename == imagename:
        return
    from ..ext.ghostscript import Ghostscript

    # Build the argument list directly instead of splitting a formatted
    # string on whitespace, so that paths containing spaces stay intact.
    gs_call = [
        b"-q",
        b"-sDEVICE=png16m",
        b"-o",
        imagename.encode(),
        b"-r300",
        self.filename.encode(),
    ]
    # The context manager closes the devnull handle even if the
    # Ghostscript call raises.
    with open(os.devnull, "wb") as null:
        Ghostscript(*gs_call, stdout=null)
    # Only remember the image once it has actually been rendered.
    self.imagename = imagename
def _initialize_new_table(self, table_idx, cols, rows):
    """Initialize new table object, ready to be populated

    Parameters
    ----------
    table_idx : int
        Index of this table within the pdf page analyzed
    cols : list
        list of coordinate boundaries tuples (left, right)
    rows : list
        list of tuples of row boundary y-coordinates; the lattice
        parser builds them from y values sorted in decreasing order

    Returns
    -------
    table : camelot.core.Table
        New table whose ``page`` is the parser's current page and whose
        ``order`` is ``table_idx + 1``.

    """
    table = Table(cols, rows)
    table.page = self.page
    # table_idx comes from enumerate() and is 0-based; order is 1-based.
    table.order = table_idx + 1
    return table

View File

@ -13,7 +13,6 @@ import numpy as np
import pandas as pd import pandas as pd
from .base import BaseParser from .base import BaseParser
from ..core import Table
from ..utils import ( from ..utils import (
scale_image, scale_image,
scale_pdf, scale_pdf,
@ -22,7 +21,6 @@ from ..utils import (
merge_close_lines, merge_close_lines,
get_table_index, get_table_index,
compute_accuracy, compute_accuracy,
compute_whitespace,
) )
from ..image_processing import ( from ..image_processing import (
adaptive_threshold, adaptive_threshold,
@ -80,7 +78,7 @@ class Lattice(BaseParser):
Size of a pixel neighborhood that is used to calculate a Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on. threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
threshold_constant : int, optional (default: -2) threshold_constant : int, optional (default: -2)
Constant subtracted from the mean or weighted mean. Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well. Normally, it is positive but may be zero or negative as well.
@ -114,6 +112,7 @@ class Lattice(BaseParser):
resolution=300, resolution=300,
**kwargs **kwargs
): ):
super().__init__("lattice")
self.table_regions = table_regions self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.process_background = process_background self.process_background = process_background
@ -208,19 +207,6 @@ class Lattice(BaseParser):
t.cells[i][j].text = t.cells[i - 1][j].text t.cells[i][j].text = t.cells[i - 1][j].text
return t return t
def _generate_image(self):
from ..ext.ghostscript import Ghostscript
self.imagename = "".join([self.rootname, ".png"])
gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
self.imagename, self.filename
)
gs_call = gs_call.encode().split()
null = open(os.devnull, "wb")
with Ghostscript(*gs_call, stdout=null) as gs:
pass
null.close()
def _generate_table_bbox(self): def _generate_table_bbox(self):
def scale_areas(areas): def scale_areas(areas):
scaled_areas = [] scaled_areas = []
@ -234,20 +220,21 @@ class Lattice(BaseParser):
scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
return scaled_areas return scaled_areas
self.image, self.threshold = adaptive_threshold( self.pdf_image, self.threshold = adaptive_threshold(
self.imagename, self.imagename,
process_background=self.process_background, process_background=self.process_background,
blocksize=self.threshold_blocksize, blocksize=self.threshold_blocksize,
c=self.threshold_constant, c=self.threshold_constant,
) )
image_width = self.image.shape[1] image_width = self.pdf_image.shape[1]
image_height = self.image.shape[0] image_height = self.pdf_image.shape[0]
image_width_scaler = image_width / float(self.pdf_width) image_width_scaler = image_width / float(self.pdf_width)
image_height_scaler = image_height / float(self.pdf_height) image_height_scaler = image_height / float(self.pdf_height)
pdf_width_scaler = self.pdf_width / float(image_width) pdf_width_scaler = self.pdf_width / float(image_width)
pdf_height_scaler = self.pdf_height / float(image_height) pdf_height_scaler = self.pdf_height / float(image_height)
image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) image_scalers = (image_width_scaler,
image_height_scaler, self.pdf_height)
pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
if self.table_areas is None: if self.table_areas is None:
@ -291,7 +278,11 @@ class Lattice(BaseParser):
self.table_bbox_unscaled = copy.deepcopy(table_bbox) self.table_bbox_unscaled = copy.deepcopy(table_bbox)
self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image( [
self.table_bbox,
self.vertical_segments,
self.horizontal_segments
] = scale_image(
table_bbox, vertical_segments, horizontal_segments, pdf_scalers table_bbox, vertical_segments, horizontal_segments, pdf_scalers
) )
@ -315,7 +306,10 @@ class Lattice(BaseParser):
rows.extend([tk[1], tk[3]]) rows.extend([tk[1], tk[3]])
# sort horizontal and vertical segments # sort horizontal and vertical segments
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol) cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol) rows = merge_close_lines(
sorted(rows, reverse=True),
line_tol=self.line_tol
)
# make grid using x and y coord of shortlisted rows and cols # make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
@ -328,7 +322,7 @@ class Lattice(BaseParser):
if v_s is None or h_s is None: if v_s is None or h_s is None:
raise ValueError("No segments found on {}".format(self.rootname)) raise ValueError("No segments found on {}".format(self.rootname))
table = Table(cols, rows) table = self._initialize_new_table(table_idx, cols, rows)
# set table edges to True using ver+hor lines # set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol) table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
# set table border edges to True # set table border edges to True
@ -359,48 +353,44 @@ class Lattice(BaseParser):
accuracy = compute_accuracy([[100, pos_errors]]) accuracy = compute_accuracy([[100, pos_errors]])
if self.copy_text is not None: if self.copy_text is not None:
table = Lattice._copy_spanning_text(table, copy_text=self.copy_text) table = Lattice._copy_spanning_text(
table,
copy_text=self.copy_text
)
data = table.data table.fill_data(self)
table.df = pd.DataFrame(data)
table.shape = table.df.shape
whitespace = compute_whitespace(data)
table.flavor = "lattice"
table.accuracy = accuracy table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting # for plotting
_text = [] _text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text table._text = _text
table._image = (self.image, self.table_bbox_unscaled) table._image = (self.pdf_image, self.table_bbox_unscaled)
table._segments = (self.vertical_segments, self.horizontal_segments) table._segments = (self.vertical_segments, self.horizontal_segments)
table._textedges = None table._textedges = None
return table return table
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): def extract_tables(self, filename, page_idx=1, suppress_stdout=False,
self._generate_layout(filename, layout_kwargs) layout_kwargs={}):
self._generate_layout(filename, page_idx, layout_kwargs)
if not suppress_stdout: if not suppress_stdout:
logger.info("Processing {}".format(os.path.basename(self.rootname))) logger.info(f"Processing {os.path.basename(self.rootname)}")
if not self.horizontal_text: if not self.horizontal_text:
if self.images: if self.images:
warnings.warn( warnings.warn(
"{} is image-based, camelot only works on" f"{os.path.basename(self.rootname)} is image-based, "
" text-based pages.".format(os.path.basename(self.rootname)) "camelot only works on text-based pages."
) )
else: else:
warnings.warn( warnings.warn(
"No tables found on {}".format(os.path.basename(self.rootname)) f"No tables found on {os.path.basename(self.rootname)}"
) )
return [] return []
self._generate_image() self._generate_image_file()
self._generate_table_bbox() self._generate_table_bbox()
_tables = [] _tables = []
@ -408,8 +398,10 @@ class Lattice(BaseParser):
for table_idx, tk in enumerate( for table_idx, tk in enumerate(
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
): ):
cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk) cols, rows, v_s, h_s = self._generate_columns_and_rows(
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) table_idx, tk)
table = self._generate_table(
table_idx, cols, rows, v_s=v_s, h_s=h_s)
table._bbox = tk table._bbox = tk
_tables.append(table) _tables.append(table)

View File

@ -9,7 +9,7 @@ import numpy as np
import pandas as pd import pandas as pd
from .base import BaseParser from .base import BaseParser
from ..core import TextEdges, Table from ..core import TextEdges
from ..utils import (text_in_bbox, get_table_index, compute_accuracy, from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
compute_whitespace) compute_whitespace)
@ -69,6 +69,7 @@ class Stream(BaseParser):
column_tol=0, column_tol=0,
**kwargs **kwargs
): ):
super().__init__("stream")
self.table_regions = table_regions self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.columns = columns self.columns = columns
@ -120,21 +121,26 @@ class Stream(BaseParser):
Two-dimensional list of text objects grouped into rows. Two-dimensional list of text objects grouped into rows.
""" """
row_y = 0 row_y = None
rows = [] rows = []
temp = [] temp = []
for t in text: non_empty_text = [t for t in text if t.get_text().strip()]
for t in non_empty_text:
# is checking for upright necessary? # is checking for upright necessary?
# if t.get_text().strip() and all([obj.upright for obj in t._objs # if t.get_text().strip() and all([obj.upright \
# for obj in t._objs
# if type(obj) is LTChar]): # if type(obj) is LTChar]):
if t.get_text().strip(): if row_y is not None and \
if not np.isclose(row_y, t.y0, atol=row_tol): not np.isclose(row_y, t.y0, atol=row_tol) and \
rows.append(sorted(temp, key=lambda t: t.x0)) 0.5 * (t.y1 + t.y0) < row_y:
temp = [] rows.append(sorted(temp, key=lambda t: t.x0))
row_y = t.y0 temp = []
temp.append(t) # We update the row's bottom as we go, to be forgiving if there
# is a gradual change across multiple columns.
row_y = t.y0
temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0)) rows.append(sorted(temp, key=lambda t: t.x0))
__ = rows.pop(0) # TODO: hacky
return rows return rows
@staticmethod @staticmethod
@ -278,7 +284,7 @@ class Stream(BaseParser):
def _nurminen_table_detection(self, textlines): def _nurminen_table_detection(self, textlines):
"""A general implementation of the table detection algorithm """A general implementation of the table detection algorithm
described by Anssi Nurminen's master's thesis. described by Anssi Nurminen's master's thesis.
Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 # noqa
Assumes that tables are situated relatively far apart Assumes that tables are situated relatively far apart
vertically. vertically.
@ -378,12 +384,29 @@ class Stream(BaseParser):
"No tables found in table area {}" "No tables found in table area {}"
.format(table_idx + 1) .format(table_idx + 1)
) )
cols = [
(t.x0, t.x1) for r in rows_grouped if len(r) == ncols # Identify rows which contain the mode of the number of columns
for t in r full_rows = list(filter(
lambda row: len(row) == ncols,
rows_grouped))
cells_on_full_rows_xrange = [
(t.x0, t.x1) for r in full_rows for t in r
] ]
cols = self._merge_columns(sorted(cols), # TODO: fixme / make a decision on this
# plausible_rows = list(filter(
# lambda row: len(row) <= ncols*1.2 and len(row) >= ncols*.8,
# rows_grouped))
# plausible_cells_xrange = [
# (t.x0, t.x1) for r in plausible_rows for t in r
# ]
# self.debug_info['plausible_rows'] = plausible_rows
# Identify column boundaries based on the contents of these rows
cols = self._merge_columns(sorted(cells_on_full_rows_xrange),
column_tol=self.column_tol) column_tol=self.column_tol)
# cols = self._merge_columns(sorted(plausible_cells_xrange),
# column_tol=self.column_tol)
inner_text = [] inner_text = []
for i in range(1, len(cols)): for i in range(1, len(cols)):
left = cols[i - 1][1] left = cols[i - 1][1]
@ -409,7 +432,7 @@ class Stream(BaseParser):
return cols, rows return cols, rows
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, cols, rows, **kwargs):
table = Table(cols, rows) table = self._initialize_new_table(table_idx, cols, rows)
table = table.set_all_edges() table = table.set_all_edges()
pos_errors = [] pos_errors = []
@ -431,31 +454,25 @@ class Stream(BaseParser):
table.cells[r_idx][c_idx].text = text table.cells[r_idx][c_idx].text = text
accuracy = compute_accuracy([[100, pos_errors]]) accuracy = compute_accuracy([[100, pos_errors]])
data = table.data table.fill_data(self)
table.df = pd.DataFrame(data)
table.shape = table.df.shape
whitespace = compute_whitespace(data)
table.flavor = "stream"
table.accuracy = accuracy table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting # for plotting
_text = [] _text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text table._text = _text
table._image = None self.generate_image()
table._image = (self.pdf_image, self.table_bbox)
table._segments = None table._segments = None
table._textedges = self.textedges table._textedges = self.textedges
return table return table
def extract_tables(self, filename, suppress_stdout=False, def extract_tables(self, filename, page_idx=1, suppress_stdout=False,
layout_kwargs={}): layout_kwargs={}):
self._generate_layout(filename, layout_kwargs) self._generate_layout(filename, page_idx, layout_kwargs)
if not suppress_stdout: if not suppress_stdout:
logger.info("Processing {}".format( logger.info("Processing {}".format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))
@ -474,6 +491,8 @@ class Stream(BaseParser):
) )
return [] return []
# Identify plausible areas within the doc where tables lie,
# populate table_bbox keys with these areas.
self._generate_table_bbox() self._generate_table_bbox()
_tables = [] _tables = []

View File

@ -37,7 +37,7 @@ class PlotMethods(object):
raise NotImplementedError( raise NotImplementedError(
"Lattice flavor does not support kind='{}'".format(kind) "Lattice flavor does not support kind='{}'".format(kind)
) )
elif table.flavor == "stream" and kind in ["joint", "line"]: elif table.flavor == "stream" and kind in ["line"]:
raise NotImplementedError( raise NotImplementedError(
"Stream flavor does not support kind='{}'".format(kind) "Stream flavor does not support kind='{}'".format(kind)
) )
@ -64,9 +64,18 @@ class PlotMethods(object):
for t in table._text: for t in table._text:
xs.extend([t[0], t[2]]) xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]]) ys.extend([t[1], t[3]])
ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1])) ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1],
alpha=0.5
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10)
img, __ = table._image
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig return fig
def grid(self, table): def grid(self, table):
@ -94,6 +103,9 @@ class PlotMethods(object):
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]]) ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
if cell.bottom: if cell.bottom:
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]]) ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
img, __ = table._image
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig return fig
def contour(self, table): def contour(self, table):
@ -109,12 +121,8 @@ class PlotMethods(object):
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
try: img, table_bbox = table._image
img, table_bbox = table._image _FOR_LATTICE = table.flavor == "lattice"
_FOR_LATTICE = True
except TypeError:
img, table_bbox = (None, {table._bbox: None})
_FOR_LATTICE = False
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal") ax = fig.add_subplot(111, aspect="equal")
@ -132,7 +140,8 @@ class PlotMethods(object):
for t in table_bbox.keys(): for t in table_bbox.keys():
ax.add_patch( ax.add_patch(
patches.Rectangle( patches.Rectangle(
(t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red" (t[0], t[1]), t[2] - t[0], t[3] - t[1],
fill=False, color="red"
) )
) )
if not _FOR_LATTICE: if not _FOR_LATTICE:
@ -143,6 +152,8 @@ class PlotMethods(object):
if _FOR_LATTICE: if _FOR_LATTICE:
ax.imshow(img) ax.imshow(img)
else:
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig return fig
def textedge(self, table): def textedge(self, table):
@ -164,7 +175,11 @@ class PlotMethods(object):
xs.extend([t[0], t[2]]) xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]]) ys.extend([t[1], t[3]])
ax.add_patch( ax.add_patch(
patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue") patches.Rectangle(
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
color="blue",
alpha=0.5
)
) )
ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10)
@ -172,6 +187,8 @@ class PlotMethods(object):
for te in table._textedges: for te in table._textedges:
ax.plot([te.x, te.x], [te.y0, te.y1]) ax.plot([te.x, te.x], [te.y0, te.y1])
img, __ = table._image
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig return fig
def joint(self, table): def joint(self, table):
@ -220,4 +237,8 @@ class PlotMethods(object):
ax.plot([v[0], v[2]], [v[1], v[3]]) ax.plot([v[0], v[2]], [v[1], v[3]])
for h in horizontal: for h in horizontal:
ax.plot([h[0], h[2]], [h[1], h[3]]) ax.plot([h[0], h[2]], [h[1], h[3]])
img, __ = table._image
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig return fig

View File

@ -13,6 +13,7 @@ from itertools import groupby
from operator import itemgetter from operator import itemgetter
import numpy as np import numpy as np
import pandas as pd
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage from pdfminer.pdfpage import PDFPage
@ -30,6 +31,9 @@ from pdfminer.layout import (
) )
# pylint: disable=import-error
# PyLint will evaluate both branches, and will necessarily complain about one
# of them.
PY3 = sys.version_info[0] >= 3 PY3 = sys.version_info[0] >= 3
if PY3: if PY3:
from urllib.request import urlopen from urllib.request import urlopen
@ -310,7 +314,8 @@ def get_rotation(chars, horizontal_text, vertical_text):
if hlen < vlen: if hlen < vlen:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars) clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise" rotation = "anticlockwise" if clockwise < anticlockwise \
else "clockwise"
return rotation return rotation
@ -341,12 +346,16 @@ def segments_in_bbox(bbox, v_segments, h_segments):
v_s = [ v_s = [
v v
for v in v_segments for v in v_segments
if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2 if v[1] > lb[1] - 2 and
v[3] < rt[1] + 2 and
lb[0] - 2 <= v[0] <= rt[0] + 2
] ]
h_s = [ h_s = [
h h
for h in h_segments for h in h_segments
if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2 if h[0] > lb[0] - 2 and
h[2] < rt[0] + 2 and
lb[1] - 2 <= h[1] <= rt[1] + 2
] ]
return v_s, h_s return v_s, h_s
@ -464,10 +473,10 @@ def flag_font_size(textline, direction, strip_text=""):
for t in textline for t in textline
if not isinstance(t, LTAnno) if not isinstance(t, LTAnno)
] ]
l = [np.round(size, decimals=6) for text, size in d] text_sizes = [np.round(size, decimals=6) for text, size in d]
if len(set(l)) > 1: if len(set(text_sizes)) > 1:
flist = [] flist = []
min_size = min(l) min_size = min(text_sizes)
for key, chars in groupby(d, itemgetter(1)): for key, chars in groupby(d, itemgetter(1)):
if key == min_size: if key == min_size:
fchars = [t[0] for t in chars] fchars = [t[0] for t in chars]
@ -511,7 +520,6 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
of row/column and text is the an lttextline substring. of row/column and text is the an lttextline substring.
""" """
idx = 0
cut_text = [] cut_text = []
bbox = textline.bbox bbox = textline.bbox
try: try:
@ -528,7 +536,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
] ]
r = r_idx[0] r = r_idx[0]
x_cuts = [ x_cuts = [
(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right (c, table.cells[r][c].x2)
for c in x_overlap
if table.cells[r][c].right
] ]
if not x_cuts: if not x_cuts:
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)] x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
@ -561,7 +571,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
] ]
c = c_idx[0] c = c_idx[0]
y_cuts = [ y_cuts = [
(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom (r, table.cells[r][c].y1)
for r in y_overlap
if table.cells[r][c].bottom
] ]
if not y_cuts: if not y_cuts:
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)] y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
@ -644,9 +656,8 @@ def get_table_index(
""" """
r_idx, c_idx = [-1] * 2 r_idx, c_idx = [-1] * 2
for r in range(len(table.rows)): for r in range(len(table.rows)):
if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[ if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and \
r (t.y0 + t.y1) / 2.0 > table.rows[r][1]:
][1]:
lt_col_overlap = [] lt_col_overlap = []
for c in table.cols: for c in table.cols:
if c[0] <= t.x1 and c[1] >= t.x0: if c[0] <= t.x1 and c[1] >= t.x0:
@ -681,7 +692,9 @@ def get_table_index(
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1) X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1) Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
charea = X * Y charea = X * Y
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea error = (
(X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))
) / charea
if split_text: if split_text:
return ( return (
@ -697,13 +710,16 @@ def get_table_index(
( (
r_idx, r_idx,
c_idx, c_idx,
flag_font_size(t._objs, direction, strip_text=strip_text), flag_font_size(t._objs,
direction,
strip_text=strip_text),
) )
], ],
error, error,
) )
else: else:
return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], \
error
def compute_accuracy(error_weights): def compute_accuracy(error_weights):
@ -751,7 +767,6 @@ def compute_whitespace(d):
""" """
whitespace = 0 whitespace = 0
r_nempty_cells, c_nempty_cells = [], []
for i in d: for i in d:
for j in i: for j in i:
if j.strip() == "": if j.strip() == "":
@ -852,3 +867,78 @@ def get_text_objects(layout, ltype="char", t=None):
except AttributeError: except AttributeError:
pass pass
return t return t
def compare_tables(left, right):
    """Compare two tables and display their differences in a readable form.

    A summary line is printed when the shapes differ.  Then, if the tables
    have the same number of rows, their columns are compared one by one;
    otherwise, if they have the same number of columns, their rows are
    compared one by one.  Only the first differing (or extra) column/row
    is reported.  Nothing is printed when no difference is found, and
    nothing beyond the summary when both dimensions differ.

    Rows and columns are matched by position, not by label.

    Parameters
    ----------
    left : pandas.DataFrame
    right : pandas.DataFrame

    """
    diff_cols = right.shape[1] - left.shape[1]
    diff_rows = right.shape[0] - left.shape[0]
    differences = []
    if diff_rows:
        differences.append(
            f"{abs(diff_rows)} "
            f"{'more' if diff_rows > 0 else 'fewer'} rows"
        )
    if diff_cols:
        differences.append(
            f"{abs(diff_cols)} "
            f"{'more' if diff_cols > 0 else 'fewer'} columns"
        )
    if differences:
        differences_str = " and ".join(differences)
        print(
            f"Right has {differences_str} than left "
            f"[{right.shape[0]},{right.shape[1]}] vs "
            f"[{left.shape[0]},{left.shape[1]}]"
        )

    # table1 is the reference (the larger table along the compared axis);
    # name_table1 always names table1, so they are swapped together.
    table1, table2 = left, right
    name_table1, name_table2 = "left", "right"
    if not diff_rows:
        # Same number of rows: compare columns since they're of the same
        # length
        if diff_cols > 0:
            # Use the widest table as a reference
            table1, table2 = table2, table1
            name_table1, name_table2 = name_table2, name_table1
        for i in range(table1.shape[1]):
            lcol = table1.iloc[:, i]
            if i >= table2.shape[1]:
                print(f"Column {i} unique to {name_table1}: {lcol}")
                break
            scol = table2.iloc[:, i]
            if not lcol.equals(scol):
                # Series.eq aligns on the index instead of raising when
                # the two columns are not identically labelled.
                diff_df = pd.DataFrame({
                    name_table1: lcol,
                    name_table2: scol,
                    "Match": lcol.eq(scol),
                })
                print(
                    f"Column {i} different:\n"
                    f"{diff_df}"
                )
                break
    elif not diff_cols:
        # Same number of cols: compare rows since they're of the same
        # length
        if diff_rows > 0:
            # Use the longest table as a reference
            table1, table2 = table2, table1
            name_table1, name_table2 = name_table2, name_table1
        for i in range(table1.shape[0]):
            lrow = table1.iloc[i, :]
            if i >= table2.shape[0]:
                print(f"Row {i} unique to {name_table1}: {lrow}")
                break
            srow = table2.iloc[i, :]
            if not lrow.equals(srow):
                # One output line per table, prefixed by the table name.
                diff_df = pd.DataFrame([lrow.tolist(), srow.tolist()])
                diff_df.insert(0, "Table", [name_table1, name_table2])
                print(f"Row {i} differs:")
                print(diff_df.values)
                break

View File

@ -838,7 +838,7 @@ data_stream_two_tables_1 = [
"2,330 .9", "2,330 .9",
], ],
[ [
"Violent crime . . . . . . . .\n . .\n . .\n . .\n" \ "Violent crime . . . . . . . .\n . .\n . .\n . .\n"
" . .\n . .", " . .\n . .",
"467 .9", "467 .9",
"69 .1", "69 .1",
@ -1503,15 +1503,8 @@ data_stream_table_areas = [
] ]
data_stream_columns = [ data_stream_columns = [
[ ["Clave \nEntidad", "Nombre Entidad", "Clave \nMunicipio",
"Clave", "Nombre Municipio", "Clave \nLocalidad", "Nombre Localidad"],
"Nombre Entidad",
"Clave",
"Nombre Municipio",
"Clave",
"Nombre Localidad",
],
["Entidad", "", "Municipio", "", "Localidad", ""],
["01", "Aguascalientes", "001", "Aguascalientes", "0094", "Granja Adelita"], ["01", "Aguascalientes", "001", "Aguascalientes", "0094", "Granja Adelita"],
["01", "Aguascalientes", "001", "Aguascalientes", "0096", "Agua Azul"], ["01", "Aguascalientes", "001", "Aguascalientes", "0096", "Agua Azul"],
["01", "Aguascalientes", "001", "Aguascalientes", "0100", "Rancho Alegre"], ["01", "Aguascalientes", "001", "Aguascalientes", "0100", "Rancho Alegre"],
@ -2732,11 +2725,9 @@ data_stream_vertical_headers = [
['', '', '', '', '', '', '', '', '', '', '', 'Congress-', ['', '', '', '', '', '', '', '', '', '', '', 'Congress-',
'Senator 36th', 'Rep106th', '', 'Reg. of', '', 'Road', '', '', 'Senator 36th', 'Rep106th', '', 'Reg. of', '', 'Road', '', '',
'Distri', 'Dist', '', '', 'Dist'], 'Distri', 'Dist', '', '', 'Dist'],
['', '', '', '', '', '', '', '', '', '', '1st Dist', '', 'Dist.', ['', '', '', '', '', 'Governor', '', '', 'U.S. Senator', '',
'Dist.', '', 'Deeds', '', 'Commission', '', 'District #1', '1st Dist', '', 'Dist.', 'Dist.', '', 'Deeds', '', 'Commission',
'ct #2', '#3', 'Dist #4', '', '#5'], '', 'District #1', 'ct #2', '#3', 'Dist #4', '', '#5'],
['', '', '', '', '', 'Governor', '', '', 'U.S. Senator', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', ''],
['', 'Number of Registered voters', 'Poll Book Totals', ['', 'Number of Registered voters', 'Poll Book Totals',
'Brian Calley', 'Patrick Colbeck', 'Jim Hines', 'Bill Schuette', 'Brian Calley', 'Patrick Colbeck', 'Jim Hines', 'Bill Schuette',
'John James', 'Sandy Pensler', '', 'Jack Bergman', '', 'John James', 'Sandy Pensler', '', 'Jack Bergman', '',

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.2 KiB

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.7 KiB

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 KiB

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.9 KiB

After

Width:  |  Height:  |  Size: 71 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 17 KiB

After

Width:  |  Height:  |  Size: 113 KiB

View File

@ -9,10 +9,12 @@ from pandas.testing import assert_frame_equal
import camelot import camelot
from camelot.core import Table, TableList from camelot.core import Table, TableList
from camelot.utils import compare_tables
from camelot.__version__ import generate_version from camelot.__version__ import generate_version
from .data import * from .data import *
import pdfminer import pdfminer
# The version of PDFMiner has an impact on some of the tests. Unfortunately, # The version of PDFMiner has an impact on some of the tests. Unfortunately,
@ -48,9 +50,11 @@ def test_password():
filename = os.path.join(testdir, "health_protected.pdf") filename = os.path.join(testdir, "health_protected.pdf")
tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream") tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
assert len(tables) == 1
assert_frame_equal(df, tables[0].df) assert_frame_equal(df, tables[0].df)
tables = camelot.read_pdf(filename, password="userpass", flavor="stream") tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
assert len(tables) == 1
assert_frame_equal(df, tables[0].df) assert_frame_equal(df, tables[0].df)
@ -59,6 +63,7 @@ def test_stream():
filename = os.path.join(testdir, "health.pdf") filename = os.path.join(testdir, "health.pdf")
tables = camelot.read_pdf(filename, flavor="stream") tables = camelot.read_pdf(filename, flavor="stream")
assert len(tables) == 1
assert_frame_equal(df, tables[0].df) assert_frame_equal(df, tables[0].df)
@ -79,6 +84,7 @@ def test_stream_table_rotated():
filename = os.path.join(testdir, "anticlockwise_table_2.pdf") filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
tables = camelot.read_pdf(filename, flavor="stream") tables = camelot.read_pdf(filename, flavor="stream")
assert len(tables) == 1
result_without_first_row = pd.DataFrame( result_without_first_row = pd.DataFrame(
tables[0].df.drop(tables[0].df.columns[0], axis=1).values) tables[0].df.drop(tables[0].df.columns[0], axis=1).values)
assert_frame_equal(df, result_without_first_row) assert_frame_equal(df, result_without_first_row)
@ -275,9 +281,9 @@ def test_repr():
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert ( assert \
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" repr(tables[0].cells[0][0]) == \
) "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
def test_pages(): def test_pages():
@ -285,22 +291,23 @@ def test_pages():
tables = camelot.read_pdf(url) tables = camelot.read_pdf(url)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert ( assert \
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" repr(tables[0].cells[0][0]) == \
) "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
tables = camelot.read_pdf(url, pages="1-end") tables = camelot.read_pdf(url, pages="1-end")
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert ( assert \
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" repr(tables[0].cells[0][0]) == \
) "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
tables = camelot.read_pdf(url, pages="all") tables = camelot.read_pdf(url, pages="all")
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert ( assert (
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" repr(tables[0].cells[0][0]) ==
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
) )
@ -310,7 +317,8 @@ def test_url():
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert ( assert (
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" repr(tables[0].cells[0][0]) ==
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
) )

View File

@ -43,6 +43,13 @@ def test_grid_plot():
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename)
return camelot.plot(tables[0], kind='grid') return camelot.plot(tables[0], kind='grid')
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True)
def test_stream_grid_plot():
    """Return the grid plot of the first table read from foo.pdf in
    stream mode, for comparison against the baseline image."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor="stream")[0]
    return camelot.plot(first_table, kind='grid')
@pytest.mark.mpl_image_compare( @pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True) baseline_dir="files/baseline_plots", remove_text=True)