Refactor base classes and improve plotting

Move common code to base class to reduce duplication
Stream plots display pdf background for better context
pull/153/head
Frh 2020-04-18 23:03:27 -07:00
parent 816471e426
commit 697289e409
13 changed files with 447 additions and 122 deletions

View File

@ -10,6 +10,15 @@ from operator import itemgetter
import numpy as np
import pandas as pd
from cv2 import cv2
from .utils import (
build_file_path_in_temp_dir,
compute_accuracy,
compute_whitespace,
export_pdf_as_png
)
# minimum number of vertical textline intersections for a textedge
# to be considered valid
@ -159,7 +168,10 @@ class TextEdges(object):
# get vertical textedges that intersect maximum number of
# times with horizontal textlines
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
return self._textedges[relevant_align]
return list(filter(
lambda te: te.is_valid,
self._textedges[relevant_align])
)
def get_table_areas(self, textlines, relevant_textedges):
"""Returns a dict of interesting table areas on the PDF page
@ -179,7 +191,6 @@ class TextEdges(object):
table_areas = {}
for te in relevant_textedges:
if te.is_valid:
if not table_areas:
table_areas[(te.x, te.y0, te.x, te.y1)] = None
else:
@ -225,7 +236,8 @@ class TextEdges(object):
max(found[3], tl.y1),
)
table_areas[updated_area] = None
average_textline_height = sum_textline_height / float(len(textlines))
average_textline_height = sum_textline_height / \
float(len(textlines))
# add some padding to table areas
table_areas_padded = {}
@ -339,6 +351,8 @@ class Table(object):
Accuracy with which text was assigned to the cell.
whitespace : float
Percentage of whitespace in the table.
filename : str
Path of the original PDF
order : int
Table number on PDF page.
page : int
@ -356,8 +370,15 @@ class Table(object):
self.shape = (0, 0)
self.accuracy = 0
self.whitespace = 0
self.filename = None
self.order = None
self.page = None
self.flavor = None # Flavor of the parser that generated the table
self.pdf_size = None # Dimensions of the original PDF page
self.debug_info = None # Field holding debug data
self._image = None
self._image_path = None # Temporary file to hold an image of the pdf
def __repr__(self):
return "<{} shape={}>".format(self.__class__.__name__, self.shape)
@ -392,6 +413,32 @@ class Table(object):
}
return report
def record_metadata(self, parser):
    """Copy origin details from the parser and refresh derived fields.

    Parameters
    ----------
    parser : object
        Parser that produced this table. Its ``id``, ``filename``,
        ``debug_info``, ``pdf_width`` and ``pdf_height`` attributes
        are read.

    """
    # Provenance, taken verbatim from the parser.
    self.flavor = parser.id
    self.filename = parser.filename
    self.debug_info = parser.debug_info
    self.pdf_size = (parser.pdf_width, parser.pdf_height)

    # Fields derived from the table contents.
    cell_data = self.data
    self.df = pd.DataFrame(cell_data)
    self.shape = self.df.shape
    self.whitespace = compute_whitespace(cell_data)
def get_pdf_image(self):
    """Return an image of the source PDF, rendering it on first use.

    The image is kept on ``self._image`` so later calls return it
    directly. The PNG is only exported when no image path has been
    recorded yet on ``self._image_path``.

    Returns
    -------
    image : object
        Whatever ``cv2.imread`` returned for the PNG file.

    """
    if self._image is not None:
        # Already loaded by a previous call (or assigned by a parser).
        return self._image

    if self._image_path is None:
        png_name = os.path.basename(self.filename)
        self._image_path = build_file_path_in_temp_dir(png_name, ".png")
        export_pdf_as_png(self.filename, self._image_path)

    self._image = cv2.imread(self._image_path)
    return self._image
def set_all_edges(self):
"""Sets all table edges to True.
"""

View File

@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
TemporaryDirectory,
build_file_path_in_temp_dir,
get_page_layout,
get_text_objects,
get_rotation,
@ -16,6 +16,11 @@ from .utils import (
download_url,
)
PARSERS = {
"lattice": Lattice,
"stream": Stream
}
class PDFHandler(object):
"""Handles all operations like temp directory creation, splitting
@ -89,31 +94,47 @@ class PDFHandler(object):
P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P))
def _save_page(self, filepath, page, temp):
"""Saves specified page from PDF into a temporary directory.
def _read_pdf_page(self, page=1, layout_kwargs=None):
"""Saves specified page from PDF into a temporary directory. Removes
password protection and normalizes rotation.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
page : int
Page number.
temp : str
Tmp directory.
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs. # noqa
Returns
-------
layout : object
dimensions : tuple
The dimensions of the pdf page
filepath : str
The path of the single page PDF - either the original, or a
normalized version.
"""
with open(filepath, "rb") as fileobj:
layout_kwargs = layout_kwargs or {}
with open(self.filepath, "rb") as fileobj:
# Normalize the pdf file, but skip if it's not encrypted or has
# only one page.
infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
fpath = build_file_path_in_temp_dir(
"page-{page}.pdf".format(page=page))
froot, fext = os.path.splitext(fpath)
p = infile.getPage(page - 1)
outfile = PdfFileWriter()
outfile.addPage(p)
with open(fpath, "wb") as f:
outfile.write(f)
layout, __ = get_page_layout(fpath)
layout, dimensions = get_page_layout(
fpath, **layout_kwargs)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
@ -121,12 +142,7 @@ class PDFHandler(object):
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join(
[
froot.replace("page", "p"),
"_rotated",
fext
]
)
[froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
if infile.isEncrypted:
@ -140,10 +156,13 @@ class PDFHandler(object):
outfile.addPage(p)
with open(fpath, "wb") as f:
outfile.write(f)
layout, dimensions = get_page_layout(
fpath, **layout_kwargs)
return layout, dimensions, fpath
def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs=None,
**kwargs
self, flavor="lattice", suppress_stdout=False,
layout_kwargs=None, **kwargs
):
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
@ -168,19 +187,22 @@ class PDFHandler(object):
"""
layout_kwargs = layout_kwargs or {}
tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages:
self._save_page(self.filepath, p, tempdir)
pages = [
os.path.join(tempdir, "page-{0}.pdf".format(p))
for p in self.pages
]
parser = Lattice(**kwargs) \
if flavor == "lattice" else Stream(**kwargs)
for p in pages:
t = parser.extract_tables(
p, suppress_stdout=suppress_stdout,
layout_kwargs=layout_kwargs
)
tables.extend(t)
parser_obj = PARSERS[flavor]
parser = parser_obj(**kwargs)
# Read the layouts/dimensions of each of the pages we need to
# parse. This might require creating a temporary .pdf.
for page_idx in self.pages:
layout, dimensions, source_file = self._read_pdf_page(
page_idx,
layout_kwargs=layout_kwargs
)
parser._generate_layout(source_file, layout, dimensions,
page_idx, layout_kwargs)
t = parser.extract_tables(
source_file,
suppress_stdout=suppress_stdout
)
tables.extend(t)
return TableList(sorted(tables))

View File

@ -2,20 +2,28 @@
import os
from ..utils import get_page_layout, get_text_objects
from ..utils import (
get_text_objects
)
from ..core import Table
class BaseParser(object):
"""Defines a base parser.
"""
def __init__(self, parser_id):
self.id = parser_id
def _generate_layout(self, filename, layout_kwargs):
# For plotting details of parsing algorithms
self.debug_info = {}
def _generate_layout(self, filename, layout, dimensions,
page_idx, layout_kwargs):
self.filename = filename
self.layout_kwargs = layout_kwargs
self.layout, self.dimensions = get_page_layout(
filename,
**layout_kwargs
)
self.layout = layout
self.dimensions = dimensions
self.page = page_idx
self.images = get_text_objects(self.layout, ltype="image")
self.horizontal_text = get_text_objects(
self.layout,
@ -27,3 +35,25 @@ class BaseParser(object):
)
self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename)
"""Initialize new table object, ready to be populated
Parameters
----------
table_idx : int
Index of this table within the pdf page analyzed
cols : list
list of coordinate boundaries tuples (left, right)
rows : list
list of coordinate boundaries tuples (bottom, top)
Returns
-------
table : camelot.core.Table
"""
def _initialize_new_table(self, table_idx, cols, rows):
    """Create an empty table tagged with its page and order.

    Parameters
    ----------
    table_idx : int
        Index of this table within the pdf page analyzed
    cols : list
        list of coordinate boundaries tuples (left, right)
    rows : list
        list of coordinate boundaries tuples (bottom, top)

    Returns
    -------
    table : camelot.core.Table

    """
    new_table = Table(cols, rows)
    new_table.page = self.page
    # table_idx is incremented by one before being stored as the order.
    new_table.order = table_idx + 1
    return new_table

View File

@ -2,15 +2,20 @@
from __future__ import division
import os
import sys
import copy
import locale
import logging
import warnings
import subprocess
import numpy as np
import pandas as pd
from .base import BaseParser
from ..core import Table
from ..utils import (
build_file_path_in_temp_dir,
export_pdf_as_png,
scale_image,
scale_pdf,
segments_in_bbox,
@ -18,7 +23,6 @@ from ..utils import (
merge_close_lines,
get_table_index,
compute_accuracy,
compute_whitespace,
)
from ..image_processing import (
adaptive_threshold,
@ -110,13 +114,13 @@ class Lattice(BaseParser):
resolution=300,
**kwargs
):
shift_text = shift_text or ["l", "t"]
super().__init__("lattice")
self.table_regions = table_regions
self.table_areas = table_areas
self.process_background = process_background
self.line_scale = line_scale
self.copy_text = copy_text
self.shift_text = shift_text
self.shift_text = shift_text or ["l", "t"]
self.split_text = split_text
self.flag_size = flag_size
self.strip_text = strip_text
@ -126,6 +130,8 @@ class Lattice(BaseParser):
self.threshold_constant = threshold_constant
self.iterations = iterations
self.resolution = resolution
self.image_path = None
self.pdf_image = None
@staticmethod
def _reduce_index(t, idx, shift_text):
@ -205,18 +211,6 @@ class Lattice(BaseParser):
t.cells[i][j].text = t.cells[i - 1][j].text
return t
def _generate_image(self):
from ..ext.ghostscript import Ghostscript
self.imagename = "".join([self.rootname, ".png"])
gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
self.imagename, self.filename
)
gs_call = gs_call.encode().split()
null = open(os.devnull, "wb")
Ghostscript(*gs_call, stdout=null)
null.close()
def _generate_table_bbox(self):
def scale_areas(areas):
scaled_areas = []
@ -230,15 +224,20 @@ class Lattice(BaseParser):
scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
return scaled_areas
self.image, self.threshold = adaptive_threshold(
self.imagename,
self.image_path = build_file_path_in_temp_dir(
os.path.basename(self.filename),
".png"
)
export_pdf_as_png(self.filename, self.image_path)
self.pdf_image, self.threshold = adaptive_threshold(
self.image_path,
process_background=self.process_background,
blocksize=self.threshold_blocksize,
c=self.threshold_constant,
)
image_width = self.image.shape[1]
image_height = self.image.shape[0]
image_width = self.pdf_image.shape[1]
image_height = self.pdf_image.shape[0]
image_width_scaler = image_width / float(self.pdf_width)
image_height_scaler = image_height / float(self.pdf_height)
pdf_width_scaler = self.pdf_width / float(image_width)
@ -332,7 +331,7 @@ class Lattice(BaseParser):
if v_s is None or h_s is None:
raise ValueError("No segments found on {}".format(self.rootname))
table = Table(cols, rows)
table = self._initialize_new_table(table_idx, cols, rows)
# set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
# set table border edges to True
@ -360,6 +359,7 @@ class Lattice(BaseParser):
)
for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].text = text
# FRHTODO
accuracy = compute_accuracy([[100, pos_errors]])
if self.copy_text is not None:
@ -368,39 +368,27 @@ class Lattice(BaseParser):
copy_text=self.copy_text
)
data = table.data
table.df = pd.DataFrame(data)
table.shape = table.df.shape
whitespace = compute_whitespace(data)
table.flavor = "lattice"
table.record_metadata(self)
table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting
_text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text
table._image = (self.image, self.table_bbox_unscaled)
table._image = self.pdf_image # Reuse the image used for calc
table._bbox_unscaled = self.table_bbox_unscaled
table._segments = (self.vertical_segments, self.horizontal_segments)
table._textedges = None
return table
def extract_tables(
self,
filename,
suppress_stdout=False,
layout_kwargs=None
):
layout_kwargs = layout_kwargs or {}
self._generate_layout(filename, layout_kwargs)
def extract_tables(self, filename, suppress_stdout=False):
# FRHTODO: move extract table core to the base class
rootname = os.path.basename(self.rootname)
if not suppress_stdout:
logger.info("Processing {rootname}".format(rootname=rootname))
logger.info(
"Processing {rootname}".format(rootname=rootname))
if not self.horizontal_text:
if self.images:
@ -415,7 +403,6 @@ class Lattice(BaseParser):
)
return []
self._generate_image()
self._generate_table_bbox()
_tables = []

View File

@ -9,7 +9,7 @@ import numpy as np
import pandas as pd
from .base import BaseParser
from ..core import TextEdges, Table
from ..core import TextEdges
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
compute_whitespace)
@ -69,11 +69,9 @@ class Stream(BaseParser):
column_tol=0,
**kwargs
):
super().__init__("stream")
self.table_regions = table_regions
self.table_areas = table_areas
self.table_bbox = None
self.t_bbox = None
self.textedges = []
self.columns = columns
self._validate_columns()
self.split_text = split_text
@ -191,7 +189,8 @@ class Stream(BaseParser):
@staticmethod
def _join_rows(rows_grouped, text_y_max, text_y_min):
"""Makes row coordinates continuous.
"""Makes row coordinates continuous. For the row to "touch"
we split the existing gap between them in half.
Parameters
----------
@ -206,18 +205,20 @@ class Stream(BaseParser):
List of continuous row y-coordinate tuples.
"""
row_mids = [
sum((t.y0 + t.y1) / 2 for t in r) / len(r) if len(r) > 0 else 0
row_boundaries = [
[
max(t.y1 for t in r),
min(t.y0 for t in r)
]
for r in rows_grouped
]
rows = [
(row_mids[i] + row_mids[i - 1]) / 2
for i in range(1, len(row_mids))
]
rows.insert(0, text_y_max)
rows.append(text_y_min)
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
return rows
for i in range(0, len(row_boundaries)-1):
top_row = row_boundaries[i]
bottom_row = row_boundaries[i+1]
top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
row_boundaries[0][0] = text_y_max
row_boundaries[-1][1] = text_y_min
return row_boundaries
@staticmethod
def _add_columns(cols, text, row_tol):
@ -414,7 +415,7 @@ class Stream(BaseParser):
return cols, rows
def _generate_table(self, table_idx, cols, rows, **kwargs):
table = Table(cols, rows)
table = self._initialize_new_table(table_idx, cols, rows)
table = table.set_all_edges()
pos_errors = []
@ -436,32 +437,22 @@ class Stream(BaseParser):
table.cells[r_idx][c_idx].text = text
accuracy = compute_accuracy([[100, pos_errors]])
data = table.data
table.df = pd.DataFrame(data)
table.shape = table.df.shape
table.record_metadata(self)
whitespace = compute_whitespace(data)
table.flavor = "stream"
table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting
_text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text
table._image = None
table._bbox = self.table_bbox
table._segments = None
table._textedges = self.textedges
return table
def extract_tables(self, filename, suppress_stdout=False,
layout_kwargs=None):
layout_kwargs = layout_kwargs or {}
self._generate_layout(filename, layout_kwargs)
def extract_tables(self, filename, suppress_stdout=False):
if not suppress_stdout:
logger.info("Processing {}".format(
os.path.basename(self.rootname)))

View File

@ -68,11 +68,14 @@ class PlotMethods(object):
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1]
t[3] - t[1],
alpha=0.5
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig
def grid(self, table):
@ -100,6 +103,9 @@ class PlotMethods(object):
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
if cell.bottom:
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig
def contour(self, table):
@ -115,12 +121,13 @@ class PlotMethods(object):
fig : matplotlib.fig.Figure
"""
try:
img, table_bbox = table._image
_FOR_LATTICE = True
except TypeError:
img, table_bbox = (None, {table._bbox: None})
_FOR_LATTICE = False
img = table.get_pdf_image()
_FOR_LATTICE = table.flavor == "lattice"
if _FOR_LATTICE:
table_bbox = table._bbox_unscaled
else:
table_bbox = {table._bbox: None}
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
@ -150,6 +157,8 @@ class PlotMethods(object):
if _FOR_LATTICE:
ax.imshow(img)
else:
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig
def textedge(self, table):
@ -173,7 +182,8 @@ class PlotMethods(object):
ax.add_patch(
patches.Rectangle(
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
color="blue"
color="blue",
alpha=0.5
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
@ -182,6 +192,8 @@ class PlotMethods(object):
for te in table._textedges:
ax.plot([te.x, te.x], [te.y0, te.y1])
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig
def joint(self, table):
@ -197,7 +209,8 @@ class PlotMethods(object):
fig : matplotlib.fig.Figure
"""
img, table_bbox = table._image
img = table.get_pdf_image()
table_bbox = table._bbox_unscaled
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
x_coord = []
@ -230,4 +243,7 @@ class PlotMethods(object):
ax.plot([v[0], v[2]], [v[1], v[3]])
for h in horizontal:
ax.plot([h[0], h[2]], [h[1], h[3]])
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig

View File

@ -3,6 +3,7 @@ from __future__ import division
import re
import os
import atexit
import sys
import random
import shutil
@ -13,6 +14,7 @@ from itertools import groupby
from operator import itemgetter
import numpy as np
import pandas as pd
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
@ -29,6 +31,7 @@ from pdfminer.layout import (
LTImage,
)
from .ext.ghostscript import Ghostscript
# pylint: disable=import-error
# PyLint will evaluate both branches, and will necessarily complain about one
@ -150,13 +153,40 @@ def remove_extra(kwargs, flavor="lattice"):
# https://stackoverflow.com/a/22726782
# and https://stackoverflow.com/questions/10965479
class TemporaryDirectory(object):
    """Context manager yielding a temp directory that outlives the block.

    The directory is created on entry and is *not* removed when the
    ``with`` block ends: paths built inside it must stay usable after
    the block returns. Removal is deferred to interpreter exit.
    """

    def __enter__(self):
        self.name = tempfile.mkdtemp()
        # Only delete the temporary directory upon program exit.
        # ignore_errors keeps the exit hook quiet if the directory was
        # already removed by someone else.
        atexit.register(shutil.rmtree, self.name, ignore_errors=True)
        return self.name

    def __exit__(self, exc_type, exc_value, traceback):
        # Intentionally a no-op: cleanup is handled by the atexit hook.
        # Returning None lets any exception from the block propagate.
        pass
def build_file_path_in_temp_dir(filename, extension=None):
    """Build a path for ``filename`` inside a newly requested temp directory.

    Parameters
    ----------
    filename : str
        Name of the file to place in the temporary directory.
    extension : str, optional (default: None)
        Suffix appended to ``filename`` as-is when truthy.

    Returns
    -------
    file_path_in_temporary_dir : str

    """
    with TemporaryDirectory() as temp_dir:
        target_name = filename + extension if extension else filename
        return os.path.join(temp_dir, target_name)
def translate(x1, x2):
@ -387,6 +417,117 @@ def text_in_bbox(bbox, text):
return t_bbox
def bbox_from_text(textlines):
    """Compute the tightest bounding box enclosing every given text object.

    Parameters
    ----------
    textlines : List of PDFMiner text objects.

    Returns
    -------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space. None when ``textlines`` is empty.

    """
    if len(textlines) == 0:
        return None

    first = textlines[0]
    left, bottom, right, top = first.x0, first.y0, first.x1, first.y1
    # Grow the box so that it also covers each remaining textline.
    for textline in textlines[1:]:
        left = min(left, textline.x0)
        bottom = min(bottom, textline.y0)
        right = max(right, textline.x1)
        top = max(top, textline.y1)
    return (left, bottom, right, top)
def find_columns_coordinates(tls):
    """Given a list of text objects, guess columns boundaries and returns a
    list of x-coordinates for split points between columns.

    Note: ``tls`` is sorted in place by ``x0``.

    Parameters
    ----------
    tls : list of PDFMiner text object.

    Returns
    -------
    cols_anchors : list
        List of x-coordinates for columns: the left edge of the first
        column, the mid point of every gap between adjacent columns, and
        the right edge of the last column. Empty when ``tls`` is empty.

    """
    if not tls:
        return []

    # Make a list of disjunct cols boundaries across the textlines
    # that comprise the table.
    # [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
    cols_bounds = []
    tls.sort(key=lambda tl: tl.x0)
    for tl in tls:
        if (not cols_bounds) or cols_bounds[-1][1] < tl.x0:
            cols_bounds.append([tl.x0, tl.x1])
        else:
            # Overlaps the current column: extend its right boundary.
            cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)

    # From the column boundaries, identify splits by getting the mid points
    # between the boundaries.
    # Column boundaries: [ a ] [b] [ c ]
    # Splits:            |    |   |    |
    cols_anchors = [cols_bounds[0][0]]
    # Pair each column with its right-hand neighbour so that every gap,
    # including the one before the last column, yields a split.
    cols_anchors.extend(
        (left_col[1] + right_col[0]) / 2.0
        for left_col, right_col in zip(cols_bounds, cols_bounds[1:])
    )
    cols_anchors.append(cols_bounds[-1][1])
    return cols_anchors
def distance_tl_to_bbox(tl, bbox):
    """Measure the horizontal and vertical gaps separating a textline
    from a bbox.

    Parameters
    ----------
    tl : PDFMiner text object.
    bbox : tuple (x0, y0, x1, y1)

    Returns
    -------
    distance : tuple
        Tuple (horizontal distance, vertical distance). A component is 0
        when the textline and the bbox overlap along that axis.

    """
    # Horizontal gap: textline entirely left of the bbox, entirely right
    # of it, or sharing some x-range with it.
    h_distance = (
        bbox[0] - tl.x1 if tl.x1 <= bbox[0]
        else tl.x0 - bbox[2] if bbox[2] <= tl.x0
        else 0
    )
    # Vertical gap: textline entirely below the bbox, entirely above it,
    # or sharing some y-range with it.
    v_distance = (
        bbox[1] - tl.y1 if tl.y1 <= bbox[1]
        else tl.y0 - bbox[3] if bbox[3] <= tl.y0
        else 0
    )
    return (h_distance, v_distance)
def merge_close_lines(ar, line_tol=2):
"""Merges lines which are within a tolerance by calculating a
moving mean, based on their x or y axis projections.
@ -867,3 +1008,94 @@ def get_text_objects(layout, ltype="char", t=None):
except AttributeError:
pass
return t
def export_pdf_as_png(pdf_path, destination_path):
    """Generate an image from a pdf.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to render.
    destination_path : str
        Path of the PNG file to write.

    """
    # Build the argument list directly instead of formatting one string
    # and splitting it on whitespace, so that paths containing spaces
    # stay a single argument.
    gs_args = [
        "-q",
        "-sDEVICE=png16m",
        "-o",
        str(destination_path),
        "-r300",
        str(pdf_path),
    ]
    gs_call = [arg.encode() for arg in gs_args]
    # The context manager closes the devnull handle even if Ghostscript
    # raises.
    with open(os.devnull, "wb") as null:
        Ghostscript(*gs_call, stdout=null)
def compare_tables(left, right):
    """Compare two tables and displays differences in a human readable form.

    Prints a summary of the shape difference (if any), then the first
    differing column (when both tables have the same number of rows) or
    the first differing row (when they only share the number of
    columns). Nothing is returned.

    Parameters
    ----------
    left : data frame
    right : data frame

    """
    diff_cols = right.shape[1] - left.shape[1]
    diff_rows = right.shape[0] - left.shape[0]
    differences = []
    if diff_rows:
        differences.append(
            f"{abs(diff_rows)} "
            f"{'more' if diff_rows > 0 else 'fewer'} rows"
        )
    if diff_cols:
        differences.append(
            f"{abs(diff_cols)} "
            f"{'more' if diff_cols > 0 else 'fewer'} columns"
        )
    if differences:
        differences_str = " and ".join(differences)
        print(
            f"Right has {differences_str} than left "
            f"[{right.shape[0]},{right.shape[1]}] vs "
            f"[{left.shape[0]},{left.shape[1]}]"
        )

    table1, table2 = left, right
    name_table1, name_table2 = "left", "right"
    if not diff_rows:
        # Same number of rows: compare columns since they're of the same length
        if diff_cols > 0:
            # Use the longest table as a reference
            table1, table2 = table2, table1
            name_table1, name_table2 = name_table2, name_table1
        for i, col in enumerate(table1.columns):
            lcol = table1.iloc[:, i]
            if col in table2:
                scol = table2.iloc[:, i]
                if not lcol.equals(scol):
                    # lcol comes from table1 and scol from table2: label
                    # each column with the table it was read from.
                    diff_df = pd.DataFrame()
                    diff_df[name_table1] = lcol
                    diff_df[name_table2] = scol
                    diff_df["Match"] = lcol == scol
                    print(
                        f"Column {i} different:\n"
                        f"{diff_df}"
                    )
                    break
            else:
                print(f"Column {i} unique to {name_table1}: {lcol}")
                break
    elif not diff_cols:
        # Same number of cols: compare rows since they're of the same length
        if diff_rows > 0:
            # Use the longest table as a reference
            table1, table2 = table2, table1
            name_table1, name_table2 = name_table2, name_table1
        for position, (index, lrow) in enumerate(table1.iterrows()):
            # Bound by the number of rows of the shorter table and read
            # the matching row by position, like the column branch does.
            if position < table2.shape[0]:
                srow = table2.iloc[position, :]
                if not lrow.equals(srow):
                    # DataFrame.append is gone in pandas 2.x; build the
                    # two-row frame directly from the row Series.
                    diff_df = pd.DataFrame([lrow, srow]).reset_index(
                        drop=True
                    )
                    diff_df.insert(0, 'Table', [name_table1, name_table2])
                    print(f"Row {index} differs:")
                    print(diff_df.values)
                    break
            else:
                print(f"Row {index} unique to {name_table1}: {lrow}")
                break
    else:
        print("Tables have different shapes")

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.2 KiB

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.7 KiB

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 KiB

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 9.7 KiB

After

Width:  |  Height:  |  Size: 49 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.9 KiB

After

Width:  |  Height:  |  Size: 71 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 19 KiB

After

Width:  |  Height:  |  Size: 113 KiB