Refactor base classes and improve plotting

Move common code to base class to reduce duplication
Stream plots display pdf background for better context
pull/153/head
Frh 2020-04-18 23:03:27 -07:00
parent 816471e426
commit 697289e409
13 changed files with 447 additions and 122 deletions

View File

@ -10,6 +10,15 @@ from operator import itemgetter
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from cv2 import cv2
from .utils import (
build_file_path_in_temp_dir,
compute_accuracy,
compute_whitespace,
export_pdf_as_png
)
# minimum number of vertical textline intersections for a textedge # minimum number of vertical textline intersections for a textedge
# to be considered valid # to be considered valid
@ -159,7 +168,10 @@ class TextEdges(object):
# get vertical textedges that intersect maximum number of # get vertical textedges that intersect maximum number of
# times with horizontal textlines # times with horizontal textlines
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0] relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
return self._textedges[relevant_align] return list(filter(
lambda te: te.is_valid,
self._textedges[relevant_align])
)
def get_table_areas(self, textlines, relevant_textedges): def get_table_areas(self, textlines, relevant_textedges):
"""Returns a dict of interesting table areas on the PDF page """Returns a dict of interesting table areas on the PDF page
@ -179,7 +191,6 @@ class TextEdges(object):
table_areas = {} table_areas = {}
for te in relevant_textedges: for te in relevant_textedges:
if te.is_valid:
if not table_areas: if not table_areas:
table_areas[(te.x, te.y0, te.x, te.y1)] = None table_areas[(te.x, te.y0, te.x, te.y1)] = None
else: else:
@ -225,7 +236,8 @@ class TextEdges(object):
max(found[3], tl.y1), max(found[3], tl.y1),
) )
table_areas[updated_area] = None table_areas[updated_area] = None
average_textline_height = sum_textline_height / float(len(textlines)) average_textline_height = sum_textline_height / \
float(len(textlines))
# add some padding to table areas # add some padding to table areas
table_areas_padded = {} table_areas_padded = {}
@ -339,6 +351,8 @@ class Table(object):
Accuracy with which text was assigned to the cell. Accuracy with which text was assigned to the cell.
whitespace : float whitespace : float
Percentage of whitespace in the table. Percentage of whitespace in the table.
filename : str
Path of the original PDF
order : int order : int
Table number on PDF page. Table number on PDF page.
page : int page : int
@ -356,8 +370,15 @@ class Table(object):
self.shape = (0, 0) self.shape = (0, 0)
self.accuracy = 0 self.accuracy = 0
self.whitespace = 0 self.whitespace = 0
self.filename = None
self.order = None self.order = None
self.page = None self.page = None
self.flavor = None # Flavor of the parser that generated the table
self.pdf_size = None # Dimensions of the original PDF page
self.debug_info = None # Field holding debug data
self._image = None
self._image_path = None # Temporary file to hold an image of the pdf
def __repr__(self): def __repr__(self):
return "<{} shape={}>".format(self.__class__.__name__, self.shape) return "<{} shape={}>".format(self.__class__.__name__, self.shape)
@ -392,6 +413,32 @@ class Table(object):
} }
return report return report
def record_metadata(self, parser):
    """Store where this table came from and refresh its derived fields.

    Parameters
    ----------
    parser : object
        The parser that produced the table. Its ``id``, ``filename``,
        ``debug_info``, ``pdf_width`` and ``pdf_height`` attributes are
        read.

    """
    # Provenance copied straight from the parser.
    self.flavor = parser.id
    self.filename = parser.filename
    self.debug_info = parser.debug_info

    # Fields derived from the table content; ``self.data`` is read once
    # and reused for both the data frame and the whitespace computation.
    table_data = self.data
    self.df = pd.DataFrame(table_data)
    self.shape = self.df.shape
    self.whitespace = compute_whitespace(table_data)

    # Page dimensions as a (width, height) pair.
    page_width, page_height = parser.pdf_width, parser.pdf_height
    self.pdf_size = (page_width, page_height)
def get_pdf_image(self):
    """Return an image of the source PDF, rendering it on first use.

    The result is cached in ``self._image``, so later calls return the
    same object without touching the file system.

    Returns
    -------
    image : object
        Whatever ``cv2.imread`` returned for the rendered PNG.

    """
    # Fast path: the image was already loaded (or supplied by the parser).
    if self._image is not None:
        return self._image

    # Render the PDF to a PNG in a temporary directory, unless a
    # rendering location has already been recorded.
    if self._image_path is None:
        png_name = os.path.basename(self.filename)
        self._image_path = build_file_path_in_temp_dir(png_name, ".png")
        export_pdf_as_png(self.filename, self._image_path)

    self._image = cv2.imread(self._image_path)
    return self._image
def set_all_edges(self): def set_all_edges(self):
"""Sets all table edges to True. """Sets all table edges to True.
""" """

View File

@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList from .core import TableList
from .parsers import Stream, Lattice from .parsers import Stream, Lattice
from .utils import ( from .utils import (
TemporaryDirectory, build_file_path_in_temp_dir,
get_page_layout, get_page_layout,
get_text_objects, get_text_objects,
get_rotation, get_rotation,
@ -16,6 +16,11 @@ from .utils import (
download_url, download_url,
) )
# Registry mapping each supported ``flavor`` name to its parser class;
# looked up by flavor when a parser is instantiated.
PARSERS = {
"lattice": Lattice,
"stream": Stream
}
class PDFHandler(object): class PDFHandler(object):
"""Handles all operations like temp directory creation, splitting """Handles all operations like temp directory creation, splitting
@ -89,31 +94,47 @@ class PDFHandler(object):
P.extend(range(p["start"], p["end"] + 1)) P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P)) return sorted(set(P))
def _save_page(self, filepath, page, temp): def _read_pdf_page(self, page=1, layout_kwargs=None):
"""Saves specified page from PDF into a temporary directory. """Saves specified page from PDF into a temporary directory. Removes
password protection and normalizes rotation.
Parameters Parameters
---------- ----------
filepath : str
Filepath or URL of the PDF file.
page : int page : int
Page number. Page number.
temp : str layout_kwargs : dict, optional (default: {})
Tmp directory. A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs. # noqa
Returns
-------
layout : object
dimensions : tuple
The dimensions of the pdf page
filepath : str
The path of the single page PDF - either the original, or a
normalized version.
""" """
with open(filepath, "rb") as fileobj: layout_kwargs = layout_kwargs or {}
with open(self.filepath, "rb") as fileobj:
# Normalize the pdf file, but skip if it's not encrypted or has
# only one page.
infile = PdfFileReader(fileobj, strict=False) infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted: if infile.isEncrypted:
infile.decrypt(self.password) infile.decrypt(self.password)
fpath = os.path.join(temp, "page-{0}.pdf".format(page)) fpath = build_file_path_in_temp_dir(
"page-{page}.pdf".format(page=page))
froot, fext = os.path.splitext(fpath) froot, fext = os.path.splitext(fpath)
p = infile.getPage(page - 1) p = infile.getPage(page - 1)
outfile = PdfFileWriter() outfile = PdfFileWriter()
outfile.addPage(p) outfile.addPage(p)
with open(fpath, "wb") as f: with open(fpath, "wb") as f:
outfile.write(f) outfile.write(f)
layout, __ = get_page_layout(fpath) layout, dimensions = get_page_layout(
fpath, **layout_kwargs)
# fix rotated PDF # fix rotated PDF
chars = get_text_objects(layout, ltype="char") chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text") horizontal_text = get_text_objects(layout, ltype="horizontal_text")
@ -121,12 +142,7 @@ class PDFHandler(object):
rotation = get_rotation(chars, horizontal_text, vertical_text) rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "": if rotation != "":
fpath_new = "".join( fpath_new = "".join(
[ [froot.replace("page", "p"), "_rotated", fext])
froot.replace("page", "p"),
"_rotated",
fext
]
)
os.rename(fpath, fpath_new) os.rename(fpath, fpath_new)
infile = PdfFileReader(open(fpath_new, "rb"), strict=False) infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
if infile.isEncrypted: if infile.isEncrypted:
@ -140,10 +156,13 @@ class PDFHandler(object):
outfile.addPage(p) outfile.addPage(p)
with open(fpath, "wb") as f: with open(fpath, "wb") as f:
outfile.write(f) outfile.write(f)
layout, dimensions = get_page_layout(
fpath, **layout_kwargs)
return layout, dimensions, fpath
def parse( def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, self, flavor="lattice", suppress_stdout=False,
**kwargs layout_kwargs=None, **kwargs
): ):
"""Extracts tables by calling parser.get_tables on all single """Extracts tables by calling parser.get_tables on all single
page PDFs. page PDFs.
@ -168,19 +187,22 @@ class PDFHandler(object):
""" """
layout_kwargs = layout_kwargs or {} layout_kwargs = layout_kwargs or {}
tables = [] tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages: parser_obj = PARSERS[flavor]
self._save_page(self.filepath, p, tempdir) parser = parser_obj(**kwargs)
pages = [
os.path.join(tempdir, "page-{0}.pdf".format(p)) # Read the layouts/dimensions of each of the pages we need to
for p in self.pages # parse. This might require creating a temporary .pdf.
] for page_idx in self.pages:
parser = Lattice(**kwargs) \ layout, dimensions, source_file = self._read_pdf_page(
if flavor == "lattice" else Stream(**kwargs) page_idx,
for p in pages: layout_kwargs=layout_kwargs
t = parser.extract_tables( )
p, suppress_stdout=suppress_stdout, parser._generate_layout(source_file, layout, dimensions,
layout_kwargs=layout_kwargs page_idx, layout_kwargs)
) t = parser.extract_tables(
tables.extend(t) source_file,
suppress_stdout=suppress_stdout
)
tables.extend(t)
return TableList(sorted(tables)) return TableList(sorted(tables))

View File

@ -2,20 +2,28 @@
import os import os
from ..utils import get_page_layout, get_text_objects from ..utils import (
get_text_objects
)
from ..core import Table
class BaseParser(object): class BaseParser(object):
"""Defines a base parser. """Defines a base parser.
""" """
def __init__(self, parser_id):
self.id = parser_id
def _generate_layout(self, filename, layout_kwargs): # For plotting details of parsing algorithms
self.debug_info = {}
def _generate_layout(self, filename, layout, dimensions,
page_idx, layout_kwargs):
self.filename = filename self.filename = filename
self.layout_kwargs = layout_kwargs self.layout_kwargs = layout_kwargs
self.layout, self.dimensions = get_page_layout( self.layout = layout
filename, self.dimensions = dimensions
**layout_kwargs self.page = page_idx
)
self.images = get_text_objects(self.layout, ltype="image") self.images = get_text_objects(self.layout, ltype="image")
self.horizontal_text = get_text_objects( self.horizontal_text = get_text_objects(
self.layout, self.layout,
@ -27,3 +35,25 @@ class BaseParser(object):
) )
self.pdf_width, self.pdf_height = self.dimensions self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename) self.rootname, __ = os.path.splitext(self.filename)
"""Initialize new table object, ready to be populated
Parameters
----------
table_idx : int
Index of this table within the pdf page analyzed
cols : list
list of coordinate boundaries tuples (left, right)
rows : list
list of coordinate boundaries tuples (bottom, top)
Returns
-------
table : camelot.core.Table
"""
def _initialize_new_table(self, table_idx, cols, rows):
table = Table(cols, rows)
table.page = self.page
table.order = table_idx + 1
return table

View File

@ -2,15 +2,20 @@
from __future__ import division from __future__ import division
import os import os
import sys
import copy import copy
import locale
import logging import logging
import warnings import warnings
import subprocess
import numpy as np
import pandas as pd import pandas as pd
from .base import BaseParser from .base import BaseParser
from ..core import Table
from ..utils import ( from ..utils import (
build_file_path_in_temp_dir,
export_pdf_as_png,
scale_image, scale_image,
scale_pdf, scale_pdf,
segments_in_bbox, segments_in_bbox,
@ -18,7 +23,6 @@ from ..utils import (
merge_close_lines, merge_close_lines,
get_table_index, get_table_index,
compute_accuracy, compute_accuracy,
compute_whitespace,
) )
from ..image_processing import ( from ..image_processing import (
adaptive_threshold, adaptive_threshold,
@ -110,13 +114,13 @@ class Lattice(BaseParser):
resolution=300, resolution=300,
**kwargs **kwargs
): ):
shift_text = shift_text or ["l", "t"] super().__init__("lattice")
self.table_regions = table_regions self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.process_background = process_background self.process_background = process_background
self.line_scale = line_scale self.line_scale = line_scale
self.copy_text = copy_text self.copy_text = copy_text
self.shift_text = shift_text self.shift_text = shift_text or ["l", "t"]
self.split_text = split_text self.split_text = split_text
self.flag_size = flag_size self.flag_size = flag_size
self.strip_text = strip_text self.strip_text = strip_text
@ -126,6 +130,8 @@ class Lattice(BaseParser):
self.threshold_constant = threshold_constant self.threshold_constant = threshold_constant
self.iterations = iterations self.iterations = iterations
self.resolution = resolution self.resolution = resolution
self.image_path = None
self.pdf_image = None
@staticmethod @staticmethod
def _reduce_index(t, idx, shift_text): def _reduce_index(t, idx, shift_text):
@ -205,18 +211,6 @@ class Lattice(BaseParser):
t.cells[i][j].text = t.cells[i - 1][j].text t.cells[i][j].text = t.cells[i - 1][j].text
return t return t
def _generate_image(self):
from ..ext.ghostscript import Ghostscript
self.imagename = "".join([self.rootname, ".png"])
gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
self.imagename, self.filename
)
gs_call = gs_call.encode().split()
null = open(os.devnull, "wb")
Ghostscript(*gs_call, stdout=null)
null.close()
def _generate_table_bbox(self): def _generate_table_bbox(self):
def scale_areas(areas): def scale_areas(areas):
scaled_areas = [] scaled_areas = []
@ -230,15 +224,20 @@ class Lattice(BaseParser):
scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
return scaled_areas return scaled_areas
self.image, self.threshold = adaptive_threshold( self.image_path = build_file_path_in_temp_dir(
self.imagename, os.path.basename(self.filename),
".png"
)
export_pdf_as_png(self.filename, self.image_path)
self.pdf_image, self.threshold = adaptive_threshold(
self.image_path,
process_background=self.process_background, process_background=self.process_background,
blocksize=self.threshold_blocksize, blocksize=self.threshold_blocksize,
c=self.threshold_constant, c=self.threshold_constant,
) )
image_width = self.image.shape[1] image_width = self.pdf_image.shape[1]
image_height = self.image.shape[0] image_height = self.pdf_image.shape[0]
image_width_scaler = image_width / float(self.pdf_width) image_width_scaler = image_width / float(self.pdf_width)
image_height_scaler = image_height / float(self.pdf_height) image_height_scaler = image_height / float(self.pdf_height)
pdf_width_scaler = self.pdf_width / float(image_width) pdf_width_scaler = self.pdf_width / float(image_width)
@ -332,7 +331,7 @@ class Lattice(BaseParser):
if v_s is None or h_s is None: if v_s is None or h_s is None:
raise ValueError("No segments found on {}".format(self.rootname)) raise ValueError("No segments found on {}".format(self.rootname))
table = Table(cols, rows) table = self._initialize_new_table(table_idx, cols, rows)
# set table edges to True using ver+hor lines # set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol) table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
# set table border edges to True # set table border edges to True
@ -360,6 +359,7 @@ class Lattice(BaseParser):
) )
for r_idx, c_idx, text in indices: for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].text = text table.cells[r_idx][c_idx].text = text
# FRHTODO
accuracy = compute_accuracy([[100, pos_errors]]) accuracy = compute_accuracy([[100, pos_errors]])
if self.copy_text is not None: if self.copy_text is not None:
@ -368,39 +368,27 @@ class Lattice(BaseParser):
copy_text=self.copy_text copy_text=self.copy_text
) )
data = table.data table.record_metadata(self)
table.df = pd.DataFrame(data)
table.shape = table.df.shape
whitespace = compute_whitespace(data)
table.flavor = "lattice"
table.accuracy = accuracy table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting # for plotting
_text = [] _text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text table._text = _text
table._image = (self.image, self.table_bbox_unscaled) table._image = self.pdf_image # Reuse the image used for calc
table._bbox_unscaled = self.table_bbox_unscaled
table._segments = (self.vertical_segments, self.horizontal_segments) table._segments = (self.vertical_segments, self.horizontal_segments)
table._textedges = None table._textedges = None
return table return table
def extract_tables( def extract_tables(self, filename, suppress_stdout=False):
self, # FRHTODO: move extract table core to the base class
filename,
suppress_stdout=False,
layout_kwargs=None
):
layout_kwargs = layout_kwargs or {}
self._generate_layout(filename, layout_kwargs)
rootname = os.path.basename(self.rootname) rootname = os.path.basename(self.rootname)
if not suppress_stdout: if not suppress_stdout:
logger.info("Processing {rootname}".format(rootname=rootname)) logger.info(
"Processing {rootname}".format(rootname=rootname))
if not self.horizontal_text: if not self.horizontal_text:
if self.images: if self.images:
@ -415,7 +403,6 @@ class Lattice(BaseParser):
) )
return [] return []
self._generate_image()
self._generate_table_bbox() self._generate_table_bbox()
_tables = [] _tables = []

View File

@ -9,7 +9,7 @@ import numpy as np
import pandas as pd import pandas as pd
from .base import BaseParser from .base import BaseParser
from ..core import TextEdges, Table from ..core import TextEdges
from ..utils import (text_in_bbox, get_table_index, compute_accuracy, from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
compute_whitespace) compute_whitespace)
@ -69,11 +69,9 @@ class Stream(BaseParser):
column_tol=0, column_tol=0,
**kwargs **kwargs
): ):
super().__init__("stream")
self.table_regions = table_regions self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.table_bbox = None
self.t_bbox = None
self.textedges = []
self.columns = columns self.columns = columns
self._validate_columns() self._validate_columns()
self.split_text = split_text self.split_text = split_text
@ -191,7 +189,8 @@ class Stream(BaseParser):
@staticmethod @staticmethod
def _join_rows(rows_grouped, text_y_max, text_y_min): def _join_rows(rows_grouped, text_y_max, text_y_min):
"""Makes row coordinates continuous. """Makes row coordinates continuous. For the row to "touch"
we split the existing gap between them in half.
Parameters Parameters
---------- ----------
@ -206,18 +205,20 @@ class Stream(BaseParser):
List of continuous row y-coordinate tuples. List of continuous row y-coordinate tuples.
""" """
row_mids = [ row_boundaries = [
sum((t.y0 + t.y1) / 2 for t in r) / len(r) if len(r) > 0 else 0 [
max(t.y1 for t in r),
min(t.y0 for t in r)
]
for r in rows_grouped for r in rows_grouped
] ]
rows = [ for i in range(0, len(row_boundaries)-1):
(row_mids[i] + row_mids[i - 1]) / 2 top_row = row_boundaries[i]
for i in range(1, len(row_mids)) bottom_row = row_boundaries[i+1]
] top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
rows.insert(0, text_y_max) row_boundaries[0][0] = text_y_max
rows.append(text_y_min) row_boundaries[-1][1] = text_y_min
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] return row_boundaries
return rows
@staticmethod @staticmethod
def _add_columns(cols, text, row_tol): def _add_columns(cols, text, row_tol):
@ -414,7 +415,7 @@ class Stream(BaseParser):
return cols, rows return cols, rows
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, cols, rows, **kwargs):
table = Table(cols, rows) table = self._initialize_new_table(table_idx, cols, rows)
table = table.set_all_edges() table = table.set_all_edges()
pos_errors = [] pos_errors = []
@ -436,32 +437,22 @@ class Stream(BaseParser):
table.cells[r_idx][c_idx].text = text table.cells[r_idx][c_idx].text = text
accuracy = compute_accuracy([[100, pos_errors]]) accuracy = compute_accuracy([[100, pos_errors]])
data = table.data table.record_metadata(self)
table.df = pd.DataFrame(data)
table.shape = table.df.shape
whitespace = compute_whitespace(data)
table.flavor = "stream"
table.accuracy = accuracy table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting # for plotting
_text = [] _text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text table._text = _text
table._image = None table._bbox = self.table_bbox
table._segments = None table._segments = None
table._textedges = self.textedges table._textedges = self.textedges
return table return table
def extract_tables(self, filename, suppress_stdout=False, def extract_tables(self, filename, suppress_stdout=False):
layout_kwargs=None):
layout_kwargs = layout_kwargs or {}
self._generate_layout(filename, layout_kwargs)
if not suppress_stdout: if not suppress_stdout:
logger.info("Processing {}".format( logger.info("Processing {}".format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))

View File

@ -68,11 +68,14 @@ class PlotMethods(object):
patches.Rectangle( patches.Rectangle(
(t[0], t[1]), (t[0], t[1]),
t[2] - t[0], t[2] - t[0],
t[3] - t[1] t[3] - t[1],
alpha=0.5
) )
) )
ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10)
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig return fig
def grid(self, table): def grid(self, table):
@ -100,6 +103,9 @@ class PlotMethods(object):
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]]) ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
if cell.bottom: if cell.bottom:
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]]) ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig return fig
def contour(self, table): def contour(self, table):
@ -115,12 +121,13 @@ class PlotMethods(object):
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
try:
img, table_bbox = table._image img = table.get_pdf_image()
_FOR_LATTICE = True _FOR_LATTICE = table.flavor == "lattice"
except TypeError: if _FOR_LATTICE:
img, table_bbox = (None, {table._bbox: None}) table_bbox = table._bbox_unscaled
_FOR_LATTICE = False else:
table_bbox = {table._bbox: None}
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal") ax = fig.add_subplot(111, aspect="equal")
@ -150,6 +157,8 @@ class PlotMethods(object):
if _FOR_LATTICE: if _FOR_LATTICE:
ax.imshow(img) ax.imshow(img)
else:
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig return fig
def textedge(self, table): def textedge(self, table):
@ -173,7 +182,8 @@ class PlotMethods(object):
ax.add_patch( ax.add_patch(
patches.Rectangle( patches.Rectangle(
(t[0], t[1]), t[2] - t[0], t[3] - t[1], (t[0], t[1]), t[2] - t[0], t[3] - t[1],
color="blue" color="blue",
alpha=0.5
) )
) )
ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_xlim(min(xs) - 10, max(xs) + 10)
@ -182,6 +192,8 @@ class PlotMethods(object):
for te in table._textedges: for te in table._textedges:
ax.plot([te.x, te.x], [te.y0, te.y1]) ax.plot([te.x, te.x], [te.y0, te.y1])
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig return fig
def joint(self, table): def joint(self, table):
@ -197,7 +209,8 @@ class PlotMethods(object):
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
img, table_bbox = table._image img = table.get_pdf_image()
table_bbox = table._bbox_unscaled
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal") ax = fig.add_subplot(111, aspect="equal")
x_coord = [] x_coord = []
@ -230,4 +243,7 @@ class PlotMethods(object):
ax.plot([v[0], v[2]], [v[1], v[3]]) ax.plot([v[0], v[2]], [v[1], v[3]])
for h in horizontal: for h in horizontal:
ax.plot([h[0], h[2]], [h[1], h[3]]) ax.plot([h[0], h[2]], [h[1], h[3]])
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig return fig

View File

@ -3,6 +3,7 @@ from __future__ import division
import re import re
import os import os
import atexit
import sys import sys
import random import random
import shutil import shutil
@ -13,6 +14,7 @@ from itertools import groupby
from operator import itemgetter from operator import itemgetter
import numpy as np import numpy as np
import pandas as pd
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage from pdfminer.pdfpage import PDFPage
@ -29,6 +31,7 @@ from pdfminer.layout import (
LTImage, LTImage,
) )
from .ext.ghostscript import Ghostscript
# pylint: disable=import-error # pylint: disable=import-error
# PyLint will evaluate both branches, and will necessarily complain about one # PyLint will evaluate both branches, and will necessarily complain about one
@ -150,13 +153,40 @@ def remove_extra(kwargs, flavor="lattice"):
# https://stackoverflow.com/a/22726782 # https://stackoverflow.com/a/22726782
# and https://stackoverflow.com/questions/10965479
class TemporaryDirectory(object): class TemporaryDirectory(object):
def __enter__(self): def __enter__(self):
self.name = tempfile.mkdtemp() self.name = tempfile.mkdtemp()
# Only delete the temporary directory upon
# program exit.
atexit.register(shutil.rmtree, self.name)
return self.name return self.name
def __exit__(self, exc_type, exc_value, traceback): def __exit__(self, exc_type, exc_value, traceback):
shutil.rmtree(self.name) pass
def build_file_path_in_temp_dir(filename, extension=None):
    """Build a path for ``filename`` inside a newly created temporary
    directory.

    Parameters
    ----------
    filename : str
        Name of the file within the temporary directory.
    extension : str, optional (default: None)
        Suffix appended verbatim to ``filename`` when truthy (no dot is
        inserted, so pass e.g. ``".png"``).

    Returns
    -------
    file_path_in_temporary_dir : str

    """
    with TemporaryDirectory() as directory:
        target_name = filename + extension if extension else filename
        return os.path.join(directory, target_name)
def translate(x1, x2): def translate(x1, x2):
@ -387,6 +417,117 @@ def text_in_bbox(bbox, text):
return t_bbox return t_bbox
def bbox_from_text(textlines):
    """Returns the smallest bbox containing all the text objects passed as
    a parameter.

    Parameters
    ----------
    textlines : iterable of PDFMiner text objects.
        Each object must expose ``x0``, ``y0``, ``x1`` and ``y1``. Any
        iterable is accepted (list, tuple, generator, ...); it is
        consumed in a single pass.

    Returns
    -------
    bbox : tuple or None
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space, or None when ``textlines`` is empty.

    """
    bbox = None
    for tl in textlines:
        if bbox is None:
            # The first text object seeds the bounding box.
            bbox = (tl.x0, tl.y0, tl.x1, tl.y1)
        else:
            # Grow the box just enough to include this text object.
            bbox = (
                min(bbox[0], tl.x0),
                min(bbox[1], tl.y0),
                max(bbox[2], tl.x1),
                max(bbox[3], tl.y1),
            )
    return bbox
def find_columns_coordinates(tls):
    """Given a list of text objects, guess columns boundaries and returns a
    list of x-coordinates for split points between columns.

    Parameters
    ----------
    tls : list of PDFMiner text object.
        Each object must expose ``x0`` and ``x1``. The list is not
        modified.

    Returns
    -------
    cols_anchors : list
        List of x-coordinates for columns: the left edge of the first
        column, the mid point of every gap between two consecutive
        columns, and the right edge of the last column. Empty when
        ``tls`` is empty.

    """
    # Make a list of disjunct cols boundaries across the textlines
    # that comprise the table.
    # [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
    cols_bounds = []
    # Work on a sorted copy so the caller's list keeps its order.
    for tl in sorted(tls, key=lambda tl: tl.x0):
        if (not cols_bounds) or cols_bounds[-1][1] < tl.x0:
            # Gap with the previous column: start a new one.
            cols_bounds.append([tl.x0, tl.x1])
        else:
            # Overlap with the current column: extend it to the right.
            cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)

    if not cols_bounds:
        return []

    # From the column boundaries, identify splits by getting the mid
    # points between the boundaries.
    # Column boundaries: [ a ]  [b]  [   c   ]
    # Splits:            |     |    |         |
    cols_anchors = [cols_bounds[0][0]]
    # One split per pair of neighbouring columns, including the last pair.
    cols_anchors.extend(
        (previous[1] + current[0]) / 2.0
        for previous, current in zip(cols_bounds, cols_bounds[1:])
    )
    cols_anchors.append(cols_bounds[-1][1])
    return cols_anchors
def distance_tl_to_bbox(tl, bbox):
    """Measure the horizontal and vertical gaps separating a textline
    from a bbox.

    Parameters
    ----------
    tl : PDFMiner text object.
    bbox : tuple (x0, y0, x1, y1)

    Returns
    -------
    distance : tuple
        Tuple (horizontal distance, vertical distance). A component is 0
        when the textline and the bbox overlap along that axis.

    """
    # Horizontal gap: textline entirely left of the bbox, entirely right
    # of it, or sharing part of its x-range.
    horizontal_gap = (
        bbox[0] - tl.x1 if tl.x1 <= bbox[0]
        else tl.x0 - bbox[2] if bbox[2] <= tl.x0
        else 0
    )
    # Vertical gap: textline entirely below the bbox, entirely above it,
    # or sharing part of its y-range.
    vertical_gap = (
        bbox[1] - tl.y1 if tl.y1 <= bbox[1]
        else tl.y0 - bbox[3] if bbox[3] <= tl.y0
        else 0
    )
    return (horizontal_gap, vertical_gap)
def merge_close_lines(ar, line_tol=2): def merge_close_lines(ar, line_tol=2):
"""Merges lines which are within a tolerance by calculating a """Merges lines which are within a tolerance by calculating a
moving mean, based on their x or y axis projections. moving mean, based on their x or y axis projections.
@ -867,3 +1008,94 @@ def get_text_objects(layout, ltype="char", t=None):
except AttributeError: except AttributeError:
pass pass
return t return t
def export_pdf_as_png(pdf_path, destination_path):
    """Generate an image from a pdf.

    Renders ``pdf_path`` with Ghostscript's ``png16m`` device at 300 dpi
    and writes the result to ``destination_path``.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to render.
    destination_path : str
        Path of the PNG file to write.

    """
    # Build the argument vector element by element instead of splitting a
    # formatted string on whitespace, so that paths containing spaces
    # remain a single argument.
    gs_call = [
        b"-q",
        b"-sDEVICE=png16m",
        b"-o",
        str(destination_path).encode(),
        b"-r300",
        str(pdf_path).encode(),
    ]
    # The context manager closes the devnull handle even when Ghostscript
    # raises.
    with open(os.devnull, "wb") as null:
        Ghostscript(*gs_call, stdout=null)
def compare_tables(left, right):
    """Compare two tables and displays differences in a human readable form.

    A summary of the shape difference is printed first (if any). Then, when
    the tables have the same number of rows, columns are compared one by
    one; when they only have the same number of columns, rows are compared
    one by one. The comparison stops at the first difference found. Nothing
    is returned: all output goes to stdout.

    Parameters
    ----------
    left : data frame
    right : data frame

    """
    diff_cols = right.shape[1] - left.shape[1]
    diff_rows = right.shape[0] - left.shape[0]
    differences = []
    if diff_rows:
        differences.append(
            f"{abs(diff_rows)} "
            f"{'more' if diff_rows > 0 else 'fewer'} rows"
        )
    if diff_cols:
        differences.append(
            f"{abs(diff_cols)} "
            f"{'more' if diff_cols > 0 else 'fewer'} columns"
        )
    if differences:
        differences_str = " and ".join(differences)
        print(f"Right has {differences_str} than left "
              f"[{right.shape[0]},{right.shape[1]}] vs "
              f"[{left.shape[0]},{left.shape[1]}]")

    table1, table2 = left, right
    name_table1, name_table2 = "left", "right"
    if not diff_rows:
        # Same number of rows: compare columns since they're of the same length
        if diff_cols > 0:
            # Use the longest table as a reference
            table1, table2 = table2, table1
            name_table1, name_table2 = name_table2, name_table1
        for i, col in enumerate(table1.columns):
            lcol = table1.iloc[:, i]
            if col not in table2:
                print(f"Column {i} unique to {name_table1}: {lcol}")
                break
            scol = table2.iloc[:, i]
            if not lcol.equals(scol):
                # Each column is labelled with the table it comes from.
                diff_df = pd.DataFrame()
                diff_df[name_table1] = lcol
                diff_df[name_table2] = scol
                diff_df["Match"] = lcol == scol
                print(
                    f"Column {i} different:\n"
                    f"{diff_df}"
                )
                break
    elif not diff_cols:
        # Same number of cols: compare rows since they're of the same length
        if diff_rows > 0:
            # Use the longest table as a reference
            table1, table2 = table2, table1
            name_table1, name_table2 = name_table2, name_table1
        shared_rows = table2.shape[0]
        for position, (index, lrow) in enumerate(table1.iterrows()):
            if position >= shared_rows:
                # The shorter table has no row at this position.
                print(f"Row {index} unique to {name_table1}: {lrow}")
                break
            srow = table2.iloc[position]
            if not lrow.equals(srow):
                # Stack both rows into a two-line frame for display.
                diff_df = pd.DataFrame([lrow, srow]).reset_index(drop=True)
                diff_df.insert(0, 'Table', [name_table1, name_table2])
                print(f"Row {index} differs:")
                print(diff_df.values)
                break
    else:
        print("Tables have different shapes")

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.2 KiB

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.7 KiB

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 KiB

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 9.7 KiB

After

Width:  |  Height:  |  Size: 49 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.9 KiB

After

Width:  |  Height:  |  Size: 71 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 19 KiB

After

Width:  |  Height:  |  Size: 113 KiB