diff --git a/camelot/core.py b/camelot/core.py index 5fff3c6..1ce71ab 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -10,6 +10,15 @@ from operator import itemgetter import numpy as np import pandas as pd +from cv2 import cv2 + +from .utils import ( + build_file_path_in_temp_dir, + compute_accuracy, + compute_whitespace, + export_pdf_as_png +) + # minimum number of vertical textline intersections for a textedge # to be considered valid @@ -159,7 +168,10 @@ class TextEdges(object): # get vertical textedges that intersect maximum number of # times with horizontal textlines relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0] - return self._textedges[relevant_align] + return list(filter( + lambda te: te.is_valid, + self._textedges[relevant_align]) + ) def get_table_areas(self, textlines, relevant_textedges): """Returns a dict of interesting table areas on the PDF page @@ -179,7 +191,6 @@ class TextEdges(object): table_areas = {} for te in relevant_textedges: - if te.is_valid: if not table_areas: table_areas[(te.x, te.y0, te.x, te.y1)] = None else: @@ -225,7 +236,8 @@ class TextEdges(object): max(found[3], tl.y1), ) table_areas[updated_area] = None - average_textline_height = sum_textline_height / float(len(textlines)) + average_textline_height = sum_textline_height / \ + float(len(textlines)) # add some padding to table areas table_areas_padded = {} @@ -339,6 +351,8 @@ class Table(object): Accuracy with which text was assigned to the cell. whitespace : float Percentage of whitespace in the table. + filename : str + Path of the original PDF order : int Table number on PDF page. 
page : int @@ -356,8 +370,15 @@ class Table(object): self.shape = (0, 0) self.accuracy = 0 self.whitespace = 0 + self.filename = None self.order = None self.page = None + self.flavor = None # Flavor of the parser that generated the table + self.pdf_size = None # Dimensions of the original PDF page + self.debug_info = None # Field holding debug data + + self._image = None + self._image_path = None # Temporary file to hold an image of the pdf def __repr__(self): return "<{} shape={}>".format(self.__class__.__name__, self.shape) @@ -392,6 +413,32 @@ class Table(object): } return report + def record_metadata(self, parser): + """Record data about the origin of the table + """ + self.flavor = parser.id + self.filename = parser.filename + self.debug_info = parser.debug_info + data = self.data + self.df = pd.DataFrame(data) + self.shape = self.df.shape + + self.whitespace = compute_whitespace(data) + self.pdf_size = (parser.pdf_width, parser.pdf_height) + + def get_pdf_image(self): + """Compute pdf image and cache it + """ + if self._image is None: + if self._image_path is None: + self._image_path = build_file_path_in_temp_dir( + os.path.basename(self.filename), + ".png" + ) + export_pdf_as_png(self.filename, self._image_path) + self._image = cv2.imread(self._image_path) + return self._image + def set_all_edges(self): """Sets all table edges to True. 
""" diff --git a/camelot/handlers.py b/camelot/handlers.py index a689ee5..7a9f2ff 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter from .core import TableList from .parsers import Stream, Lattice from .utils import ( - TemporaryDirectory, + build_file_path_in_temp_dir, get_page_layout, get_text_objects, get_rotation, @@ -16,6 +16,11 @@ from .utils import ( download_url, ) +PARSERS = { + "lattice": Lattice, + "stream": Stream +} + class PDFHandler(object): """Handles all operations like temp directory creation, splitting @@ -89,31 +94,47 @@ class PDFHandler(object): P.extend(range(p["start"], p["end"] + 1)) return sorted(set(P)) - def _save_page(self, filepath, page, temp): - """Saves specified page from PDF into a temporary directory. + def _read_pdf_page(self, page=1, layout_kwargs=None): + """Saves specified page from PDF into a temporary directory. Removes + password protection and normalizes rotation. Parameters ---------- - filepath : str - Filepath or URL of the PDF file. page : int Page number. - temp : str - Tmp directory. + layout_kwargs : dict, optional (default: {}) + A dict of `pdfminer.layout.LAParams `_ kwargs. # noqa + + + Returns + ------- + layout : object + + dimensions : tuple + The dimensions of the pdf page + + filepath : str + The path of the single page PDF - either the original, or a + normalized version. """ - with open(filepath, "rb") as fileobj: + layout_kwargs = layout_kwargs or {} + with open(self.filepath, "rb") as fileobj: + # Normalize the pdf file, but skip if it's not encrypted or has + # only one page. 
infile = PdfFileReader(fileobj, strict=False) if infile.isEncrypted: infile.decrypt(self.password) - fpath = os.path.join(temp, "page-{0}.pdf".format(page)) + fpath = build_file_path_in_temp_dir( + "page-{page}.pdf".format(page=page)) froot, fext = os.path.splitext(fpath) p = infile.getPage(page - 1) outfile = PdfFileWriter() outfile.addPage(p) with open(fpath, "wb") as f: outfile.write(f) - layout, __ = get_page_layout(fpath) + layout, dimensions = get_page_layout( + fpath, **layout_kwargs) # fix rotated PDF chars = get_text_objects(layout, ltype="char") horizontal_text = get_text_objects(layout, ltype="horizontal_text") @@ -121,12 +142,7 @@ class PDFHandler(object): rotation = get_rotation(chars, horizontal_text, vertical_text) if rotation != "": fpath_new = "".join( - [ - froot.replace("page", "p"), - "_rotated", - fext - ] - ) + [froot.replace("page", "p"), "_rotated", fext]) os.rename(fpath, fpath_new) infile = PdfFileReader(open(fpath_new, "rb"), strict=False) if infile.isEncrypted: @@ -140,10 +156,13 @@ class PDFHandler(object): outfile.addPage(p) with open(fpath, "wb") as f: outfile.write(f) + layout, dimensions = get_page_layout( + fpath, **layout_kwargs) + return layout, dimensions, fpath def parse( - self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, - **kwargs + self, flavor="lattice", suppress_stdout=False, + layout_kwargs=None, **kwargs ): """Extracts tables by calling parser.get_tables on all single page PDFs. 
@@ -168,19 +187,22 @@ class PDFHandler(object): """ layout_kwargs = layout_kwargs or {} tables = [] - with TemporaryDirectory() as tempdir: - for p in self.pages: - self._save_page(self.filepath, p, tempdir) - pages = [ - os.path.join(tempdir, "page-{0}.pdf".format(p)) - for p in self.pages - ] - parser = Lattice(**kwargs) \ - if flavor == "lattice" else Stream(**kwargs) - for p in pages: - t = parser.extract_tables( - p, suppress_stdout=suppress_stdout, - layout_kwargs=layout_kwargs - ) - tables.extend(t) + + parser_obj = PARSERS[flavor] + parser = parser_obj(**kwargs) + + # Read the layouts/dimensions of each of the pages we need to + # parse. This might require creating a temporary .pdf. + for page_idx in self.pages: + layout, dimensions, source_file = self._read_pdf_page( + page_idx, + layout_kwargs=layout_kwargs + ) + parser._generate_layout(source_file, layout, dimensions, + page_idx, layout_kwargs) + t = parser.extract_tables( + source_file, + suppress_stdout=suppress_stdout + ) + tables.extend(t) return TableList(sorted(tables)) diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index 5713625..bd41fc3 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -2,20 +2,28 @@ import os -from ..utils import get_page_layout, get_text_objects +from ..utils import ( + get_text_objects +) +from ..core import Table class BaseParser(object): """Defines a base parser. 
""" + def __init__(self, parser_id): + self.id = parser_id - def _generate_layout(self, filename, layout_kwargs): + # For plotting details of parsing algorithms + self.debug_info = {} + + def _generate_layout(self, filename, layout, dimensions, + page_idx, layout_kwargs): self.filename = filename self.layout_kwargs = layout_kwargs - self.layout, self.dimensions = get_page_layout( - filename, - **layout_kwargs - ) + self.layout = layout + self.dimensions = dimensions + self.page = page_idx self.images = get_text_objects(self.layout, ltype="image") self.horizontal_text = get_text_objects( self.layout, @@ -27,3 +35,25 @@ class BaseParser(object): ) self.pdf_width, self.pdf_height = self.dimensions self.rootname, __ = os.path.splitext(self.filename) + + """Initialize new table object, ready to be populated + + Parameters + ---------- + table_idx : int + Index of this table within the pdf page analyzed + cols : list + list of coordinate boundaries tuples (left, right) + rows : list + list of coordinate boundaries tuples (bottom, top) + + Returns + ------- + table : camelot.core.Table + + """ + def _initialize_new_table(self, table_idx, cols, rows): + table = Table(cols, rows) + table.page = self.page + table.order = table_idx + 1 + return table diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 3a40f47..5bb130b 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -2,15 +2,20 @@ from __future__ import division import os +import sys import copy +import locale import logging import warnings +import subprocess +import numpy as np import pandas as pd from .base import BaseParser -from ..core import Table from ..utils import ( + build_file_path_in_temp_dir, + export_pdf_as_png, scale_image, scale_pdf, segments_in_bbox, @@ -18,7 +23,6 @@ from ..utils import ( merge_close_lines, get_table_index, compute_accuracy, - compute_whitespace, ) from ..image_processing import ( adaptive_threshold, @@ -110,13 +114,13 @@ class 
Lattice(BaseParser): resolution=300, **kwargs ): - shift_text = shift_text or ["l", "t"] + super().__init__("lattice") self.table_regions = table_regions self.table_areas = table_areas self.process_background = process_background self.line_scale = line_scale self.copy_text = copy_text - self.shift_text = shift_text + self.shift_text = shift_text or ["l", "t"] self.split_text = split_text self.flag_size = flag_size self.strip_text = strip_text @@ -126,6 +130,8 @@ class Lattice(BaseParser): self.threshold_constant = threshold_constant self.iterations = iterations self.resolution = resolution + self.image_path = None + self.pdf_image = None @staticmethod def _reduce_index(t, idx, shift_text): @@ -205,18 +211,6 @@ class Lattice(BaseParser): t.cells[i][j].text = t.cells[i - 1][j].text return t - def _generate_image(self): - from ..ext.ghostscript import Ghostscript - - self.imagename = "".join([self.rootname, ".png"]) - gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format( - self.imagename, self.filename - ) - gs_call = gs_call.encode().split() - null = open(os.devnull, "wb") - Ghostscript(*gs_call, stdout=null) - null.close() - def _generate_table_bbox(self): def scale_areas(areas): scaled_areas = [] @@ -230,15 +224,20 @@ class Lattice(BaseParser): scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) return scaled_areas - self.image, self.threshold = adaptive_threshold( - self.imagename, + self.image_path = build_file_path_in_temp_dir( + os.path.basename(self.filename), + ".png" + ) + export_pdf_as_png(self.filename, self.image_path) + self.pdf_image, self.threshold = adaptive_threshold( + self.image_path, process_background=self.process_background, blocksize=self.threshold_blocksize, c=self.threshold_constant, ) - image_width = self.image.shape[1] - image_height = self.image.shape[0] + image_width = self.pdf_image.shape[1] + image_height = self.pdf_image.shape[0] image_width_scaler = image_width / float(self.pdf_width) image_height_scaler = image_height / 
float(self.pdf_height) pdf_width_scaler = self.pdf_width / float(image_width) @@ -332,7 +331,7 @@ class Lattice(BaseParser): if v_s is None or h_s is None: raise ValueError("No segments found on {}".format(self.rootname)) - table = Table(cols, rows) + table = self._initialize_new_table(table_idx, cols, rows) # set table edges to True using ver+hor lines table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol) # set table border edges to True @@ -360,6 +359,7 @@ class Lattice(BaseParser): ) for r_idx, c_idx, text in indices: table.cells[r_idx][c_idx].text = text + # FRHTODO accuracy = compute_accuracy([[100, pos_errors]]) if self.copy_text is not None: @@ -368,39 +368,27 @@ class Lattice(BaseParser): copy_text=self.copy_text ) - data = table.data - table.df = pd.DataFrame(data) - table.shape = table.df.shape - - whitespace = compute_whitespace(data) - table.flavor = "lattice" + table.record_metadata(self) table.accuracy = accuracy - table.whitespace = whitespace - table.order = table_idx + 1 - table.page = int(os.path.basename(self.rootname).replace("page-", "")) # for plotting _text = [] _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) table._text = _text - table._image = (self.image, self.table_bbox_unscaled) + table._image = self.pdf_image # Reuse the image used for calc + table._bbox_unscaled = self.table_bbox_unscaled table._segments = (self.vertical_segments, self.horizontal_segments) table._textedges = None return table - def extract_tables( - self, - filename, - suppress_stdout=False, - layout_kwargs=None - ): - layout_kwargs = layout_kwargs or {} - self._generate_layout(filename, layout_kwargs) + def extract_tables(self, filename, suppress_stdout=False): + # FRHTODO: move extract table core to the base class rootname = os.path.basename(self.rootname) if not suppress_stdout: - logger.info("Processing {rootname}".format(rootname=rootname)) + logger.info( + 
"Processing {rootname}".format(rootname=rootname)) if not self.horizontal_text: if self.images: @@ -415,7 +403,6 @@ class Lattice(BaseParser): ) return [] - self._generate_image() self._generate_table_bbox() _tables = [] diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 4af0a0e..554e2f8 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -9,7 +9,7 @@ import numpy as np import pandas as pd from .base import BaseParser -from ..core import TextEdges, Table +from ..core import TextEdges from ..utils import (text_in_bbox, get_table_index, compute_accuracy, compute_whitespace) @@ -69,11 +69,9 @@ class Stream(BaseParser): column_tol=0, **kwargs ): + super().__init__("stream") self.table_regions = table_regions self.table_areas = table_areas - self.table_bbox = None - self.t_bbox = None - self.textedges = [] self.columns = columns self._validate_columns() self.split_text = split_text @@ -191,7 +189,8 @@ class Stream(BaseParser): @staticmethod def _join_rows(rows_grouped, text_y_max, text_y_min): - """Makes row coordinates continuous. + """Makes row coordinates continuous. For the row to "touch" + we split the existing gap between them in half. Parameters ---------- @@ -206,18 +205,20 @@ class Stream(BaseParser): List of continuous row y-coordinate tuples. 
""" - row_mids = [ - sum((t.y0 + t.y1) / 2 for t in r) / len(r) if len(r) > 0 else 0 + row_boundaries = [ + [ + max(t.y1 for t in r), + min(t.y0 for t in r) + ] for r in rows_grouped ] - rows = [ - (row_mids[i] + row_mids[i - 1]) / 2 - for i in range(1, len(row_mids)) - ] - rows.insert(0, text_y_max) - rows.append(text_y_min) - rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] - return rows + for i in range(0, len(row_boundaries)-1): + top_row = row_boundaries[i] + bottom_row = row_boundaries[i+1] + top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2 + row_boundaries[0][0] = text_y_max + row_boundaries[-1][1] = text_y_min + return row_boundaries @staticmethod def _add_columns(cols, text, row_tol): @@ -414,7 +415,7 @@ class Stream(BaseParser): return cols, rows def _generate_table(self, table_idx, cols, rows, **kwargs): - table = Table(cols, rows) + table = self._initialize_new_table(table_idx, cols, rows) table = table.set_all_edges() pos_errors = [] @@ -436,32 +437,22 @@ class Stream(BaseParser): table.cells[r_idx][c_idx].text = text accuracy = compute_accuracy([[100, pos_errors]]) - data = table.data - table.df = pd.DataFrame(data) - table.shape = table.df.shape + table.record_metadata(self) - whitespace = compute_whitespace(data) - table.flavor = "stream" table.accuracy = accuracy - table.whitespace = whitespace - table.order = table_idx + 1 - table.page = int(os.path.basename(self.rootname).replace("page-", "")) # for plotting _text = [] _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) table._text = _text - table._image = None + table._bbox = self.table_bbox table._segments = None table._textedges = self.textedges return table - def extract_tables(self, filename, suppress_stdout=False, - layout_kwargs=None): - layout_kwargs = layout_kwargs or {} - self._generate_layout(filename, layout_kwargs) + def extract_tables(self, filename, 
suppress_stdout=False): if not suppress_stdout: logger.info("Processing {}".format( os.path.basename(self.rootname))) diff --git a/camelot/plotting.py b/camelot/plotting.py index 51928e9..0782bb1 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -68,11 +68,14 @@ class PlotMethods(object): patches.Rectangle( (t[0], t[1]), t[2] - t[0], - t[3] - t[1] + t[3] - t[1], + alpha=0.5 ) ) ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10) + img = table.get_pdf_image() + ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) return fig def grid(self, table): @@ -100,6 +103,9 @@ class PlotMethods(object): ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]]) if cell.bottom: ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]]) + + img = table.get_pdf_image() + ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) return fig def contour(self, table): @@ -115,12 +121,13 @@ class PlotMethods(object): fig : matplotlib.fig.Figure """ - try: - img, table_bbox = table._image - _FOR_LATTICE = True - except TypeError: - img, table_bbox = (None, {table._bbox: None}) - _FOR_LATTICE = False + + img = table.get_pdf_image() + _FOR_LATTICE = table.flavor == "lattice" + if _FOR_LATTICE: + table_bbox = table._bbox_unscaled + else: + table_bbox = {table._bbox: None} fig = plt.figure() ax = fig.add_subplot(111, aspect="equal") @@ -150,6 +157,8 @@ class PlotMethods(object): if _FOR_LATTICE: ax.imshow(img) + else: + ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) return fig def textedge(self, table): @@ -173,7 +182,8 @@ class PlotMethods(object): ax.add_patch( patches.Rectangle( (t[0], t[1]), t[2] - t[0], t[3] - t[1], - color="blue" + color="blue", + alpha=0.5 ) ) ax.set_xlim(min(xs) - 10, max(xs) + 10) @@ -182,6 +192,8 @@ class PlotMethods(object): for te in table._textedges: ax.plot([te.x, te.x], [te.y0, te.y1]) + img = table.get_pdf_image() + ax.imshow(img, extent=(0, table.pdf_size[0], 0, 
table.pdf_size[1])) return fig def joint(self, table): @@ -197,7 +209,8 @@ class PlotMethods(object): fig : matplotlib.fig.Figure """ - img, table_bbox = table._image + img = table.get_pdf_image() + table_bbox = table._bbox_unscaled fig = plt.figure() ax = fig.add_subplot(111, aspect="equal") x_coord = [] @@ -230,4 +243,7 @@ class PlotMethods(object): ax.plot([v[0], v[2]], [v[1], v[3]]) for h in horizontal: ax.plot([h[0], h[2]], [h[1], h[3]]) + + img = table.get_pdf_image() + ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) return fig diff --git a/camelot/utils.py b/camelot/utils.py index c3bf723..89b6eee 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -3,6 +3,7 @@ from __future__ import division import re import os +import atexit import sys import random import shutil @@ -13,6 +14,7 @@ from itertools import groupby from operator import itemgetter import numpy as np +import pandas as pd from pdfminer.pdfparser import PDFParser from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfpage import PDFPage @@ -29,6 +31,7 @@ from pdfminer.layout import ( LTImage, ) +from .ext.ghostscript import Ghostscript # pylint: disable=import-error # PyLint will evaluate both branches, and will necessarily complain about one @@ -150,13 +153,40 @@ def remove_extra(kwargs, flavor="lattice"): # https://stackoverflow.com/a/22726782 +# and https://stackoverflow.com/questions/10965479 class TemporaryDirectory(object): def __enter__(self): self.name = tempfile.mkdtemp() + # Only delete the temporary directory upon + # program exit. 
+ atexit.register(shutil.rmtree, self.name) return self.name def __exit__(self, exc_type, exc_value, traceback): - shutil.rmtree(self.name) + pass + + +def build_file_path_in_temp_dir(filename, extension=None): + """Generates a new path within a temporary directory + + Parameters + ---------- + filename : str + extension : str + + Returns + ------- + file_path_in_temporary_dir : str + + """ + with TemporaryDirectory() as temp_dir: + if extension: + filename = filename + extension + path = os.path.join( + temp_dir, + filename + ) + return path def translate(x1, x2): @@ -387,6 +417,117 @@ def text_in_bbox(bbox, text): return t_bbox +def bbox_from_text(textlines): + """Returns the smallest bbox containing all the text objects passed as + parameters. + + Parameters + ---------- + textlines : List of PDFMiner text objects. + + Returns + ------- + bbox : tuple + Tuple (x1, y1, x2, y2) representing a bounding box where + (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate + space. + + """ + if len(textlines) == 0: + return None + bbox = ( + textlines[0].x0, + textlines[0].y0, + textlines[0].x1, + textlines[0].y1 + ) + + for tl in textlines[1:]: + bbox = ( + min(bbox[0], tl.x0), + min(bbox[1], tl.y0), + max(bbox[2], tl.x1), + max(bbox[3], tl.y1) + ) + return bbox + + +def find_columns_coordinates(tls): + """Given a list of text objects, guess column boundaries and return a + list of x-coordinates for split points between columns. + + Parameters + ---------- + tls : list of PDFMiner text objects. + + Returns + ------- + cols_anchors : list + List of x-coordinates for columns. + + """ + # Make a list of disjoint column boundaries across the textlines + # that comprise the table. + # [(1st col left, 1st col right), (2nd col left, 2nd col right), ...] 
+ cols_bounds = [] + tls.sort(key=lambda tl: tl.x0) + for tl in tls: + if (not cols_bounds) or cols_bounds[-1][1] < tl.x0: + cols_bounds.append([tl.x0, tl.x1]) + else: + cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1) + + # From the column boundaries, identify splits by getting the mid points + # between the boundaries. + # Col boundaries: [ a ] [b] [ c ] + # Splits: | | | | + cols_anchors = list(map( + lambda idx: (cols_bounds[idx-1][1] + cols_bounds[idx][0]) / 2.0, + range(1, len(cols_bounds)-1) + )) + cols_anchors.insert(0, cols_bounds[0][0]) + cols_anchors.append(cols_bounds[-1][1]) + return cols_anchors + + +def distance_tl_to_bbox(tl, bbox): + """Returns a tuple corresponding to the horizontal and vertical gaps + between a textline and a bbox. + + Parameters + ---------- + tl : PDFMiner text object. + bbox : tuple (x0, y0, x1, y1) + + Returns + ------- + distance : tuple + Tuple (horizontal distance, vertical distance) + + """ + v_distance, h_distance = None, None + if tl.x1 <= bbox[0]: + # tl to the left + h_distance = bbox[0] - tl.x1 + elif bbox[2] <= tl.x0: + # tl to the right + h_distance = tl.x0 - bbox[2] + else: + # tl overlaps the bbox along the x-axis: no horizontal gap + h_distance = 0 + + if tl.y1 <= bbox[1]: + # tl below + v_distance = bbox[1] - tl.y1 + elif bbox[3] <= tl.y0: + # tl above + v_distance = tl.y0 - bbox[3] + else: + # tl overlaps the bbox along the y-axis: no vertical gap + v_distance = 0 + return (h_distance, v_distance) + + def merge_close_lines(ar, line_tol=2): """Merges lines which are within a tolerance by calculating a moving mean, based on their x or y axis projections. @@ -867,3 +1008,94 @@ def get_text_objects(layout, ltype="char", t=None): except AttributeError: pass return t + + +def export_pdf_as_png(pdf_path, destination_path): + """Generate an image from a pdf. 
+ + Parameters + ---------- + pdf_path : str + destination_path : str + """ + gs_call = f"-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}" + gs_call = gs_call.encode().split() + null = open(os.devnull, "wb") + Ghostscript(*gs_call, stdout=null) + null.close() + + +def compare_tables(left, right): + """Compare two tables and displays differences in a human readable form. + + Parameters + ---------- + left : data frame + right : data frame + """ + diff_cols = right.shape[1]-left.shape[1] + diff_rows = right.shape[0]-left.shape[0] + differences = [] + if (diff_rows): + differences.append( + f"{abs(diff_rows)} " + f"{'more' if diff_rows>0 else 'fewer'} rows" + ) + if (diff_cols): + differences.append( + f"{abs(diff_cols)} " + f"{'more' if diff_cols>0 else 'fewer'} columns" + ) + if differences: + differences_str = " and ".join(differences) + print(f"Right has {differences_str} than left " + f"[{right.shape[0]},{right.shape[1]}] vs " + f"[{left.shape[0]},{left.shape[1]}]") + + table1, table2 = [left, right] + name_table1, name_table2 = ["left", "right"] + if not diff_rows: + # Same number of rows: compare columns since they're of the same length + if diff_cols > 0: + # Use the longest table as a reference + table1, table2 = table2, table1 + name_table1, name_table2 = name_table2, name_table1 + for i, col in enumerate(table1.columns): + lcol = table1.iloc[:, i] + if col in table2: + scol = table2.iloc[:, i] + if not lcol.equals(scol): + diff_df = pd.DataFrame() + diff_df[name_table1] = scol + diff_df[name_table2] = lcol + diff_df["Match"] = lcol == scol + print( + f"Column {i} different:\n" + f"{diff_df}" + ) + break + else: + print("Column {i} unique to {name_table1}: {lcol}") + break + elif not diff_cols: + # Same number of cols: compare rows since they're of the same length + if diff_rows > 0: + # Use the longest table as a reference + table1, table2 = table2, table1 + name_table1, name_table2 = name_table2, name_table1 + for index, lrow in 
table1.iterrows(): + if index < table2.shape[1]: + srow = table2.loc[index, :] + if not lrow.equals(srow): + diff_df = pd.DataFrame() + diff_df = diff_df.append(lrow, ignore_index=True) + diff_df = diff_df.append(srow, ignore_index=True) + diff_df.insert(0, 'Table', [name_table1, name_table2]) + print(f"Row {index} differs:") + print(diff_df.values) + break + else: + print(f"Row {index} unique to {name_table1}: {lrow}") + break + else: + print("Tables have different shapes") diff --git a/tests/files/baseline_plots/test_grid_plot.png b/tests/files/baseline_plots/test_grid_plot.png index 0607d15..87fe2aa 100644 Binary files a/tests/files/baseline_plots/test_grid_plot.png and b/tests/files/baseline_plots/test_grid_plot.png differ diff --git a/tests/files/baseline_plots/test_line_plot.png b/tests/files/baseline_plots/test_line_plot.png index 12c44c0..6ddeace 100644 Binary files a/tests/files/baseline_plots/test_line_plot.png and b/tests/files/baseline_plots/test_line_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_contour_plot.png b/tests/files/baseline_plots/test_stream_contour_plot.png index 958ea0a..d781439 100644 Binary files a/tests/files/baseline_plots/test_stream_contour_plot.png and b/tests/files/baseline_plots/test_stream_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_grid_plot.png b/tests/files/baseline_plots/test_stream_grid_plot.png index 818958c..b04a2f1 100644 Binary files a/tests/files/baseline_plots/test_stream_grid_plot.png and b/tests/files/baseline_plots/test_stream_grid_plot.png differ diff --git a/tests/files/baseline_plots/test_text_plot.png b/tests/files/baseline_plots/test_text_plot.png index 63b5520..497af37 100644 Binary files a/tests/files/baseline_plots/test_text_plot.png and b/tests/files/baseline_plots/test_text_plot.png differ diff --git a/tests/files/baseline_plots/test_textedge_plot.png b/tests/files/baseline_plots/test_textedge_plot.png index 1de4e9c..1c04473 100644 Binary files 
a/tests/files/baseline_plots/test_textedge_plot.png and b/tests/files/baseline_plots/test_textedge_plot.png differ