Refactor base classes and improve plotting

Move common code to the base class to reduce duplication. Stream plots now display the PDF background for better context.
2020-04-18 23:03:27 -07:00 · 2020-04-18 23:03:27 -07:00 · 697289e409
parent 816471e426
commit 697289e409
13 changed files with 447 additions and 122 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@ -10,6 +10,15 @@ from operator import itemgetter
 import numpy as np
 import pandas as pd
 from cv2 import cv2
 from .utils import (
    build_file_path_in_temp_dir,
    compute_accuracy,
    compute_whitespace,
    export_pdf_as_png
 )
 # minimum number of vertical textline intersections for a textedge
 # to be considered valid
@ -159,7 +168,10 @@ class TextEdges(object):
        # get vertical textedges that intersect maximum number of
        # times with horizontal textlines
        relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
-        return self._textedges[relevant_align]
+        return list(filter(
            lambda te: te.is_valid,
            self._textedges[relevant_align])
        )
    def get_table_areas(self, textlines, relevant_textedges):
        """Returns a dict of interesting table areas on the PDF page
@ -179,7 +191,6 @@ class TextEdges(object):
        table_areas = {}
        for te in relevant_textedges:
            if te.is_valid:
                if not table_areas:
                    table_areas[(te.x, te.y0, te.x, te.y1)] = None
                else:
@ -225,7 +236,8 @@ class TextEdges(object):
                    max(found[3], tl.y1),
                )
                table_areas[updated_area] = None
-        average_textline_height = sum_textline_height / float(len(textlines))
+        average_textline_height = sum_textline_height / \
            float(len(textlines))
        # add some padding to table areas
        table_areas_padded = {}
@ -339,6 +351,8 @@ class Table(object):
        Accuracy with which text was assigned to the cell.
    whitespace : float
        Percentage of whitespace in the table.
    filename : str
        Path of the original PDF
    order : int
        Table number on PDF page.
    page : int
@ -356,8 +370,15 @@ class Table(object):
        self.shape = (0, 0)
        self.accuracy = 0
        self.whitespace = 0
        self.filename = None
        self.order = None
        self.page = None
        self.flavor = None      # Flavor of the parser that generated the table
        self.pdf_size = None    # Dimensions of the original PDF page
        self.debug_info = None  # Field holding debug data
        self._image = None
        self._image_path = None  # Temporary file to hold an image of the pdf
    def __repr__(self):
        """Return a short description: the class name and the table shape."""
        class_name = self.__class__.__name__
        return f"<{class_name} shape={self.shape}>"
@ -392,6 +413,32 @@ class Table(object):
        }
        return report
    def record_metadata(self, parser):
        """Store where the table came from and stats derived from its data.

        Parameters
        ----------
        parser : object
            Parser that produced this table. Its ``id``, ``filename``,
            ``debug_info``, ``pdf_width`` and ``pdf_height`` attributes
            are read.

        """
        # Provenance, copied straight from the parser.
        self.flavor = parser.id
        self.filename = parser.filename
        self.debug_info = parser.debug_info

        # Values derived from the current cell contents.
        cell_data = self.data
        frame = pd.DataFrame(cell_data)
        self.df = frame
        self.shape = frame.shape
        self.whitespace = compute_whitespace(cell_data)

        self.pdf_size = (parser.pdf_width, parser.pdf_height)
    def get_pdf_image(self):
        """Return the image of the source pdf, building it on first use.

        When no image is cached, the pdf is exported as a png to a path in
        a temporary directory (unless such a path was already recorded),
        and the result of ``cv2.imread`` on that path is stored on the
        instance and returned.
        """
        if self._image is not None:
            return self._image
        if self._image_path is None:
            png_basename = os.path.basename(self.filename)
            self._image_path = build_file_path_in_temp_dir(
                png_basename, ".png")
            export_pdf_as_png(self.filename, self._image_path)
        self._image = cv2.imread(self._image_path)
        return self._image
    def set_all_edges(self):
        """Sets all table edges to True.
        """
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
 from .core import TableList
 from .parsers import Stream, Lattice
 from .utils import (
-    TemporaryDirectory,
+    build_file_path_in_temp_dir,
    get_page_layout,
    get_text_objects,
    get_rotation,
@ -16,6 +16,11 @@ from .utils import (
    download_url,
 )
 PARSERS = {
    "lattice": Lattice,
    "stream": Stream
 }
 class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
@ -89,31 +94,47 @@ class PDFHandler(object):
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))
-    def _save_page(self, filepath, page, temp):
+    def _read_pdf_page(self, page=1, layout_kwargs=None):
-        """Saves specified page from PDF into a temporary directory.
+        """Saves specified page from PDF into a temporary directory. Removes
        password protection and normalizes rotation.
        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        page : int
            Page number.
-        temp : str
+        layout_kwargs : dict, optional (default: {})
-            Tmp directory.
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.  # noqa
        Returns
        -------
        layout : object
        dimensions : tuple
            The dimensions of the pdf page
        filepath : str
            The path of the single page PDF - either the original, or a
            normalized version.
        """
-        with open(filepath, "rb") as fileobj:
+        layout_kwargs = layout_kwargs or {}
        with open(self.filepath, "rb") as fileobj:
            # Normalize the pdf file, but skip if it's not encrypted or has
            # only one page.
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
-            fpath = os.path.join(temp, "page-{0}.pdf".format(page))
+            fpath = build_file_path_in_temp_dir(
                "page-{page}.pdf".format(page=page))
            froot, fext = os.path.splitext(fpath)
            p = infile.getPage(page - 1)
            outfile = PdfFileWriter()
            outfile.addPage(p)
            with open(fpath, "wb") as f:
                outfile.write(f)
-            layout, __ = get_page_layout(fpath)
+            layout, dimensions = get_page_layout(
                fpath, **layout_kwargs)
            # fix rotated PDF
            chars = get_text_objects(layout, ltype="char")
            horizontal_text = get_text_objects(layout, ltype="horizontal_text")
@ -121,12 +142,7 @@ class PDFHandler(object):
            rotation = get_rotation(chars, horizontal_text, vertical_text)
            if rotation != "":
                fpath_new = "".join(
-                    [
+                    [froot.replace("page", "p"), "_rotated", fext])
                        froot.replace("page", "p"),
                        "_rotated",
                        fext
                    ]
                )
                os.rename(fpath, fpath_new)
                infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
                if infile.isEncrypted:
@ -140,10 +156,13 @@ class PDFHandler(object):
                outfile.addPage(p)
                with open(fpath, "wb") as f:
                    outfile.write(f)
                layout, dimensions = get_page_layout(
                    fpath, **layout_kwargs)
        return layout, dimensions, fpath
    def parse(
-        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None,
+        self, flavor="lattice", suppress_stdout=False,
-        **kwargs
+        layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.get_tables on all single
        page PDFs.
@ -168,19 +187,22 @@ class PDFHandler(object):
        """
        layout_kwargs = layout_kwargs or {}
        tables = []
-        with TemporaryDirectory() as tempdir:
+
-            for p in self.pages:
+        parser_obj = PARSERS[flavor]
-                self._save_page(self.filepath, p, tempdir)
+        parser = parser_obj(**kwargs)
-            pages = [
+
-                os.path.join(tempdir, "page-{0}.pdf".format(p))
+        # Read the layouts/dimensions of each of the pages we need to
-                for p in self.pages
+        # parse. This might require creating a temporary .pdf.
-            ]
+        for page_idx in self.pages:
-            parser = Lattice(**kwargs) \
+            layout, dimensions, source_file = self._read_pdf_page(
-                if flavor == "lattice" else Stream(**kwargs)
+                page_idx,
-            for p in pages:
+                layout_kwargs=layout_kwargs
-                t = parser.extract_tables(
+            )
-                    p, suppress_stdout=suppress_stdout,
+            parser._generate_layout(source_file, layout, dimensions,
-                    layout_kwargs=layout_kwargs
+                                page_idx, layout_kwargs)
-                )
+            t = parser.extract_tables(
-                tables.extend(t)
+                source_file,
                suppress_stdout=suppress_stdout
            )
            tables.extend(t)
        return TableList(sorted(tables))
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -2,20 +2,28 @@
 import os
-from ..utils import get_page_layout, get_text_objects
+from ..utils import (
    get_text_objects
 )
 from ..core import Table
 class BaseParser(object):
    """Defines a base parser.
    """
    def __init__(self, parser_id):
        self.id = parser_id
-    def _generate_layout(self, filename, layout_kwargs):
+        # For plotting details of parsing algorithms
        self.debug_info = {}
    def _generate_layout(self, filename, layout, dimensions,
                         page_idx, layout_kwargs):
        self.filename = filename
        self.layout_kwargs = layout_kwargs
-        self.layout, self.dimensions = get_page_layout(
+        self.layout = layout
-            filename,
+        self.dimensions = dimensions
-            **layout_kwargs
+        self.page = page_idx
        )
        self.images = get_text_objects(self.layout, ltype="image")
        self.horizontal_text = get_text_objects(
            self.layout,
@ -27,3 +35,25 @@ class BaseParser(object):
        )
        self.pdf_width, self.pdf_height = self.dimensions
        self.rootname, __ = os.path.splitext(self.filename)
    """Initialize new table object, ready to be populated
    Parameters
    ----------
    table_idx : int
        Index of this table within the pdf page analyzed
    cols : list
        list of coordinate boundaries tuples (left, right)
    rows : list
        list of coordinate boundaries tuples (bottom, top)
    Returns
    -------
    table : camelot.core.Table
    """
    def _initialize_new_table(self, table_idx, cols, rows):
        table = Table(cols, rows)
        table.page = self.page
        table.order = table_idx + 1
        return table
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -2,15 +2,20 @@
 from __future__ import division
 import os
 import sys
 import copy
 import locale
 import logging
 import warnings
 import subprocess
 import numpy as np
 import pandas as pd
 from .base import BaseParser
 from ..core import Table
 from ..utils import (
    build_file_path_in_temp_dir,
    export_pdf_as_png,
    scale_image,
    scale_pdf,
    segments_in_bbox,
@ -18,7 +23,6 @@ from ..utils import (
    merge_close_lines,
    get_table_index,
    compute_accuracy,
    compute_whitespace,
 )
 from ..image_processing import (
    adaptive_threshold,
@ -110,13 +114,13 @@ class Lattice(BaseParser):
        resolution=300,
        **kwargs
    ):
-        shift_text = shift_text or ["l", "t"]
+        super().__init__("lattice")
        self.table_regions = table_regions
        self.table_areas = table_areas
        self.process_background = process_background
        self.line_scale = line_scale
        self.copy_text = copy_text
-        self.shift_text = shift_text
+        self.shift_text = shift_text or ["l", "t"]
        self.split_text = split_text
        self.flag_size = flag_size
        self.strip_text = strip_text
@ -126,6 +130,8 @@ class Lattice(BaseParser):
        self.threshold_constant = threshold_constant
        self.iterations = iterations
        self.resolution = resolution
        self.image_path = None
        self.pdf_image = None
    @staticmethod
    def _reduce_index(t, idx, shift_text):
@ -205,18 +211,6 @@ class Lattice(BaseParser):
                                t.cells[i][j].text = t.cells[i - 1][j].text
        return t
    def _generate_image(self):
        """Render ``self.filename`` to a png with Ghostscript.

        The output path is ``self.rootname`` with a ``.png`` suffix and is
        stored in ``self.imagename``. Ghostscript's stdout is sent to the
        null device.
        """
        from ..ext.ghostscript import Ghostscript

        self.imagename = "".join([self.rootname, ".png"])
        # Build the argument vector explicitly instead of splitting a
        # formatted string, so that paths containing whitespace are kept as
        # single arguments. Order and encoding of arguments are unchanged.
        gs_call = [
            b"-q",
            b"-sDEVICE=png16m",
            b"-o",
            str(self.imagename).encode(),
            b"-r300",
            str(self.filename).encode(),
        ]
        # The context manager closes the handle even if Ghostscript raises.
        with open(os.devnull, "wb") as null:
            Ghostscript(*gs_call, stdout=null)
    def _generate_table_bbox(self):
        def scale_areas(areas):
            scaled_areas = []
@ -230,15 +224,20 @@ class Lattice(BaseParser):
                scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
            return scaled_areas
-        self.image, self.threshold = adaptive_threshold(
+        self.image_path = build_file_path_in_temp_dir(
-            self.imagename,
+            os.path.basename(self.filename),
            ".png"
        )
        export_pdf_as_png(self.filename, self.image_path)
        self.pdf_image, self.threshold = adaptive_threshold(
            self.image_path,
            process_background=self.process_background,
            blocksize=self.threshold_blocksize,
            c=self.threshold_constant,
        )
-        image_width = self.image.shape[1]
+        image_width = self.pdf_image.shape[1]
-        image_height = self.image.shape[0]
+        image_height = self.pdf_image.shape[0]
        image_width_scaler = image_width / float(self.pdf_width)
        image_height_scaler = image_height / float(self.pdf_height)
        pdf_width_scaler = self.pdf_width / float(image_width)
@ -332,7 +331,7 @@ class Lattice(BaseParser):
        if v_s is None or h_s is None:
            raise ValueError("No segments found on {}".format(self.rootname))
-        table = Table(cols, rows)
+        table = self._initialize_new_table(table_idx, cols, rows)
        # set table edges to True using ver+hor lines
        table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
        # set table border edges to True
@ -360,6 +359,7 @@ class Lattice(BaseParser):
                    )
                    for r_idx, c_idx, text in indices:
                        table.cells[r_idx][c_idx].text = text
        # FRHTODO
        accuracy = compute_accuracy([[100, pos_errors]])
        if self.copy_text is not None:
@ -368,39 +368,27 @@ class Lattice(BaseParser):
                copy_text=self.copy_text
            )
-        data = table.data
+        table.record_metadata(self)
        table.df = pd.DataFrame(data)
        table.shape = table.df.shape
        whitespace = compute_whitespace(data)
        table.flavor = "lattice"
        table.accuracy = accuracy
        table.whitespace = whitespace
        table.order = table_idx + 1
        table.page = int(os.path.basename(self.rootname).replace("page-", ""))
        # for plotting
        _text = []
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
        table._text = _text
-        table._image = (self.image, self.table_bbox_unscaled)
+        table._image = self.pdf_image  # Reuse the image used for calc
        table._bbox_unscaled = self.table_bbox_unscaled
        table._segments = (self.vertical_segments, self.horizontal_segments)
        table._textedges = None
        return table
-    def extract_tables(
+    def extract_tables(self, filename, suppress_stdout=False):
-        self,
+        # FRHTODO: move extract table core to the base class
        filename,
        suppress_stdout=False,
        layout_kwargs=None
    ):
        layout_kwargs = layout_kwargs or {}
        self._generate_layout(filename, layout_kwargs)
        rootname = os.path.basename(self.rootname)
        if not suppress_stdout:
-            logger.info("Processing {rootname}".format(rootname=rootname))
+            logger.info(
                "Processing {rootname}".format(rootname=rootname))
        if not self.horizontal_text:
            if self.images:
@ -415,7 +403,6 @@ class Lattice(BaseParser):
                )
            return []
        self._generate_image()
        self._generate_table_bbox()
        _tables = []
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -9,7 +9,7 @@ import numpy as np
 import pandas as pd
 from .base import BaseParser
-from ..core import TextEdges, Table
+from ..core import TextEdges
 from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
                     compute_whitespace)
@ -69,11 +69,9 @@ class Stream(BaseParser):
        column_tol=0,
        **kwargs
    ):
        super().__init__("stream")
        self.table_regions = table_regions
        self.table_areas = table_areas
        self.table_bbox = None
        self.t_bbox = None
        self.textedges = []
        self.columns = columns
        self._validate_columns()
        self.split_text = split_text
@ -191,7 +189,8 @@ class Stream(BaseParser):
    @staticmethod
    def _join_rows(rows_grouped, text_y_max, text_y_min):
-        """Makes row coordinates continuous.
+        """Makes row coordinates continuous. For the row to "touch"
        we split the existing gap between them in half.
        Parameters
        ----------
@ -206,18 +205,20 @@ class Stream(BaseParser):
            List of continuous row y-coordinate tuples.
        """
-        row_mids = [
+        row_boundaries = [
-            sum((t.y0 + t.y1) / 2 for t in r) / len(r) if len(r) > 0 else 0
+            [
                max(t.y1 for t in r),
                min(t.y0 for t in r)
            ]
            for r in rows_grouped
        ]
-        rows = [
+        for i in range(0, len(row_boundaries)-1):
-            (row_mids[i] + row_mids[i - 1]) / 2
+            top_row = row_boundaries[i]
-            for i in range(1, len(row_mids))
+            bottom_row = row_boundaries[i+1]
-        ]
+            top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
-        rows.insert(0, text_y_max)
+        row_boundaries[0][0] = text_y_max
-        rows.append(text_y_min)
+        row_boundaries[-1][1] = text_y_min
-        rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
+        return row_boundaries
        return rows
    @staticmethod
    def _add_columns(cols, text, row_tol):
@ -414,7 +415,7 @@ class Stream(BaseParser):
        return cols, rows
    def _generate_table(self, table_idx, cols, rows, **kwargs):
-        table = Table(cols, rows)
+        table = self._initialize_new_table(table_idx, cols, rows)
        table = table.set_all_edges()
        pos_errors = []
@ -436,32 +437,22 @@ class Stream(BaseParser):
                        table.cells[r_idx][c_idx].text = text
        accuracy = compute_accuracy([[100, pos_errors]])
-        data = table.data
+        table.record_metadata(self)
        table.df = pd.DataFrame(data)
        table.shape = table.df.shape
        whitespace = compute_whitespace(data)
        table.flavor = "stream"
        table.accuracy = accuracy
        table.whitespace = whitespace
        table.order = table_idx + 1
        table.page = int(os.path.basename(self.rootname).replace("page-", ""))
        # for plotting
        _text = []
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
        table._text = _text
-        table._image = None
+        table._bbox = self.table_bbox
        table._segments = None
        table._textedges = self.textedges
        return table
-    def extract_tables(self, filename, suppress_stdout=False,
+    def extract_tables(self, filename, suppress_stdout=False):
                       layout_kwargs=None):
        layout_kwargs = layout_kwargs or {}
        self._generate_layout(filename, layout_kwargs)
        if not suppress_stdout:
            logger.info("Processing {}".format(
                os.path.basename(self.rootname)))
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -68,11 +68,14 @@ class PlotMethods(object):
                patches.Rectangle(
                        (t[0], t[1]),
                        t[2] - t[0],
-                        t[3] - t[1]
+                        t[3] - t[1],
                        alpha=0.5
                    )
                )
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
        ax.set_ylim(min(ys) - 10, max(ys) + 10)
        img = table.get_pdf_image()
        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig
    def grid(self, table):
@ -100,6 +103,9 @@ class PlotMethods(object):
                    ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
                if cell.bottom:
                    ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
        img = table.get_pdf_image()
        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig
    def contour(self, table):
@ -115,12 +121,13 @@ class PlotMethods(object):
        fig : matplotlib.fig.Figure
        """
-        try:
+
-            img, table_bbox = table._image
+        img = table.get_pdf_image()
-            _FOR_LATTICE = True
+        _FOR_LATTICE = table.flavor == "lattice"
-        except TypeError:
+        if _FOR_LATTICE:
-            img, table_bbox = (None, {table._bbox: None})
+            table_bbox = table._bbox_unscaled
-            _FOR_LATTICE = False
+        else:
            table_bbox = {table._bbox: None}
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")
@ -150,6 +157,8 @@ class PlotMethods(object):
        if _FOR_LATTICE:
            ax.imshow(img)
        else:
            ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig
    def textedge(self, table):
@ -173,7 +182,8 @@ class PlotMethods(object):
            ax.add_patch(
                patches.Rectangle(
                    (t[0], t[1]), t[2] - t[0], t[3] - t[1],
-                    color="blue"
+                    color="blue",
                    alpha=0.5
                )
            )
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
@ -182,6 +192,8 @@ class PlotMethods(object):
        for te in table._textedges:
            ax.plot([te.x, te.x], [te.y0, te.y1])
        img = table.get_pdf_image()
        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig
    def joint(self, table):
@ -197,7 +209,8 @@ class PlotMethods(object):
        fig : matplotlib.fig.Figure
        """
-        img, table_bbox = table._image
+        img = table.get_pdf_image()
        table_bbox = table._bbox_unscaled
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")
        x_coord = []
@ -230,4 +243,7 @@ class PlotMethods(object):
            ax.plot([v[0], v[2]], [v[1], v[3]])
        for h in horizontal:
            ax.plot([h[0], h[2]], [h[1], h[3]])
        img = table.get_pdf_image()
        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -3,6 +3,7 @@ from __future__ import division
 import re
 import os
 import atexit
 import sys
 import random
 import shutil
@ -13,6 +14,7 @@ from itertools import groupby
 from operator import itemgetter
 import numpy as np
 import pandas as pd
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@ -29,6 +31,7 @@ from pdfminer.layout import (
    LTImage,
 )
 from .ext.ghostscript import Ghostscript
 # pylint: disable=import-error
 # PyLint will evaluate both branches, and will necessarily complain about one
@ -150,13 +153,40 @@ def remove_extra(kwargs, flavor="lattice"):
 # https://stackoverflow.com/a/22726782
 # and https://stackoverflow.com/questions/10965479
 class TemporaryDirectory(object):
    def __enter__(self):
        self.name = tempfile.mkdtemp()
        # Only delete the temporary directory upon
        # program exit.
        atexit.register(shutil.rmtree, self.name)
        return self.name
    def __exit__(self, exc_type, exc_value, traceback):
-        shutil.rmtree(self.name)
+        pass
 def build_file_path_in_temp_dir(filename, extension=None):
     """Build a path for a file located in a temporary directory.

     Parameters
     ----------
     filename : str
         Name of the file inside the temporary directory.
     extension : str, optional
         Appended verbatim to ``filename`` when truthy.

     Returns
     -------
     file_path_in_temporary_dir : str

     """
     with TemporaryDirectory() as temp_dir:
         target_name = filename + extension if extension else filename
         return os.path.join(temp_dir, target_name)
 def translate(x1, x2):
@ -387,6 +417,117 @@ def text_in_bbox(bbox, text):
    return t_bbox
 def bbox_from_text(textlines):
     """Compute the tightest bounding box around a list of text objects.

     Parameters
     ----------
     textlines : list
         Text objects exposing ``x0``, ``y0``, ``x1`` and ``y1``.

     Returns
     -------
     bbox : tuple or None
         Tuple (x1, y1, x2, y2) where (x1, y1) is the left-bottom corner
         and (x2, y2) the right-top corner, or None when ``textlines`` is
         empty.

     """
     if len(textlines) == 0:
         return None

     head = textlines[0]
     left, bottom, right, top = head.x0, head.y0, head.x1, head.y1
     # Grow the box so it also covers every remaining textline.
     for textline in textlines[1:]:
         left = min(left, textline.x0)
         bottom = min(bottom, textline.y0)
         right = max(right, textline.x1)
         top = max(top, textline.y1)
     return (left, bottom, right, top)
 def find_columns_coordinates(tls):
     """Given a list of text objects, guess columns boundaries and returns a
     list of x-coordinates for split points between columns.

     Parameters
     ----------
     tls : list of PDFMiner text object.
         Sorted in place by ``x0`` as a side effect.

     Returns
     -------
     cols_anchors : list
         List of x-coordinates for columns: the left edge of the first
         column, the mid point of every gap between two consecutive
         columns, and the right edge of the last column. Empty when
         ``tls`` is empty.

     """
     if not tls:
         # Nothing to split; avoids indexing into an empty bounds list.
         return []

     # Make a list of disjunct cols boundaries across the textlines
     # that comprise the table.
     # [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
     cols_bounds = []
     tls.sort(key=lambda tl: tl.x0)
     for tl in tls:
         if (not cols_bounds) or cols_bounds[-1][1] < tl.x0:
             cols_bounds.append([tl.x0, tl.x1])
         else:
             cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)

     # From the column boundaries, identify splits by getting the mid
     # points between the boundaries.
     # Col boundaries: [ a ]        [b]    [   c   ]
     # Splits:         |        |        |         |
     # Every pair of neighbouring columns contributes one split, including
     # the last pair.
     cols_anchors = [cols_bounds[0][0]]
     cols_anchors.extend(
         (left_col[1] + right_col[0]) / 2.0
         for left_col, right_col in zip(cols_bounds, cols_bounds[1:])
     )
     cols_anchors.append(cols_bounds[-1][1])
     return cols_anchors
 def distance_tl_to_bbox(tl, bbox):
     """Measure the horizontal and vertical gaps between a textline and a
     bbox.

     Parameters
     ----------
     tl : PDFMiner text object.
     bbox : tuple (x0, y0, x1, y1)

     Returns
     -------
     distance : tuple
         Tuple (horizontal distance, vertical distance). A component is 0
         when the textline and the bbox overlap along that axis.

     """
     # Horizontal gap: textline left of the bbox, right of it, or sharing
     # part of its x-range.
     h_distance = (
         bbox[0] - tl.x1 if tl.x1 <= bbox[0]
         else tl.x0 - bbox[2] if bbox[2] <= tl.x0
         else 0
     )
     # Vertical gap: textline below the bbox, above it, or sharing part of
     # its y-range.
     v_distance = (
         bbox[1] - tl.y1 if tl.y1 <= bbox[1]
         else tl.y0 - bbox[3] if bbox[3] <= tl.y0
         else 0
     )
     return (h_distance, v_distance)
 def merge_close_lines(ar, line_tol=2):
    """Merges lines which are within a tolerance by calculating a
    moving mean, based on their x or y axis projections.
@ -867,3 +1008,94 @@ def get_text_objects(layout, ltype="char", t=None):
    except AttributeError:
        pass
    return t
 def export_pdf_as_png(pdf_path, destination_path):
     """Generate an image from a pdf.

     Ghostscript is invoked with the ``png16m`` device at 300 dpi and its
     stdout is sent to the null device.

     Parameters
     ----------
     pdf_path : str
         Path of the pdf to render.
     destination_path : str
         Path of the png file to write.

     """
     # Build the argument vector explicitly instead of splitting a formatted
     # string, so that paths containing whitespace are kept as single
     # arguments. Order and encoding of arguments are unchanged.
     gs_call = [
         b"-q",
         b"-sDEVICE=png16m",
         b"-o",
         str(destination_path).encode(),
         b"-r300",
         str(pdf_path).encode(),
     ]
     # The context manager closes the handle even if Ghostscript raises.
     with open(os.devnull, "wb") as null:
         Ghostscript(*gs_call, stdout=null)
 def compare_tables(left, right):
     """Compare two tables and displays differences in a human readable form.

     A summary of the shape difference is printed first. Then, when the
     tables have the same number of rows, columns are compared one by one;
     otherwise, when they have the same number of columns, rows are
     compared one by one. Only the first difference found is printed.
     Nothing is returned.

     Parameters
     ----------
     left : data frame
     right : data frame

     """
     diff_cols = right.shape[1] - left.shape[1]
     diff_rows = right.shape[0] - left.shape[0]
     differences = []
     if diff_rows:
         differences.append(
             f"{abs(diff_rows)} "
             f"{'more' if diff_rows > 0 else 'fewer'} rows"
         )
     if diff_cols:
         differences.append(
             f"{abs(diff_cols)} "
             f"{'more' if diff_cols > 0 else 'fewer'} columns"
         )
     if differences:
         differences_str = " and ".join(differences)
         print(f"Right has {differences_str} than left "
               f"[{right.shape[0]},{right.shape[1]}] vs "
               f"[{left.shape[0]},{left.shape[1]}]")

     table1, table2 = left, right
     name_table1, name_table2 = "left", "right"
     if not diff_rows:
         # Same number of rows: compare columns since they're of the same length
         if diff_cols > 0:
             # Use the longest table as a reference
             table1, table2 = table2, table1
             name_table1, name_table2 = name_table2, name_table1
         for i, col in enumerate(table1.columns):
             lcol = table1.iloc[:, i]
             if col in table2:
                 scol = table2.iloc[:, i]
                 if not lcol.equals(scol):
                     # lcol comes from table1 and scol from table2: label
                     # each column with the table it was read from.
                     diff_df = pd.DataFrame()
                     diff_df[name_table1] = lcol
                     diff_df[name_table2] = scol
                     diff_df["Match"] = lcol == scol
                     print(
                         f"Column {i} different:\n"
                         f"{diff_df}"
                     )
                     break
             else:
                 print(f"Column {i} unique to {name_table1}: {lcol}")
                 break
     elif not diff_cols:
         # Same number of cols: compare rows since they're of the same length
         if diff_rows > 0:
             # Use the longest table as a reference
             table1, table2 = table2, table1
             name_table1, name_table2 = name_table2, name_table1
         for index, lrow in table1.iterrows():
             # Bound the lookup by the row count of the shorter table.
             if index < table2.shape[0]:
                 srow = table2.loc[index, :]
                 if not lrow.equals(srow):
                     # Stack both rows without relying on DataFrame.append,
                     # which recent pandas versions no longer provide.
                     diff_df = pd.DataFrame([lrow, srow]).reset_index(
                         drop=True)
                     diff_df.insert(0, 'Table', [name_table1, name_table2])
                     print(f"Row {index} differs:")
                     print(diff_df.values)
                     break
             else:
                 print(f"Row {index} unique to {name_table1}: {lrow}")
                 break
     else:
         print("Tables have different shapes")
--- a/tests/files/baseline_plots/test_grid_plot.png
+++ b/tests/files/baseline_plots/test_grid_plot.png
--- a/tests/files/baseline_plots/test_line_plot.png
+++ b/tests/files/baseline_plots/test_line_plot.png
--- a/tests/files/baseline_plots/test_stream_contour_plot.png
+++ b/tests/files/baseline_plots/test_stream_contour_plot.png
--- a/tests/files/baseline_plots/test_stream_grid_plot.png
+++ b/tests/files/baseline_plots/test_stream_grid_plot.png
--- a/tests/files/baseline_plots/test_text_plot.png
+++ b/tests/files/baseline_plots/test_text_plot.png
--- a/tests/files/baseline_plots/test_textedge_plot.png
+++ b/tests/files/baseline_plots/test_textedge_plot.png