More refactoring / linting

2020-04-19 15:41:45 -07:00 · 2020-04-19 15:41:45 -07:00 · 58823e57e9
parent d673a3b6e0
commit 58823e57e9
4 changed files with 53 additions and 32 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -431,6 +431,12 @@ class Table(object):
        self.whitespace = compute_whitespace(data)
        self.pdf_size = (parser.pdf_width, parser.pdf_height)

+        _text = []
+        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.horizontal_text])
+        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.vertical_text])
+        self._text = _text
+
+
    def get_pdf_image(self):
        """Compute pdf image and cache it
        """
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -12,7 +12,7 @@ from ..utils import (
    scale_image,
    scale_pdf,
    segments_in_bbox,
-    text_in_bbox,
+    text_in_bbox_per_axis,
    merge_close_lines,
 )
 from ..image_processing import (
@@ -252,19 +252,17 @@ class Lattice(BaseParser):
            table_bbox, vertical_segments, horizontal_segments, pdf_scalers
        )

-    def _generate_columns_and_rows(self, table_idx, tk):
+
+    def _generate_columns_and_rows(self, tk):
        # select elements which lie within table_bbox
-        t_bbox = {}
        v_s, h_s = segments_in_bbox(
            tk, self.vertical_segments, self.horizontal_segments
        )
-        t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
-        t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
-
-        t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
-        t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
-
-        self.t_bbox = t_bbox
+        self.t_bbox = text_in_bbox_per_axis(
+            tk,
+            self.horizontal_text,
+            self.vertical_text
+            )

        cols, rows = zip(*self.table_bbox[tk])
        cols, rows = list(cols), list(rows)
@@ -299,10 +297,6 @@ class Lattice(BaseParser):
        table.record_parse_metadata(self)

        # for plotting
-        _text = []
-        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
-        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
-        table._text = _text
        table._image = self.pdf_image  # Reuse the image used for calc
        table._bbox_unscaled = self.table_bbox_unscaled
        table._segments = (self.vertical_segments, self.horizontal_segments)
@@ -321,8 +315,7 @@ class Lattice(BaseParser):
        for table_idx, tk in enumerate(
            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
        ):
-            cols, rows, v_s, h_s = self._generate_columns_and_rows(
-                table_idx, tk)
+            cols, rows, v_s, h_s = self._generate_columns_and_rows(tk)
            table = self._generate_table(
                table_idx, cols, rows, v_s=v_s, h_s=h_s)
            table._bbox = tk
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -7,7 +7,7 @@ import numpy as np

 from .base import BaseParser
 from ..core import TextEdges
-from ..utils import (text_in_bbox)
+from ..utils import (text_in_bbox, text_in_bbox_per_axis)


 class Stream(BaseParser):
@@ -331,14 +331,11 @@ class Stream(BaseParser):

    def _generate_columns_and_rows(self, table_idx, tk):
        # select elements which lie within table_bbox
-        t_bbox = {}
-        t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
-        t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
-
-        t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
-        t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
-
-        self.t_bbox = t_bbox
+        self.t_bbox = text_in_bbox_per_axis(
+            tk,
+            self.horizontal_text,
+            self.vertical_text
+        )

        text_x_min, text_y_min, text_x_max, text_y_max = \
            self._text_bbox(self.t_bbox)
@@ -415,10 +412,6 @@ class Stream(BaseParser):
        table.record_parse_metadata(self)

        # for plotting
-        _text = []
-        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
-        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
-        table._text = _text
        table._bbox = self.table_bbox
        table._segments = None
        table._textedges = self.textedges
@@ -435,12 +428,12 @@ class Stream(BaseParser):

        _tables = []
        # sort tables based on y-coord
-        for table_idx, tk in enumerate(
+        for table_idx, bbox in enumerate(
            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
        ):
-            cols, rows = self._generate_columns_and_rows(table_idx, tk)
+            cols, rows = self._generate_columns_and_rows(table_idx, bbox)
            table = self._generate_table(table_idx, cols, rows)
-            table._bbox = tk
+            table._bbox = bbox
            _tables.append(table)

        return _tables
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -417,6 +417,35 @@ def text_in_bbox(bbox, text):
    return t_bbox


+def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
+    """Returns all text objects present inside a bounding box, split between
+    horizontal and vertical text.
+
+    Parameters
+    ----------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
+        space.
+    horizontal_text : List of PDFMiner text objects.
+    vertical_text : List of PDFMiner text objects.
+
+    Returns
+    -------
+    t_bbox : dict
+        Dict of lists of PDFMiner text objects that lie inside table, with one
+        key each for "horizontal" and "vertical"
+
+    """
+    t_bbox = {}
+    t_bbox["horizontal"] = text_in_bbox(bbox, horizontal_text)
+    t_bbox["vertical"] = text_in_bbox(bbox, vertical_text)
+
+    t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
+    t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
+    return t_bbox
+
+
 def bbox_from_text(textlines):
    """Returns the smallest bbox containing all the text objects passed as
    a parameters.