Linting

2020-04-25 22:47:23 -07:00 · 2020-04-25 22:47:23 -07:00 · e1572a10c9
parent f7aafcd05c
commit e1572a10c9
8 changed files with 141 additions and 124 deletions
--- a/camelot/cli.py
+++ b/camelot/cli.py
@ -18,7 +18,7 @@ logger = logging.getLogger("camelot")
 logger.setLevel(logging.INFO)


-class Config(object):
+class Config():
    def __init__(self):
        self.config = {}

--- a/camelot/core.py
+++ b/camelot/core.py
@ -31,7 +31,7 @@ VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
 ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS


-class TextAlignment(object):
+class TextAlignment():
    """Represents a list of textlines sharing an alignment on a coordinate.

    The alignment can be left/right/middle or top/bottom/center.
@ -137,7 +137,7 @@ class TextEdge(TextAlignment):
                self.is_valid = True


-class TextAlignments(object):
+class TextAlignments():
    """Defines a dict of text edges across reference alignments.
    """

@ -327,7 +327,7 @@ class TextEdges(TextAlignments):
        return table_areas_padded


-class Cell(object):
+class Cell():
    """Defines a cell in a table with coordinates relative to a
    left-bottom origin. (PDF coordinate space)

@ -409,7 +409,7 @@ class Cell(object):
        return self.top + self.bottom + self.left + self.right


-class Table(object):
+class Table():
    """Defines a table with coordinates relative to a left-bottom
    origin. (PDF coordinate space)

@ -815,7 +815,7 @@ class Table(object):
        return self


-class TableList(object):
+class TableList():
    """Defines a list of camelot.core.Table objects. Each table can
    be accessed using its index.

--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -19,24 +19,24 @@ from ..utils import (
 from ..core import Table


-class BaseParser(object):
+class BaseParser():
    """Defines a base parser.
    """
    def __init__(
-        self,
-        parser_id,
-        table_regions=None,
-        table_areas=None,
-        copy_text=None,
-        split_text=False,
-        strip_text="",
-        shift_text=None,
-        flag_size=False,
-        debug=False
-    ):
+            self,
+            parser_id,
+            table_regions=None,
+            table_areas=None,
+            copy_text=None,
+            split_text=False,
+            strip_text="",
+            shift_text=None,
+            flag_size=False,
+            debug=False):
        self.id = parser_id
        self.table_regions = table_regions
        self.table_areas = table_areas
+        self.table_bbox = {}

        self.copy_text = copy_text
        self.split_text = split_text
@ -49,7 +49,9 @@ class BaseParser(object):
        self.t_bbox = None

        # For plotting details of parsing algorithms
-        self.parse_details = {} if debug else None
+        self.parse_details = {}
+        if not debug:
+            self.parse_details = None

    def prepare_page_parse(self, filename, layout, dimensions,
                           page_idx, layout_kwargs):
@ -177,6 +179,18 @@ class BaseParser(object):
                        table.cells[r_idx][c_idx].text = text
        return pos_errors

+    def _generate_columns_and_rows(self, bbox, table_idx):
+        # Abstract hook: the base class only raises; derived parsers override
+        raise NotImplementedError()  # extract_tables unpacks 4 values from it
+
+    def _generate_table(self, table_idx, cols, rows, **kwargs):
+        # Abstract hook: the base class only raises; derived parsers override
+        raise NotImplementedError()
+
+    def _generate_table_bbox(self):
+        # Abstract hook: the base class only raises; derived parsers override
+        raise NotImplementedError()
+
    def extract_tables(self):
        if self._document_has_no_text():
            return []
@ -188,8 +202,12 @@ class BaseParser(object):
        _tables = []
        # sort tables based on y-coord
        for table_idx, bbox in enumerate(
-            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
-        ):
+                sorted(
+                        self.table_bbox.keys(),
+                        key=lambda x: x[1],
+                        reverse=True
+                    )
+                ):
            cols, rows, v_s, h_s = self._generate_columns_and_rows(
                bbox,
                table_idx
@ -232,20 +250,19 @@ class TextBaseParser(BaseParser):
    """

    def __init__(
-        self,
-        parser_id,
-        table_regions=None,
-        table_areas=None,
-        columns=None,
-        flag_size=False,
-        split_text=False,
-        strip_text="",
-        edge_tol=50,
-        row_tol=2,
-        column_tol=0,
-        debug=False,
-        **kwargs
-    ):
+            self,
+            parser_id,
+            table_regions=None,
+            table_areas=None,
+            columns=None,
+            flag_size=False,
+            split_text=False,
+            strip_text="",
+            edge_tol=50,
+            row_tol=2,
+            column_tol=0,
+            debug=False,
+            **kwargs):
        super().__init__(
            parser_id,
            table_regions=table_regions,
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -3,9 +3,9 @@

 from __future__ import division

-import numpy as np
 import copy
 import math
+import numpy as np

 from .base import TextBaseParser
 from ..core import (
@ -16,6 +16,7 @@ from ..core import (
 )
 from ..utils import (
    bbox_from_str,
+    expand_bbox_with_textline,
    text_in_bbox,
    bbox_from_textlines,
    distance_tl_to_bbox,
@ -25,6 +26,23 @@ from ..utils import (
 # maximum number of columns over which a header can spread
 MAX_COL_SPREAD_IN_HEADER = 3

+# Minimum number of textlines in a table
+MINIMUM_TEXTLINES_IN_TABLE = 6
+
+
+def column_spread(left, right, col_anchors):
+    """Get the number of columns crossed by a segment [left, right]."""
+    index_left = 0  # presumably col_anchors is sorted ascending; verify
+    while index_left < len(col_anchors) \
+            and col_anchors[index_left] < left:
+        index_left += 1  # skip leading anchors strictly below `left`
+    index_right = index_left  # second scan resumes where the first stopped
+    while index_right < len(col_anchors) \
+            and col_anchors[index_right] < right:
+        index_right += 1  # step over anchors strictly below `right`
+
+    return index_right - index_left  # anchors stepped over by second scan
+

 def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
    """Expand a bbox vertically up by looking for plausible headers.
@ -40,19 +58,6 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
    (left, bottom, right, top) = body_bbox
    zones = []

-    def column_spread(left, right, col_anchors):
-        """Get the number of columns crossed by a segment [left, right]."""
-        indexLeft = 0
-        while indexLeft < len(col_anchors) \
-                and col_anchors[indexLeft] < left:
-            indexLeft += 1
-        indexRight = indexLeft
-        while indexRight < len(col_anchors) \
-                and col_anchors[indexRight] < right:
-            indexRight += 1
-
-        return indexRight - indexLeft
-
    keep_searching = True
    while keep_searching:
        keep_searching = False
@ -127,9 +132,8 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
            # columns.
            # This is to avoid picking unrelated paragraphs.
            if max_spread <= min(
-                MAX_COL_SPREAD_IN_HEADER,
-                math.ceil(len(col_anchors) / 2)
-            ):
+                    MAX_COL_SPREAD_IN_HEADER,
+                    math.ceil(len(col_anchors) / 2)):
                # Combined, the elements we've identified don't cross more
                # than the authorized number of columns.
                # We're trying to avoid
@ -145,7 +149,7 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
    return new_bbox


-class AlignmentCounter(object):
+class AlignmentCounter():
    """
    For a given textline, represent all other textlines aligned with it.

@ -260,7 +264,7 @@ class TextNetworks(TextAlignments):
        removed_singletons = True
        while removed_singletons:
            removed_singletons = False
-            for alignment_id, textalignments in self._text_alignments.items():
+            for textalignments in self._text_alignments.values():
                # For each alignment edge, remove items if they are singletons
                # either horizontally or vertically
                for ta in textalignments:
@ -283,7 +287,7 @@ class TextNetworks(TextAlignments):
        return max(
            self._textline_to_alignments.keys(),
            key=lambda textline:
-                self._textline_to_alignments[textline].alignment_score(),
+            self._textline_to_alignments[textline].alignment_score(),
            default=None
        )

@ -308,8 +312,8 @@ class TextNetworks(TextAlignments):
        # Retrieve the list of textlines it's aligned with, across both
        # axis
        best_alignment = self._textline_to_alignments[most_aligned_tl]
-        ref_h_alignment_id, ref_h_textlines = best_alignment.max_h()
-        ref_v_alignment_id, ref_v_textlines = best_alignment.max_v()
+        __, ref_h_textlines = best_alignment.max_h()
+        __, ref_v_textlines = best_alignment.max_v()
        if len(ref_v_textlines) <= 1 or len(ref_h_textlines) <= 1:
            return None

@ -375,7 +379,6 @@ class TextNetworks(TextAlignments):
        else:
            parse_details_search = None

-        MINIMUM_TEXTLINES_IN_TABLE = 6
        bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
                most_aligned_tl.x1, most_aligned_tl.y1)

@ -402,12 +405,7 @@ class TextNetworks(TextAlignments):
                # if the textline is close.
                if h_distance < max_h_gap and v_distance < max_v_gap:
                    tls_in_bbox.append(tl)
-                    bbox = (
-                        min(bbox[0], tl.x0),
-                        min(bbox[1], tl.y0),
-                        max(bbox[2], tl.x1),
-                        max(bbox[3], tl.y1)
-                    )
+                    bbox = expand_bbox_with_textline(bbox, tl)
                    del tls_search_space[i]
        if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
            return bbox
@ -461,19 +459,18 @@ class Hybrid(TextBaseParser):
    """

    def __init__(
-        self,
-        table_regions=None,
-        table_areas=None,
-        columns=None,
-        flag_size=False,
-        split_text=False,
-        strip_text="",
-        edge_tol=None,
-        row_tol=2,
-        column_tol=0,
-        debug=False,
-        **kwargs
-    ):
+            self,
+            table_regions=None,
+            table_areas=None,
+            columns=None,
+            flag_size=False,
+            split_text=False,
+            strip_text="",
+            edge_tol=None,
+            row_tol=2,
+            column_tol=0,
+            debug=False,
+            **kwargs):
        super().__init__(
            "hybrid",
            table_regions=table_regions,
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -84,24 +84,23 @@ class Lattice(BaseParser):
    """

    def __init__(
-        self,
-        table_regions=None,
-        table_areas=None,
-        process_background=False,
-        line_scale=15,
-        copy_text=None,
-        shift_text=None,
-        split_text=False,
-        flag_size=False,
-        strip_text="",
-        line_tol=2,
-        joint_tol=2,
-        threshold_blocksize=15,
-        threshold_constant=-2,
-        iterations=0,
-        resolution=300,
-        **kwargs
-    ):
+            self,
+            table_regions=None,
+            table_areas=None,
+            process_background=False,
+            line_scale=15,
+            copy_text=None,
+            shift_text=None,
+            split_text=False,
+            flag_size=False,
+            strip_text="",
+            line_tol=2,
+            joint_tol=2,
+            threshold_blocksize=15,
+            threshold_constant=-2,
+            iterations=0,
+            resolution=300,
+            **kwargs):
        super().__init__(
            "lattice",
            table_regions=table_regions,
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -50,18 +50,17 @@ class Stream(TextBaseParser):
    """

    def __init__(
-        self,
-        table_regions=None,
-        table_areas=None,
-        columns=None,
-        flag_size=False,
-        split_text=False,
-        strip_text="",
-        edge_tol=50,
-        row_tol=2,
-        column_tol=0,
-        **kwargs
-    ):
+            self,
+            table_regions=None,
+            table_areas=None,
+            columns=None,
+            flag_size=False,
+            split_text=False,
+            strip_text="",
+            edge_tol=50,
+            row_tol=2,
+            column_tol=0,
+            **kwargs):
        super().__init__(
            "stream",
            table_regions=table_regions,
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -136,7 +136,7 @@ def prepare_plot(table, ax=None, to_pdf_scale=True):
    return ax


-class PlotMethods(object):
+class PlotMethods():
    def __call__(self, table, kind="text", filename=None, ax=None):
        """Plot elements found on PDF page based on kind
        specified, useful for debugging and playing with different
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -156,7 +156,7 @@ def remove_extra(kwargs, flavor="lattice"):

 # https://stackoverflow.com/a/22726782
 # and https://stackoverflow.com/questions/10965479
-class TemporaryDirectory(object):
+class TemporaryDirectory():
    def __enter__(self):
        self.name = tempfile.mkdtemp()
        # Only delete the temporary directory upon
@ -488,6 +488,17 @@ def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
    return t_bbox


+def expand_bbox_with_textline(bbox, textline):
+    """Expand (if needed) a bbox so that it fits the parameter textline.
+    Returns a new 4-tuple; reads textline.x0, .y0, .x1 and .y1."""
+    return (
+        min(bbox[0], textline.x0),  # lower x bound
+        min(bbox[1], textline.y0),  # lower y bound
+        max(bbox[2], textline.x1),  # upper x bound
+        max(bbox[3], textline.y1)  # upper y bound
+    )
+
+
 def bbox_from_textlines(textlines):
    """Returns the smallest bbox containing all the text objects passed as
    a parameters.
@ -514,12 +525,7 @@ def bbox_from_textlines(textlines):
    )

    for tl in textlines[1:]:
-        bbox = (
-            min(bbox[0], tl.x0),
-            min(bbox[1], tl.y0),
-            max(bbox[2], tl.x1),
-            max(bbox[3], tl.y1)
-        )
+        bbox = expand_bbox_with_textline(bbox, tl)
    return bbox


@ -1039,13 +1045,12 @@ def compute_whitespace(d):


 def get_page_layout(
-    filename,
-    char_margin=1.0,
-    line_margin=0.5,
-    word_margin=0.1,
-    detect_vertical=True,
-    all_texts=True,
-):
+        filename,
+        char_margin=1.0,
+        line_margin=0.5,
+        word_margin=0.1,
+        detect_vertical=True,
+        all_texts=True):
    """Returns a PDFMiner LTPage object and page dimension of a single
    page pdf. See https://euske.github.io/pdfminer/ to get definitions
    of kwargs.
@ -1163,14 +1168,14 @@ def compare_tables(left, right):
    diff_cols = right.shape[1]-left.shape[1]
    diff_rows = right.shape[0]-left.shape[0]
    differences = []
-    if (diff_rows):
+    if diff_rows:
        differences.append(
            "{diff_rows} {more_fewer} rows".format(
                diff_rows=abs(diff_rows),
                more_fewer='more' if diff_rows > 0 else 'fewer'
            )
        )
-    if (diff_cols):
+    if diff_cols:
        differences.append(
            "{diff_cols} {more_fewer} columns".format(
                diff_cols=abs(diff_cols),