From e1572a10c908dfa411bc204fde3d69867456cf18 Mon Sep 17 00:00:00 2001 From: Frh Date: Sat, 25 Apr 2020 22:47:23 -0700 Subject: [PATCH] Linting --- camelot/cli.py | 2 +- camelot/core.py | 10 ++--- camelot/parsers/base.py | 75 +++++++++++++++++++++-------------- camelot/parsers/hybrid.py | 81 ++++++++++++++++++-------------------- camelot/parsers/lattice.py | 35 ++++++++-------- camelot/parsers/stream.py | 23 ++++++----- camelot/plotting.py | 2 +- camelot/utils.py | 37 +++++++++-------- 8 files changed, 141 insertions(+), 124 deletions(-) diff --git a/camelot/cli.py b/camelot/cli.py index e276f01..1e85b30 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -18,7 +18,7 @@ logger = logging.getLogger("camelot") logger.setLevel(logging.INFO) -class Config(object): +class Config(): def __init__(self): self.config = {} diff --git a/camelot/core.py b/camelot/core.py index 9263628..0cd7fa6 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -31,7 +31,7 @@ VERTICAL_ALIGNMENTS = ["top", "bottom", "center"] ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS -class TextAlignment(object): +class TextAlignment(): """Represents a list of textlines sharing an alignment on a coordinate. The alignment can be left/right/middle or top/bottom/center. @@ -137,7 +137,7 @@ class TextEdge(TextAlignment): self.is_valid = True -class TextAlignments(object): +class TextAlignments(): """Defines a dict of text edges across reference alignments. """ @@ -327,7 +327,7 @@ class TextEdges(TextAlignments): return table_areas_padded -class Cell(object): +class Cell(): """Defines a cell in a table with coordinates relative to a left-bottom origin. (PDF coordinate space) @@ -409,7 +409,7 @@ class Cell(object): return self.top + self.bottom + self.left + self.right -class Table(object): +class Table(): """Defines a table with coordinates relative to a left-bottom origin. (PDF coordinate space) @@ -815,7 +815,7 @@ class Table(object): return self -class TableList(object): +class TableList(): """Defines a list of camelot.core.Table objects. Each table can be accessed using its index. diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index 6816b62..4c18d77 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -19,24 +19,24 @@ from ..utils import ( from ..core import Table -class BaseParser(object): +class BaseParser(): """Defines a base parser. """ def __init__( - self, - parser_id, - table_regions=None, - table_areas=None, - copy_text=None, - split_text=False, - strip_text="", - shift_text=None, - flag_size=False, - debug=False - ): + self, + parser_id, + table_regions=None, + table_areas=None, + copy_text=None, + split_text=False, + strip_text="", + shift_text=None, + flag_size=False, + debug=False): self.id = parser_id self.table_regions = table_regions self.table_areas = table_areas + self.table_bbox = {} self.copy_text = copy_text self.split_text = split_text @@ -49,7 +49,9 @@ class BaseParser(object): self.t_bbox = None # For plotting details of parsing algorithms - self.parse_details = {} if debug else None + self.parse_details = {} + if not debug: + self.parse_details = None def prepare_page_parse(self, filename, layout, dimensions, page_idx, layout_kwargs): @@ -177,6 +179,18 @@ class BaseParser(object): table.cells[r_idx][c_idx].text = text return pos_errors + def _generate_columns_and_rows(self, bbox, table_idx): + # Pure virtual, must be defined by the derived parser + raise NotImplementedError() + + def _generate_table(self, table_idx, cols, rows, **kwargs): + # Pure virtual, must be defined by the derived parser + raise NotImplementedError() + + def _generate_table_bbox(self): + # Pure virtual, must be defined by the derived parser + raise NotImplementedError() + def extract_tables(self): if self._document_has_no_text(): return [] @@ -188,8 +202,12 @@ class BaseParser(object): _tables = [] # sort tables based on y-coord for table_idx, bbox in enumerate( - sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) - ): + sorted( + self.table_bbox.keys(), + key=lambda x: x[1], + reverse=True + ) + ): cols, rows, v_s, h_s = self._generate_columns_and_rows( bbox, table_idx @@ -232,20 +250,19 @@ class TextBaseParser(BaseParser): """ def __init__( - self, - parser_id, - table_regions=None, - table_areas=None, - columns=None, - flag_size=False, - split_text=False, - strip_text="", - edge_tol=50, - row_tol=2, - column_tol=0, - debug=False, - **kwargs - ): + self, + parser_id, + table_regions=None, + table_areas=None, + columns=None, + flag_size=False, + split_text=False, + strip_text="", + edge_tol=50, + row_tol=2, + column_tol=0, + debug=False, + **kwargs): super().__init__( parser_id, table_regions=table_regions, diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index 6c399ef..bff0d58 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -3,9 +3,9 @@ from __future__ import division -import numpy as np import copy import math +import numpy as np from .base import TextBaseParser from ..core import ( @@ -16,6 +16,7 @@ from ..core import ( ) from ..utils import ( bbox_from_str, + expand_bbox_with_textline, text_in_bbox, bbox_from_textlines, distance_tl_to_bbox, @@ -25,6 +26,23 @@ from ..utils import ( # maximum number of columns over which a header can spread MAX_COL_SPREAD_IN_HEADER = 3 +# Minimum number of textlines in a table +MINIMUM_TEXTLINES_IN_TABLE = 6 + + +def column_spread(left, right, col_anchors): + """Get the number of columns crossed by a segment [left, right].""" + index_left = 0 + while index_left < len(col_anchors) \ + and col_anchors[index_left] < left: + index_left += 1 + index_right = index_left + while index_right < len(col_anchors) \ + and col_anchors[index_right] < right: + index_right += 1 + + return index_right - index_left + def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): """Expand a bbox vertically up by looking for plausible headers. @@ -40,19 +58,6 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): (left, bottom, right, top) = body_bbox zones = [] - def column_spread(left, right, col_anchors): - """Get the number of columns crossed by a segment [left, right].""" - indexLeft = 0 - while indexLeft < len(col_anchors) \ - and col_anchors[indexLeft] < left: - indexLeft += 1 - indexRight = indexLeft - while indexRight < len(col_anchors) \ - and col_anchors[indexRight] < right: - indexRight += 1 - - return indexRight - indexLeft - keep_searching = True while keep_searching: keep_searching = False @@ -127,9 +132,8 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): # columns. # This is to avoid picking unrelated paragraphs. if max_spread <= min( - MAX_COL_SPREAD_IN_HEADER, - math.ceil(len(col_anchors) / 2) - ): + MAX_COL_SPREAD_IN_HEADER, + math.ceil(len(col_anchors) / 2)): # Combined, the elements we've identified don't cross more # than the authorized number of columns. # We're trying to avoid @@ -145,7 +149,7 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): return new_bbox -class AlignmentCounter(object): +class AlignmentCounter(): """ For a given textline, represent all other textlines aligned with it. @@ -260,7 +264,7 @@ class TextNetworks(TextAlignments): removed_singletons = True while removed_singletons: removed_singletons = False - for alignment_id, textalignments in self._text_alignments.items(): + for textalignments in self._text_alignments.values(): # For each alignment edge, remove items if they are singletons # either horizontally or vertically for ta in textalignments: @@ -283,7 +287,7 @@ class TextNetworks(TextAlignments): return max( self._textline_to_alignments.keys(), key=lambda textline: - self._textline_to_alignments[textline].alignment_score(), + self._textline_to_alignments[textline].alignment_score(), default=None ) @@ -308,8 +312,8 @@ class TextNetworks(TextAlignments): # Retrieve the list of textlines it's aligned with, across both # axis best_alignment = self._textline_to_alignments[most_aligned_tl] - ref_h_alignment_id, ref_h_textlines = best_alignment.max_h() - ref_v_alignment_id, ref_v_textlines = best_alignment.max_v() + __, ref_h_textlines = best_alignment.max_h() + __, ref_v_textlines = best_alignment.max_v() if len(ref_v_textlines) <= 1 or len(ref_h_textlines) <= 1: return None @@ -375,7 +379,6 @@ class TextNetworks(TextAlignments): else: parse_details_search = None - MINIMUM_TEXTLINES_IN_TABLE = 6 bbox = (most_aligned_tl.x0, most_aligned_tl.y0, most_aligned_tl.x1, most_aligned_tl.y1) @@ -402,12 +405,7 @@ class TextNetworks(TextAlignments): # if the textline is close. if h_distance < max_h_gap and v_distance < max_v_gap: tls_in_bbox.append(tl) - bbox = ( - min(bbox[0], tl.x0), - min(bbox[1], tl.y0), - max(bbox[2], tl.x1), - max(bbox[3], tl.y1) - ) + bbox = expand_bbox_with_textline(bbox, tl) del tls_search_space[i] if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE: return bbox @@ -461,19 +459,18 @@ class Hybrid(TextBaseParser): """ def __init__( - self, - table_regions=None, - table_areas=None, - columns=None, - flag_size=False, - split_text=False, - strip_text="", - edge_tol=None, - row_tol=2, - column_tol=0, - debug=False, - **kwargs - ): + self, + table_regions=None, + table_areas=None, + columns=None, + flag_size=False, + split_text=False, + strip_text="", + edge_tol=None, + row_tol=2, + column_tol=0, + debug=False, + **kwargs): super().__init__( "hybrid", table_regions=table_regions, diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index b8b82ed..84ce5a2 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -84,24 +84,23 @@ class Lattice(BaseParser): """ def __init__( - self, - table_regions=None, - table_areas=None, - process_background=False, - line_scale=15, - copy_text=None, - shift_text=None, - split_text=False, - flag_size=False, - strip_text="", - line_tol=2, - joint_tol=2, - threshold_blocksize=15, - threshold_constant=-2, - iterations=0, - resolution=300, - **kwargs - ): + self, + table_regions=None, + table_areas=None, + process_background=False, + line_scale=15, + copy_text=None, + shift_text=None, + split_text=False, + flag_size=False, + strip_text="", + line_tol=2, + joint_tol=2, + threshold_blocksize=15, + threshold_constant=-2, + iterations=0, + resolution=300, + **kwargs): super().__init__( "lattice", table_regions=table_regions, diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 8b72e09..988490f 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -50,18 +50,17 @@ class Stream(TextBaseParser): """ def __init__( - self, - table_regions=None, - table_areas=None, - columns=None, - flag_size=False, - split_text=False, - strip_text="", - edge_tol=50, - row_tol=2, - column_tol=0, - **kwargs - ): + self, + table_regions=None, + table_areas=None, + columns=None, + flag_size=False, + split_text=False, + strip_text="", + edge_tol=50, + row_tol=2, + column_tol=0, + **kwargs): super().__init__( "stream", table_regions=table_regions, diff --git a/camelot/plotting.py b/camelot/plotting.py index 12ba457..d3d7064 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -136,7 +136,7 @@ def prepare_plot(table, ax=None, to_pdf_scale=True): return ax -class PlotMethods(object): +class PlotMethods(): def __call__(self, table, kind="text", filename=None, ax=None): """Plot elements found on PDF page based on kind specified, useful for debugging and playing with different diff --git a/camelot/utils.py b/camelot/utils.py index a675580..cf85eb1 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -156,7 +156,7 @@ def remove_extra(kwargs, flavor="lattice"): # https://stackoverflow.com/a/22726782 # and https://stackoverflow.com/questions/10965479 -class TemporaryDirectory(object): +class TemporaryDirectory(): def __enter__(self): self.name = tempfile.mkdtemp() # Only delete the temporary directory upon @@ -488,6 +488,17 @@ def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text): return t_bbox +def expand_bbox_with_textline(bbox, textline): + """Expand (if needed) a bbox so that it fits the parameter textline. + """ + return ( + min(bbox[0], textline.x0), + min(bbox[1], textline.y0), + max(bbox[2], textline.x1), + max(bbox[3], textline.y1) + ) + + def bbox_from_textlines(textlines): """Returns the smallest bbox containing all the text objects passed as a parameters. @@ -514,12 +525,7 @@ def bbox_from_textlines(textlines): ) for tl in textlines[1:]: - bbox = ( - min(bbox[0], tl.x0), - min(bbox[1], tl.y0), - max(bbox[2], tl.x1), - max(bbox[3], tl.y1) - ) + bbox = expand_bbox_with_textline(bbox, tl) return bbox @@ -1039,13 +1045,12 @@ def compute_whitespace(d): def get_page_layout( - filename, - char_margin=1.0, - line_margin=0.5, - word_margin=0.1, - detect_vertical=True, - all_texts=True, -): + filename, + char_margin=1.0, + line_margin=0.5, + word_margin=0.1, + detect_vertical=True, + all_texts=True): """Returns a PDFMiner LTPage object and page dimension of a single page pdf. See https://euske.github.io/pdfminer/ to get definitions of kwargs. @@ -1163,14 +1168,14 @@ def compare_tables(left, right): diff_cols = right.shape[1]-left.shape[1] diff_rows = right.shape[0]-left.shape[0] differences = [] - if (diff_rows): + if diff_rows: differences.append( "{diff_rows} {more_fewer} rows".format( diff_rows=abs(diff_rows), more_fewer='more' if diff_rows > 0 else 'fewer' ) ) - if (diff_cols): + if diff_cols: differences.append( "{diff_cols} {more_fewer} columns".format( diff_cols=abs(diff_cols),