Lint, refactor

2020-04-19 14:30:32 -07:00 · 2020-04-19 14:30:32 -07:00 · 50f11867af
parent cff7a9698b
commit 50f11867af
6 changed files with 47 additions and 66 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -4,7 +4,6 @@ import os
 import sqlite3
 import zipfile
 import tempfile
 from itertools import chain
 from operator import itemgetter
 import numpy as np
@@ -191,26 +190,26 @@ class TextEdges(object):
        table_areas = {}
        for te in relevant_textedges:
-                if not table_areas:
+            if not table_areas:
                table_areas[(te.x, te.y0, te.x, te.y1)] = None
            else:
                found = None
                for area in table_areas:
                    # check for overlap
                    if te.y1 >= area[1] and te.y0 <= area[3]:
                        found = area
                        break
                if found is None:
                    table_areas[(te.x, te.y0, te.x, te.y1)] = None
                else:
-                    found = None
+                    table_areas.pop(found)
-                    for area in table_areas:
+                    updated_area = (
-                        # check for overlap
+                        found[0],
-                        if te.y1 >= area[1] and te.y0 <= area[3]:
+                        min(te.y0, found[1]),
-                            found = area
+                        max(found[2], te.x),
-                            break
+                        max(found[3], te.y1),
-                    if found is None:
+                    )
-                        table_areas[(te.x, te.y0, te.x, te.y1)] = None
+                    table_areas[updated_area] = None
                    else:
                        table_areas.pop(found)
                        updated_area = (
                            found[0],
                            min(te.y0, found[1]),
                            max(found[2], te.x),
                            max(found[3], te.y1),
                        )
                        table_areas[updated_area] = None
        # extend table areas based on textlines that overlap
        # vertically. it's possible that these textlines were
@@ -736,17 +735,19 @@ class Table(object):
        """
        for f in copy_text:
            if f == "h":
-                for i in range(len(self.cells)):
+                for i, row in enumerate(self.cells):
-                    for j in range(len(self.cells[i])):
+                    for j, cell in enumerate(row):
-                        if self.cells[i][j].text.strip() == "":
+                        if cell.text.strip() == "" and \
-                            if self.cells[i][j].hspan and not self.cells[i][j].left:
+                           cell.hspan and \
-                                self.cells[i][j].text = self.cells[i][j - 1].text
+                           not cell.left:
                            cell.text = self.cells[i][j - 1].text
            elif f == "v":
-                for i in range(len(self.cells)):
+                for i, row in enumerate(self.cells):
-                    for j in range(len(self.cells[i])):
+                    for j, cell in enumerate(row):
-                        if self.cells[i][j].text.strip() == "":
+                        if cell.text.strip() == "" and \
-                            if self.cells[i][j].vspan and not self.cells[i][j].top:
+                           cell.vspan and \
-                                self.cells[i][j].text = self.cells[i - 1][j].text
+                           not cell.top:
                            cell.text = self.cells[i - 1][j].text
        return self
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@@ -2,6 +2,7 @@
 import os
 import sys
 import logging
 from PyPDF2 import PdfFileReader, PdfFileWriter
@@ -16,6 +17,8 @@ from .utils import (
    download_url,
 )
 logger = logging.getLogger("camelot")
 PARSERS = {
    "lattice": Lattice,
    "stream": Stream
@@ -199,10 +202,13 @@ class PDFHandler(object):
                layout_kwargs=layout_kwargs
            )
            parser._generate_layout(source_file, layout, dimensions,
-                                page_idx, layout_kwargs)
+                                    page_idx, layout_kwargs)
            rootname = os.path.basename(parser.rootname)
            if not suppress_stdout:
                logger.info(
                    "Processing {rootname}".format(rootname=rootname))
            t = parser.extract_tables(
-                source_file,
+                source_file
                suppress_stdout=suppress_stdout
            )
            tables.extend(t)
        return TableList(sorted(tables))
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@@ -12,7 +12,8 @@ from ..core import Table
 class BaseParser(object):
    """Defines a base parser.
    """
-    def __init__(self,
+    def __init__(
        self,
        parser_id,
        table_regions=None,
        table_areas=None,
@@ -33,6 +34,7 @@ class BaseParser(object):
        self.flag_size = flag_size
        self.rootname = None
        self.t_bbox = None
        # For plotting details of parsing algorithms
@@ -79,7 +81,6 @@ class BaseParser(object):
        table.order = table_idx + 1
        return table
    @staticmethod
    def _reduce_index(t, idx, shift_text):
        """Reduces index of a text object if it lies within a spanning
@@ -112,4 +113,3 @@ class BaseParser(object):
                    for r_idx, c_idx, text in indices:
                        table.cells[r_idx][c_idx].text = text
        return pos_errors
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -2,15 +2,9 @@
 from __future__ import division
 import os
 import sys
 import copy
 import locale
 import logging
 import warnings
 import subprocess
 import numpy as np
 import pandas as pd
 from .base import BaseParser
 from ..utils import (
@@ -21,8 +15,6 @@ from ..utils import (
    segments_in_bbox,
    text_in_bbox,
    merge_close_lines,
    get_table_index,
    compute_accuracy,
 )
 from ..image_processing import (
    adaptive_threshold,
@@ -32,9 +24,6 @@ from ..image_processing import (
 )
 logger = logging.getLogger("camelot")
 class Lattice(BaseParser):
    """Lattice method of parsing looks for lines between text
    to parse the table.
@@ -322,13 +311,8 @@ class Lattice(BaseParser):
        return table
-    def extract_tables(self, filename, suppress_stdout=False):
+    def extract_tables(self, filename):
        # FRHTODO: move extract table core to the base class
        rootname = os.path.basename(self.rootname)
        if not suppress_stdout:
            logger.info(
                "Processing {rootname}".format(rootname=rootname))
        if not self.horizontal_text:
            if self.images:
                warnings.warn(
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -2,19 +2,13 @@
 from __future__ import division
 import os
 import logging
 import warnings
 import numpy as np
 import pandas as pd
 from .base import BaseParser
 from ..core import TextEdges
-from ..utils import (text_in_bbox, compute_accuracy,
+from ..utils import (text_in_bbox)
                     compute_whitespace)
 logger = logging.getLogger("camelot")
 class Stream(BaseParser):
@@ -432,11 +426,7 @@ class Stream(BaseParser):
        return table
-    def extract_tables(self, filename, suppress_stdout=False):
+    def extract_tables(self, filename):
        if not suppress_stdout:
            logger.info("Processing {}".format(
                os.path.basename(self.rootname)))
        if not self.horizontal_text:
            if self.images:
                warnings.warn(
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -1044,14 +1044,14 @@ def compare_tables(left, right):
        differences.append(
            "{diff_rows} {more_fewer} rows".format(
                diff_rows=abs(diff_rows),
-                more_fewer='more' if diff_rows>0 else 'fewer'
+                more_fewer='more' if diff_rows > 0 else 'fewer'
            )
        )
    if (diff_cols):
        differences.append(
            "{diff_cols} {more_fewer} columns".format(
                diff_cols=abs(diff_cols),
-                more_fewer='more' if diff_cols>0 else 'fewer'
+                more_fewer='more' if diff_cols > 0 else 'fewer'
            )
        )
    if differences: