Lint, refactor

2020-04-19 14:30:32 -07:00 · 2020-04-19 14:30:32 -07:00 · 20f18b478f
parent ff2ce6f47c
commit 20f18b478f
6 changed files with 47 additions and 66 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -4,7 +4,6 @@ import os
 import sqlite3
 import zipfile
 import tempfile
-from itertools import chain
 from operator import itemgetter

 import numpy as np
@@ -191,26 +190,26 @@ class TextEdges(object):

        table_areas = {}
        for te in relevant_textedges:
-                if not table_areas:
+            if not table_areas:
+                table_areas[(te.x, te.y0, te.x, te.y1)] = None
+            else:
+                found = None
+                for area in table_areas:
+                    # check for overlap
+                    if te.y1 >= area[1] and te.y0 <= area[3]:
+                        found = area
+                        break
+                if found is None:
                    table_areas[(te.x, te.y0, te.x, te.y1)] = None
                else:
-                    found = None
-                    for area in table_areas:
-                        # check for overlap
-                        if te.y1 >= area[1] and te.y0 <= area[3]:
-                            found = area
-                            break
-                    if found is None:
-                        table_areas[(te.x, te.y0, te.x, te.y1)] = None
-                    else:
-                        table_areas.pop(found)
-                        updated_area = (
-                            found[0],
-                            min(te.y0, found[1]),
-                            max(found[2], te.x),
-                            max(found[3], te.y1),
-                        )
-                        table_areas[updated_area] = None
+                    table_areas.pop(found)
+                    updated_area = (
+                        found[0],
+                        min(te.y0, found[1]),
+                        max(found[2], te.x),
+                        max(found[3], te.y1),
+                    )
+                    table_areas[updated_area] = None

        # extend table areas based on textlines that overlap
        # vertically. it's possible that these textlines were
@@ -736,17 +735,19 @@ class Table(object):
        """
        for f in copy_text:
            if f == "h":
-                for i in range(len(self.cells)):
-                    for j in range(len(self.cells[i])):
-                        if self.cells[i][j].text.strip() == "":
-                            if self.cells[i][j].hspan and not self.cells[i][j].left:
-                                self.cells[i][j].text = self.cells[i][j - 1].text
+                for i, row in enumerate(self.cells):
+                    for j, cell in enumerate(row):
+                        if cell.text.strip() == "" and \
+                           cell.hspan and \
+                           not cell.left:
+                            cell.text = self.cells[i][j - 1].text
            elif f == "v":
-                for i in range(len(self.cells)):
-                    for j in range(len(self.cells[i])):
-                        if self.cells[i][j].text.strip() == "":
-                            if self.cells[i][j].vspan and not self.cells[i][j].top:
-                                self.cells[i][j].text = self.cells[i - 1][j].text
+                for i, row in enumerate(self.cells):
+                    for j, cell in enumerate(row):
+                        if cell.text.strip() == "" and \
+                           cell.vspan and \
+                           not cell.top:
+                            cell.text = self.cells[i - 1][j].text
        return self


--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@@ -2,6 +2,7 @@

 import os
 import sys
+import logging

 from PyPDF2 import PdfFileReader, PdfFileWriter

@@ -16,6 +17,8 @@ from .utils import (
    download_url,
 )

+logger = logging.getLogger("camelot")
+
 PARSERS = {
    "lattice": Lattice,
    "stream": Stream
@@ -199,10 +202,13 @@ class PDFHandler(object):
                layout_kwargs=layout_kwargs
            )
            parser._generate_layout(source_file, layout, dimensions,
-                                page_idx, layout_kwargs)
+                                    page_idx, layout_kwargs)
+            rootname = os.path.basename(parser.rootname)
+            if not suppress_stdout:
+                logger.info(
+                    "Processing {rootname}".format(rootname=rootname))
            t = parser.extract_tables(
-                source_file,
-                suppress_stdout=suppress_stdout
+                source_file
            )
            tables.extend(t)
        return TableList(sorted(tables))
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@@ -12,7 +12,8 @@ from ..core import Table
 class BaseParser(object):
    """Defines a base parser.
    """
-    def __init__(self,
+    def __init__(
+        self,
        parser_id,
        table_regions=None,
        table_areas=None,
@@ -33,6 +34,7 @@ class BaseParser(object):

        self.flag_size = flag_size

+        self.rootname = None
        self.t_bbox = None

        # For plotting details of parsing algorithms
@@ -79,7 +81,6 @@ class BaseParser(object):
        table.order = table_idx + 1
        return table

-
    @staticmethod
    def _reduce_index(t, idx, shift_text):
        """Reduces index of a text object if it lies within a spanning
@@ -112,4 +113,3 @@ class BaseParser(object):
                    for r_idx, c_idx, text in indices:
                        table.cells[r_idx][c_idx].text = text
        return pos_errors
-
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -2,15 +2,9 @@

 from __future__ import division
 import os
-import sys
 import copy
-import locale
-import logging
 import warnings
-import subprocess

-import numpy as np
-import pandas as pd

 from .base import BaseParser
 from ..utils import (
@@ -21,8 +15,6 @@ from ..utils import (
    segments_in_bbox,
    text_in_bbox,
    merge_close_lines,
-    get_table_index,
-    compute_accuracy,
 )
 from ..image_processing import (
    adaptive_threshold,
@@ -32,9 +24,6 @@ from ..image_processing import (
 )


-logger = logging.getLogger("camelot")
-
-
 class Lattice(BaseParser):
    """Lattice method of parsing looks for lines between text
    to parse the table.
@@ -322,13 +311,8 @@ class Lattice(BaseParser):

        return table

-    def extract_tables(self, filename, suppress_stdout=False):
-        # FRHTODO: move extract table core to the base class
+    def extract_tables(self, filename):
        rootname = os.path.basename(self.rootname)
-        if not suppress_stdout:
-            logger.info(
-                "Processing {rootname}".format(rootname=rootname))
-
        if not self.horizontal_text:
            if self.images:
                warnings.warn(
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -2,19 +2,13 @@

 from __future__ import division
 import os
-import logging
 import warnings

 import numpy as np
-import pandas as pd

 from .base import BaseParser
 from ..core import TextEdges
-from ..utils import (text_in_bbox, compute_accuracy,
-                     compute_whitespace)
-
-
-logger = logging.getLogger("camelot")
+from ..utils import (text_in_bbox)


 class Stream(BaseParser):
@@ -432,11 +426,7 @@ class Stream(BaseParser):

        return table

-    def extract_tables(self, filename, suppress_stdout=False):
-        if not suppress_stdout:
-            logger.info("Processing {}".format(
-                os.path.basename(self.rootname)))
-
+    def extract_tables(self, filename):
        if not self.horizontal_text:
            if self.images:
                warnings.warn(
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -1044,14 +1044,14 @@ def compare_tables(left, right):
        differences.append(
            "{diff_rows} {more_fewer} rows".format(
                diff_rows=abs(diff_rows),
-                more_fewer='more' if diff_rows>0 else 'fewer'
+                more_fewer='more' if diff_rows > 0 else 'fewer'
            )
        )
    if (diff_cols):
        differences.append(
            "{diff_cols} {more_fewer} columns".format(
                diff_cols=abs(diff_cols),
-                more_fewer='more' if diff_cols>0 else 'fewer'
+                more_fewer='more' if diff_cols > 0 else 'fewer'
            )
        )
    if differences: