From 20f18b478f59524d154b220b0a77c06cf2da8a2a Mon Sep 17 00:00:00 2001 From: Frh Date: Sun, 19 Apr 2020 14:30:32 -0700 Subject: [PATCH] Lint, refactor --- camelot/core.py | 59 +++++++++++++++++++------------------- camelot/handlers.py | 12 ++++++-- camelot/parsers/base.py | 6 ++-- camelot/parsers/lattice.py | 18 +----------- camelot/parsers/stream.py | 14 ++------- camelot/utils.py | 4 +-- 6 files changed, 47 insertions(+), 66 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index 94d49e9..5712e65 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -4,7 +4,6 @@ import os import sqlite3 import zipfile import tempfile -from itertools import chain from operator import itemgetter import numpy as np @@ -191,26 +190,26 @@ class TextEdges(object): table_areas = {} for te in relevant_textedges: - if not table_areas: + if not table_areas: + table_areas[(te.x, te.y0, te.x, te.y1)] = None + else: + found = None + for area in table_areas: + # check for overlap + if te.y1 >= area[1] and te.y0 <= area[3]: + found = area + break + if found is None: table_areas[(te.x, te.y0, te.x, te.y1)] = None else: - found = None - for area in table_areas: - # check for overlap - if te.y1 >= area[1] and te.y0 <= area[3]: - found = area - break - if found is None: - table_areas[(te.x, te.y0, te.x, te.y1)] = None - else: - table_areas.pop(found) - updated_area = ( - found[0], - min(te.y0, found[1]), - max(found[2], te.x), - max(found[3], te.y1), - ) - table_areas[updated_area] = None + table_areas.pop(found) + updated_area = ( + found[0], + min(te.y0, found[1]), + max(found[2], te.x), + max(found[3], te.y1), + ) + table_areas[updated_area] = None # extend table areas based on textlines that overlap # vertically. it's possible that these textlines were @@ -736,17 +735,19 @@ class Table(object): """ for f in copy_text: if f == "h": - for i in range(len(self.cells)): - for j in range(len(self.cells[i])): - if self.cells[i][j].text.strip() == "": - if self.cells[i][j].hspan and not self.cells[i][j].left: - self.cells[i][j].text = self.cells[i][j - 1].text + for i, row in enumerate(self.cells): + for j, cell in enumerate(row): + if cell.text.strip() == "" and \ + cell.hspan and \ + not cell.left: + cell.text = self.cells[i][j - 1].text elif f == "v": - for i in range(len(self.cells)): - for j in range(len(self.cells[i])): - if self.cells[i][j].text.strip() == "": - if self.cells[i][j].vspan and not self.cells[i][j].top: - self.cells[i][j].text = self.cells[i - 1][j].text + for i, row in enumerate(self.cells): + for j, cell in enumerate(row): + if cell.text.strip() == "" and \ + cell.vspan and \ + not cell.top: + cell.text = self.cells[i - 1][j].text return self diff --git a/camelot/handlers.py b/camelot/handlers.py index 7a9f2ff..64b6197 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -2,6 +2,7 @@ import os import sys +import logging from PyPDF2 import PdfFileReader, PdfFileWriter @@ -16,6 +17,8 @@ from .utils import ( download_url, ) +logger = logging.getLogger("camelot") + PARSERS = { "lattice": Lattice, "stream": Stream @@ -199,10 +202,13 @@ class PDFHandler(object): layout_kwargs=layout_kwargs ) parser._generate_layout(source_file, layout, dimensions, - page_idx, layout_kwargs) + page_idx, layout_kwargs) + rootname = os.path.basename(parser.rootname) + if not suppress_stdout: + logger.info( + "Processing {rootname}".format(rootname=rootname)) t = parser.extract_tables( - source_file, - suppress_stdout=suppress_stdout + source_file ) tables.extend(t) return TableList(sorted(tables)) diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index 19deceb..b364f04 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -12,7 +12,8 @@ from ..core import Table class BaseParser(object): """Defines a base parser. """ - def __init__(self, + def __init__( + self, parser_id, table_regions=None, table_areas=None, @@ -33,6 +34,7 @@ class BaseParser(object): self.flag_size = flag_size + self.rootname = None self.t_bbox = None # For plotting details of parsing algorithms @@ -79,7 +81,6 @@ class BaseParser(object): table.order = table_idx + 1 return table - @staticmethod def _reduce_index(t, idx, shift_text): """Reduces index of a text object if it lies within a spanning @@ -112,4 +113,3 @@ class BaseParser(object): for r_idx, c_idx, text in indices: table.cells[r_idx][c_idx].text = text return pos_errors - diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index c0f3e9b..cefc27f 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -2,15 +2,9 @@ from __future__ import division import os -import sys import copy -import locale -import logging import warnings -import subprocess -import numpy as np -import pandas as pd from .base import BaseParser from ..utils import ( @@ -21,8 +15,6 @@ from ..utils import ( segments_in_bbox, text_in_bbox, merge_close_lines, - get_table_index, - compute_accuracy, ) from ..image_processing import ( adaptive_threshold, @@ -32,9 +24,6 @@ from ..image_processing import ( ) -logger = logging.getLogger("camelot") - - class Lattice(BaseParser): """Lattice method of parsing looks for lines between text to parse the table. @@ -322,13 +311,8 @@ class Lattice(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False): - # FRHTODO: move extract table core to the base class + def extract_tables(self, filename): rootname = os.path.basename(self.rootname) - if not suppress_stdout: - logger.info( - "Processing {rootname}".format(rootname=rootname)) - if not self.horizontal_text: if self.images: warnings.warn( diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 2df3093..351d7d3 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -2,19 +2,13 @@ from __future__ import division import os -import logging import warnings import numpy as np -import pandas as pd from .base import BaseParser from ..core import TextEdges -from ..utils import (text_in_bbox, compute_accuracy, - compute_whitespace) - - -logger = logging.getLogger("camelot") +from ..utils import (text_in_bbox) class Stream(BaseParser): @@ -432,11 +426,7 @@ class Stream(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False): - if not suppress_stdout: - logger.info("Processing {}".format( - os.path.basename(self.rootname))) - + def extract_tables(self, filename): if not self.horizontal_text: if self.images: warnings.warn( diff --git a/camelot/utils.py b/camelot/utils.py index cc4a58c..2c66e7c 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1044,14 +1044,14 @@ def compare_tables(left, right): differences.append( "{diff_rows} {more_fewer} rows".format( diff_rows=abs(diff_rows), - more_fewer='more' if diff_rows>0 else 'fewer' + more_fewer='more' if diff_rows > 0 else 'fewer' ) ) if (diff_cols): differences.append( "{diff_cols} {more_fewer} columns".format( diff_cols=abs(diff_cols), - more_fewer='more' if diff_cols>0 else 'fewer' + more_fewer='more' if diff_cols > 0 else 'fewer' ) ) if differences: