Lint, refactor

pull/153/head
Frh 2020-04-19 14:30:32 -07:00
parent cff7a9698b
commit 50f11867af
6 changed files with 47 additions and 66 deletions

View File

@ -4,7 +4,6 @@ import os
import sqlite3
import zipfile
import tempfile
from itertools import chain
from operator import itemgetter
import numpy as np
@ -191,26 +190,26 @@ class TextEdges(object):
table_areas = {}
for te in relevant_textedges:
if not table_areas:
if not table_areas:
table_areas[(te.x, te.y0, te.x, te.y1)] = None
else:
found = None
for area in table_areas:
# check for overlap
if te.y1 >= area[1] and te.y0 <= area[3]:
found = area
break
if found is None:
table_areas[(te.x, te.y0, te.x, te.y1)] = None
else:
found = None
for area in table_areas:
# check for overlap
if te.y1 >= area[1] and te.y0 <= area[3]:
found = area
break
if found is None:
table_areas[(te.x, te.y0, te.x, te.y1)] = None
else:
table_areas.pop(found)
updated_area = (
found[0],
min(te.y0, found[1]),
max(found[2], te.x),
max(found[3], te.y1),
)
table_areas[updated_area] = None
table_areas.pop(found)
updated_area = (
found[0],
min(te.y0, found[1]),
max(found[2], te.x),
max(found[3], te.y1),
)
table_areas[updated_area] = None
# extend table areas based on textlines that overlap
# vertically. it's possible that these textlines were
@ -736,17 +735,19 @@ class Table(object):
"""
for f in copy_text:
if f == "h":
for i in range(len(self.cells)):
for j in range(len(self.cells[i])):
if self.cells[i][j].text.strip() == "":
if self.cells[i][j].hspan and not self.cells[i][j].left:
self.cells[i][j].text = self.cells[i][j - 1].text
for i, row in enumerate(self.cells):
for j, cell in enumerate(row):
if cell.text.strip() == "" and \
cell.hspan and \
not cell.left:
cell.text = self.cells[i][j - 1].text
elif f == "v":
for i in range(len(self.cells)):
for j in range(len(self.cells[i])):
if self.cells[i][j].text.strip() == "":
if self.cells[i][j].vspan and not self.cells[i][j].top:
self.cells[i][j].text = self.cells[i - 1][j].text
for i, row in enumerate(self.cells):
for j, cell in enumerate(row):
if cell.text.strip() == "" and \
cell.vspan and \
not cell.top:
cell.text = self.cells[i - 1][j].text
return self

View File

@ -2,6 +2,7 @@
import os
import sys
import logging
from PyPDF2 import PdfFileReader, PdfFileWriter
@ -16,6 +17,8 @@ from .utils import (
download_url,
)
logger = logging.getLogger("camelot")
PARSERS = {
"lattice": Lattice,
"stream": Stream
@ -199,10 +202,13 @@ class PDFHandler(object):
layout_kwargs=layout_kwargs
)
parser._generate_layout(source_file, layout, dimensions,
page_idx, layout_kwargs)
page_idx, layout_kwargs)
rootname = os.path.basename(parser.rootname)
if not suppress_stdout:
logger.info(
"Processing {rootname}".format(rootname=rootname))
t = parser.extract_tables(
source_file,
suppress_stdout=suppress_stdout
source_file
)
tables.extend(t)
return TableList(sorted(tables))

View File

@ -12,7 +12,8 @@ from ..core import Table
class BaseParser(object):
"""Defines a base parser.
"""
def __init__(self,
def __init__(
self,
parser_id,
table_regions=None,
table_areas=None,
@ -33,6 +34,7 @@ class BaseParser(object):
self.flag_size = flag_size
self.rootname = None
self.t_bbox = None
# For plotting details of parsing algorithms
@ -79,7 +81,6 @@ class BaseParser(object):
table.order = table_idx + 1
return table
@staticmethod
def _reduce_index(t, idx, shift_text):
"""Reduces index of a text object if it lies within a spanning
@ -112,4 +113,3 @@ class BaseParser(object):
for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].text = text
return pos_errors

View File

@ -2,15 +2,9 @@
from __future__ import division
import os
import sys
import copy
import locale
import logging
import warnings
import subprocess
import numpy as np
import pandas as pd
from .base import BaseParser
from ..utils import (
@ -21,8 +15,6 @@ from ..utils import (
segments_in_bbox,
text_in_bbox,
merge_close_lines,
get_table_index,
compute_accuracy,
)
from ..image_processing import (
adaptive_threshold,
@ -32,9 +24,6 @@ from ..image_processing import (
)
logger = logging.getLogger("camelot")
class Lattice(BaseParser):
"""Lattice method of parsing looks for lines between text
to parse the table.
@ -322,13 +311,8 @@ class Lattice(BaseParser):
return table
def extract_tables(self, filename, suppress_stdout=False):
# FRHTODO: move extract table core to the base class
def extract_tables(self, filename):
rootname = os.path.basename(self.rootname)
if not suppress_stdout:
logger.info(
"Processing {rootname}".format(rootname=rootname))
if not self.horizontal_text:
if self.images:
warnings.warn(

View File

@ -2,19 +2,13 @@
from __future__ import division
import os
import logging
import warnings
import numpy as np
import pandas as pd
from .base import BaseParser
from ..core import TextEdges
from ..utils import (text_in_bbox, compute_accuracy,
compute_whitespace)
logger = logging.getLogger("camelot")
from ..utils import (text_in_bbox)
class Stream(BaseParser):
@ -432,11 +426,7 @@ class Stream(BaseParser):
return table
def extract_tables(self, filename, suppress_stdout=False):
if not suppress_stdout:
logger.info("Processing {}".format(
os.path.basename(self.rootname)))
def extract_tables(self, filename):
if not self.horizontal_text:
if self.images:
warnings.warn(

View File

@ -1044,14 +1044,14 @@ def compare_tables(left, right):
differences.append(
"{diff_rows} {more_fewer} rows".format(
diff_rows=abs(diff_rows),
more_fewer='more' if diff_rows>0 else 'fewer'
more_fewer='more' if diff_rows > 0 else 'fewer'
)
)
if (diff_cols):
differences.append(
"{diff_cols} {more_fewer} columns".format(
diff_cols=abs(diff_cols),
more_fewer='more' if diff_cols>0 else 'fewer'
more_fewer='more' if diff_cols > 0 else 'fewer'
)
)
if differences: