Lint, refactor
parent
ff2ce6f47c
commit
20f18b478f
|
|
@ -4,7 +4,6 @@ import os
|
|||
import sqlite3
|
||||
import zipfile
|
||||
import tempfile
|
||||
from itertools import chain
|
||||
from operator import itemgetter
|
||||
|
||||
import numpy as np
|
||||
|
|
@ -191,26 +190,26 @@ class TextEdges(object):
|
|||
|
||||
table_areas = {}
|
||||
for te in relevant_textedges:
|
||||
if not table_areas:
|
||||
if not table_areas:
|
||||
table_areas[(te.x, te.y0, te.x, te.y1)] = None
|
||||
else:
|
||||
found = None
|
||||
for area in table_areas:
|
||||
# check for overlap
|
||||
if te.y1 >= area[1] and te.y0 <= area[3]:
|
||||
found = area
|
||||
break
|
||||
if found is None:
|
||||
table_areas[(te.x, te.y0, te.x, te.y1)] = None
|
||||
else:
|
||||
found = None
|
||||
for area in table_areas:
|
||||
# check for overlap
|
||||
if te.y1 >= area[1] and te.y0 <= area[3]:
|
||||
found = area
|
||||
break
|
||||
if found is None:
|
||||
table_areas[(te.x, te.y0, te.x, te.y1)] = None
|
||||
else:
|
||||
table_areas.pop(found)
|
||||
updated_area = (
|
||||
found[0],
|
||||
min(te.y0, found[1]),
|
||||
max(found[2], te.x),
|
||||
max(found[3], te.y1),
|
||||
)
|
||||
table_areas[updated_area] = None
|
||||
table_areas.pop(found)
|
||||
updated_area = (
|
||||
found[0],
|
||||
min(te.y0, found[1]),
|
||||
max(found[2], te.x),
|
||||
max(found[3], te.y1),
|
||||
)
|
||||
table_areas[updated_area] = None
|
||||
|
||||
# extend table areas based on textlines that overlap
|
||||
# vertically. it's possible that these textlines were
|
||||
|
|
@ -736,17 +735,19 @@ class Table(object):
|
|||
"""
|
||||
for f in copy_text:
|
||||
if f == "h":
|
||||
for i in range(len(self.cells)):
|
||||
for j in range(len(self.cells[i])):
|
||||
if self.cells[i][j].text.strip() == "":
|
||||
if self.cells[i][j].hspan and not self.cells[i][j].left:
|
||||
self.cells[i][j].text = self.cells[i][j - 1].text
|
||||
for i, row in enumerate(self.cells):
|
||||
for j, cell in enumerate(row):
|
||||
if cell.text.strip() == "" and \
|
||||
cell.hspan and \
|
||||
not cell.left:
|
||||
cell.text = self.cells[i][j - 1].text
|
||||
elif f == "v":
|
||||
for i in range(len(self.cells)):
|
||||
for j in range(len(self.cells[i])):
|
||||
if self.cells[i][j].text.strip() == "":
|
||||
if self.cells[i][j].vspan and not self.cells[i][j].top:
|
||||
self.cells[i][j].text = self.cells[i - 1][j].text
|
||||
for i, row in enumerate(self.cells):
|
||||
for j, cell in enumerate(row):
|
||||
if cell.text.strip() == "" and \
|
||||
cell.vspan and \
|
||||
not cell.top:
|
||||
cell.text = self.cells[i - 1][j].text
|
||||
return self
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
|
||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||
|
||||
|
|
@ -16,6 +17,8 @@ from .utils import (
|
|||
download_url,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("camelot")
|
||||
|
||||
PARSERS = {
|
||||
"lattice": Lattice,
|
||||
"stream": Stream
|
||||
|
|
@ -199,10 +202,13 @@ class PDFHandler(object):
|
|||
layout_kwargs=layout_kwargs
|
||||
)
|
||||
parser._generate_layout(source_file, layout, dimensions,
|
||||
page_idx, layout_kwargs)
|
||||
page_idx, layout_kwargs)
|
||||
rootname = os.path.basename(parser.rootname)
|
||||
if not suppress_stdout:
|
||||
logger.info(
|
||||
"Processing {rootname}".format(rootname=rootname))
|
||||
t = parser.extract_tables(
|
||||
source_file,
|
||||
suppress_stdout=suppress_stdout
|
||||
source_file
|
||||
)
|
||||
tables.extend(t)
|
||||
return TableList(sorted(tables))
|
||||
|
|
|
|||
|
|
@ -12,7 +12,8 @@ from ..core import Table
|
|||
class BaseParser(object):
|
||||
"""Defines a base parser.
|
||||
"""
|
||||
def __init__(self,
|
||||
def __init__(
|
||||
self,
|
||||
parser_id,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
|
|
@ -33,6 +34,7 @@ class BaseParser(object):
|
|||
|
||||
self.flag_size = flag_size
|
||||
|
||||
self.rootname = None
|
||||
self.t_bbox = None
|
||||
|
||||
# For plotting details of parsing algorithms
|
||||
|
|
@ -79,7 +81,6 @@ class BaseParser(object):
|
|||
table.order = table_idx + 1
|
||||
return table
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _reduce_index(t, idx, shift_text):
|
||||
"""Reduces index of a text object if it lies within a spanning
|
||||
|
|
@ -112,4 +113,3 @@ class BaseParser(object):
|
|||
for r_idx, c_idx, text in indices:
|
||||
table.cells[r_idx][c_idx].text = text
|
||||
return pos_errors
|
||||
|
||||
|
|
|
|||
|
|
@ -2,15 +2,9 @@
|
|||
|
||||
from __future__ import division
|
||||
import os
|
||||
import sys
|
||||
import copy
|
||||
import locale
|
||||
import logging
|
||||
import warnings
|
||||
import subprocess
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from .base import BaseParser
|
||||
from ..utils import (
|
||||
|
|
@ -21,8 +15,6 @@ from ..utils import (
|
|||
segments_in_bbox,
|
||||
text_in_bbox,
|
||||
merge_close_lines,
|
||||
get_table_index,
|
||||
compute_accuracy,
|
||||
)
|
||||
from ..image_processing import (
|
||||
adaptive_threshold,
|
||||
|
|
@ -32,9 +24,6 @@ from ..image_processing import (
|
|||
)
|
||||
|
||||
|
||||
logger = logging.getLogger("camelot")
|
||||
|
||||
|
||||
class Lattice(BaseParser):
|
||||
"""Lattice method of parsing looks for lines between text
|
||||
to parse the table.
|
||||
|
|
@ -322,13 +311,8 @@ class Lattice(BaseParser):
|
|||
|
||||
return table
|
||||
|
||||
def extract_tables(self, filename, suppress_stdout=False):
|
||||
# FRHTODO: move extract table core to the base class
|
||||
def extract_tables(self, filename):
|
||||
rootname = os.path.basename(self.rootname)
|
||||
if not suppress_stdout:
|
||||
logger.info(
|
||||
"Processing {rootname}".format(rootname=rootname))
|
||||
|
||||
if not self.horizontal_text:
|
||||
if self.images:
|
||||
warnings.warn(
|
||||
|
|
|
|||
|
|
@ -2,19 +2,13 @@
|
|||
|
||||
from __future__ import division
|
||||
import os
|
||||
import logging
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from .base import BaseParser
|
||||
from ..core import TextEdges
|
||||
from ..utils import (text_in_bbox, compute_accuracy,
|
||||
compute_whitespace)
|
||||
|
||||
|
||||
logger = logging.getLogger("camelot")
|
||||
from ..utils import (text_in_bbox)
|
||||
|
||||
|
||||
class Stream(BaseParser):
|
||||
|
|
@ -432,11 +426,7 @@ class Stream(BaseParser):
|
|||
|
||||
return table
|
||||
|
||||
def extract_tables(self, filename, suppress_stdout=False):
|
||||
if not suppress_stdout:
|
||||
logger.info("Processing {}".format(
|
||||
os.path.basename(self.rootname)))
|
||||
|
||||
def extract_tables(self, filename):
|
||||
if not self.horizontal_text:
|
||||
if self.images:
|
||||
warnings.warn(
|
||||
|
|
|
|||
|
|
@ -1044,14 +1044,14 @@ def compare_tables(left, right):
|
|||
differences.append(
|
||||
"{diff_rows} {more_fewer} rows".format(
|
||||
diff_rows=abs(diff_rows),
|
||||
more_fewer='more' if diff_rows>0 else 'fewer'
|
||||
more_fewer='more' if diff_rows > 0 else 'fewer'
|
||||
)
|
||||
)
|
||||
if (diff_cols):
|
||||
differences.append(
|
||||
"{diff_cols} {more_fewer} columns".format(
|
||||
diff_cols=abs(diff_cols),
|
||||
more_fewer='more' if diff_cols>0 else 'fewer'
|
||||
more_fewer='more' if diff_cols > 0 else 'fewer'
|
||||
)
|
||||
)
|
||||
if differences:
|
||||
|
|
|
|||
Loading…
Reference in New Issue