Lint, refactor
parent
cff7a9698b
commit
50f11867af
|
|
@ -4,7 +4,6 @@ import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import zipfile
|
import zipfile
|
||||||
import tempfile
|
import tempfile
|
||||||
from itertools import chain
|
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
@ -191,26 +190,26 @@ class TextEdges(object):
|
||||||
|
|
||||||
table_areas = {}
|
table_areas = {}
|
||||||
for te in relevant_textedges:
|
for te in relevant_textedges:
|
||||||
if not table_areas:
|
if not table_areas:
|
||||||
|
table_areas[(te.x, te.y0, te.x, te.y1)] = None
|
||||||
|
else:
|
||||||
|
found = None
|
||||||
|
for area in table_areas:
|
||||||
|
# check for overlap
|
||||||
|
if te.y1 >= area[1] and te.y0 <= area[3]:
|
||||||
|
found = area
|
||||||
|
break
|
||||||
|
if found is None:
|
||||||
table_areas[(te.x, te.y0, te.x, te.y1)] = None
|
table_areas[(te.x, te.y0, te.x, te.y1)] = None
|
||||||
else:
|
else:
|
||||||
found = None
|
table_areas.pop(found)
|
||||||
for area in table_areas:
|
updated_area = (
|
||||||
# check for overlap
|
found[0],
|
||||||
if te.y1 >= area[1] and te.y0 <= area[3]:
|
min(te.y0, found[1]),
|
||||||
found = area
|
max(found[2], te.x),
|
||||||
break
|
max(found[3], te.y1),
|
||||||
if found is None:
|
)
|
||||||
table_areas[(te.x, te.y0, te.x, te.y1)] = None
|
table_areas[updated_area] = None
|
||||||
else:
|
|
||||||
table_areas.pop(found)
|
|
||||||
updated_area = (
|
|
||||||
found[0],
|
|
||||||
min(te.y0, found[1]),
|
|
||||||
max(found[2], te.x),
|
|
||||||
max(found[3], te.y1),
|
|
||||||
)
|
|
||||||
table_areas[updated_area] = None
|
|
||||||
|
|
||||||
# extend table areas based on textlines that overlap
|
# extend table areas based on textlines that overlap
|
||||||
# vertically. it's possible that these textlines were
|
# vertically. it's possible that these textlines were
|
||||||
|
|
@ -736,17 +735,19 @@ class Table(object):
|
||||||
"""
|
"""
|
||||||
for f in copy_text:
|
for f in copy_text:
|
||||||
if f == "h":
|
if f == "h":
|
||||||
for i in range(len(self.cells)):
|
for i, row in enumerate(self.cells):
|
||||||
for j in range(len(self.cells[i])):
|
for j, cell in enumerate(row):
|
||||||
if self.cells[i][j].text.strip() == "":
|
if cell.text.strip() == "" and \
|
||||||
if self.cells[i][j].hspan and not self.cells[i][j].left:
|
cell.hspan and \
|
||||||
self.cells[i][j].text = self.cells[i][j - 1].text
|
not cell.left:
|
||||||
|
cell.text = self.cells[i][j - 1].text
|
||||||
elif f == "v":
|
elif f == "v":
|
||||||
for i in range(len(self.cells)):
|
for i, row in enumerate(self.cells):
|
||||||
for j in range(len(self.cells[i])):
|
for j, cell in enumerate(row):
|
||||||
if self.cells[i][j].text.strip() == "":
|
if cell.text.strip() == "" and \
|
||||||
if self.cells[i][j].vspan and not self.cells[i][j].top:
|
cell.vspan and \
|
||||||
self.cells[i][j].text = self.cells[i - 1][j].text
|
not cell.top:
|
||||||
|
cell.text = self.cells[i - 1][j].text
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import logging
|
||||||
|
|
||||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
|
|
||||||
|
|
@ -16,6 +17,8 @@ from .utils import (
|
||||||
download_url,
|
download_url,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger("camelot")
|
||||||
|
|
||||||
PARSERS = {
|
PARSERS = {
|
||||||
"lattice": Lattice,
|
"lattice": Lattice,
|
||||||
"stream": Stream
|
"stream": Stream
|
||||||
|
|
@ -199,10 +202,13 @@ class PDFHandler(object):
|
||||||
layout_kwargs=layout_kwargs
|
layout_kwargs=layout_kwargs
|
||||||
)
|
)
|
||||||
parser._generate_layout(source_file, layout, dimensions,
|
parser._generate_layout(source_file, layout, dimensions,
|
||||||
page_idx, layout_kwargs)
|
page_idx, layout_kwargs)
|
||||||
|
rootname = os.path.basename(parser.rootname)
|
||||||
|
if not suppress_stdout:
|
||||||
|
logger.info(
|
||||||
|
"Processing {rootname}".format(rootname=rootname))
|
||||||
t = parser.extract_tables(
|
t = parser.extract_tables(
|
||||||
source_file,
|
source_file
|
||||||
suppress_stdout=suppress_stdout
|
|
||||||
)
|
)
|
||||||
tables.extend(t)
|
tables.extend(t)
|
||||||
return TableList(sorted(tables))
|
return TableList(sorted(tables))
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,8 @@ from ..core import Table
|
||||||
class BaseParser(object):
|
class BaseParser(object):
|
||||||
"""Defines a base parser.
|
"""Defines a base parser.
|
||||||
"""
|
"""
|
||||||
def __init__(self,
|
def __init__(
|
||||||
|
self,
|
||||||
parser_id,
|
parser_id,
|
||||||
table_regions=None,
|
table_regions=None,
|
||||||
table_areas=None,
|
table_areas=None,
|
||||||
|
|
@ -33,6 +34,7 @@ class BaseParser(object):
|
||||||
|
|
||||||
self.flag_size = flag_size
|
self.flag_size = flag_size
|
||||||
|
|
||||||
|
self.rootname = None
|
||||||
self.t_bbox = None
|
self.t_bbox = None
|
||||||
|
|
||||||
# For plotting details of parsing algorithms
|
# For plotting details of parsing algorithms
|
||||||
|
|
@ -79,7 +81,6 @@ class BaseParser(object):
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
return table
|
return table
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _reduce_index(t, idx, shift_text):
|
def _reduce_index(t, idx, shift_text):
|
||||||
"""Reduces index of a text object if it lies within a spanning
|
"""Reduces index of a text object if it lies within a spanning
|
||||||
|
|
@ -112,4 +113,3 @@ class BaseParser(object):
|
||||||
for r_idx, c_idx, text in indices:
|
for r_idx, c_idx, text in indices:
|
||||||
table.cells[r_idx][c_idx].text = text
|
table.cells[r_idx][c_idx].text = text
|
||||||
return pos_errors
|
return pos_errors
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,15 +2,9 @@
|
||||||
|
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
import copy
|
import copy
|
||||||
import locale
|
|
||||||
import logging
|
|
||||||
import warnings
|
import warnings
|
||||||
import subprocess
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
|
|
@ -21,8 +15,6 @@ from ..utils import (
|
||||||
segments_in_bbox,
|
segments_in_bbox,
|
||||||
text_in_bbox,
|
text_in_bbox,
|
||||||
merge_close_lines,
|
merge_close_lines,
|
||||||
get_table_index,
|
|
||||||
compute_accuracy,
|
|
||||||
)
|
)
|
||||||
from ..image_processing import (
|
from ..image_processing import (
|
||||||
adaptive_threshold,
|
adaptive_threshold,
|
||||||
|
|
@ -32,9 +24,6 @@ from ..image_processing import (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger("camelot")
|
|
||||||
|
|
||||||
|
|
||||||
class Lattice(BaseParser):
|
class Lattice(BaseParser):
|
||||||
"""Lattice method of parsing looks for lines between text
|
"""Lattice method of parsing looks for lines between text
|
||||||
to parse the table.
|
to parse the table.
|
||||||
|
|
@ -322,13 +311,8 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename, suppress_stdout=False):
|
def extract_tables(self, filename):
|
||||||
# FRHTODO: move extract table core to the base class
|
|
||||||
rootname = os.path.basename(self.rootname)
|
rootname = os.path.basename(self.rootname)
|
||||||
if not suppress_stdout:
|
|
||||||
logger.info(
|
|
||||||
"Processing {rootname}".format(rootname=rootname))
|
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
if self.images:
|
if self.images:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
|
|
|
||||||
|
|
@ -2,19 +2,13 @@
|
||||||
|
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
import logging
|
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..core import TextEdges
|
from ..core import TextEdges
|
||||||
from ..utils import (text_in_bbox, compute_accuracy,
|
from ..utils import (text_in_bbox)
|
||||||
compute_whitespace)
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger("camelot")
|
|
||||||
|
|
||||||
|
|
||||||
class Stream(BaseParser):
|
class Stream(BaseParser):
|
||||||
|
|
@ -432,11 +426,7 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename, suppress_stdout=False):
|
def extract_tables(self, filename):
|
||||||
if not suppress_stdout:
|
|
||||||
logger.info("Processing {}".format(
|
|
||||||
os.path.basename(self.rootname)))
|
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
if self.images:
|
if self.images:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
|
|
|
||||||
|
|
@ -1044,14 +1044,14 @@ def compare_tables(left, right):
|
||||||
differences.append(
|
differences.append(
|
||||||
"{diff_rows} {more_fewer} rows".format(
|
"{diff_rows} {more_fewer} rows".format(
|
||||||
diff_rows=abs(diff_rows),
|
diff_rows=abs(diff_rows),
|
||||||
more_fewer='more' if diff_rows>0 else 'fewer'
|
more_fewer='more' if diff_rows > 0 else 'fewer'
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if (diff_cols):
|
if (diff_cols):
|
||||||
differences.append(
|
differences.append(
|
||||||
"{diff_cols} {more_fewer} columns".format(
|
"{diff_cols} {more_fewer} columns".format(
|
||||||
diff_cols=abs(diff_cols),
|
diff_cols=abs(diff_cols),
|
||||||
more_fewer='more' if diff_cols>0 else 'fewer'
|
more_fewer='more' if diff_cols > 0 else 'fewer'
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if differences:
|
if differences:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue