Lint, refactor

pull/153/head
Frh 2020-04-19 14:30:32 -07:00
parent cff7a9698b
commit 50f11867af
6 changed files with 47 additions and 66 deletions

View File

@ -4,7 +4,6 @@ import os
import sqlite3 import sqlite3
import zipfile import zipfile
import tempfile import tempfile
from itertools import chain
from operator import itemgetter from operator import itemgetter
import numpy as np import numpy as np
@ -736,17 +735,19 @@ class Table(object):
""" """
for f in copy_text: for f in copy_text:
if f == "h": if f == "h":
for i in range(len(self.cells)): for i, row in enumerate(self.cells):
for j in range(len(self.cells[i])): for j, cell in enumerate(row):
if self.cells[i][j].text.strip() == "": if cell.text.strip() == "" and \
if self.cells[i][j].hspan and not self.cells[i][j].left: cell.hspan and \
self.cells[i][j].text = self.cells[i][j - 1].text not cell.left:
cell.text = self.cells[i][j - 1].text
elif f == "v": elif f == "v":
for i in range(len(self.cells)): for i, row in enumerate(self.cells):
for j in range(len(self.cells[i])): for j, cell in enumerate(row):
if self.cells[i][j].text.strip() == "": if cell.text.strip() == "" and \
if self.cells[i][j].vspan and not self.cells[i][j].top: cell.vspan and \
self.cells[i][j].text = self.cells[i - 1][j].text not cell.top:
cell.text = self.cells[i - 1][j].text
return self return self

View File

@ -2,6 +2,7 @@
import os import os
import sys import sys
import logging
from PyPDF2 import PdfFileReader, PdfFileWriter from PyPDF2 import PdfFileReader, PdfFileWriter
@ -16,6 +17,8 @@ from .utils import (
download_url, download_url,
) )
logger = logging.getLogger("camelot")
PARSERS = { PARSERS = {
"lattice": Lattice, "lattice": Lattice,
"stream": Stream "stream": Stream
@ -200,9 +203,12 @@ class PDFHandler(object):
) )
parser._generate_layout(source_file, layout, dimensions, parser._generate_layout(source_file, layout, dimensions,
page_idx, layout_kwargs) page_idx, layout_kwargs)
rootname = os.path.basename(parser.rootname)
if not suppress_stdout:
logger.info(
"Processing {rootname}".format(rootname=rootname))
t = parser.extract_tables( t = parser.extract_tables(
source_file, source_file
suppress_stdout=suppress_stdout
) )
tables.extend(t) tables.extend(t)
return TableList(sorted(tables)) return TableList(sorted(tables))

View File

@ -12,7 +12,8 @@ from ..core import Table
class BaseParser(object): class BaseParser(object):
"""Defines a base parser. """Defines a base parser.
""" """
def __init__(self, def __init__(
self,
parser_id, parser_id,
table_regions=None, table_regions=None,
table_areas=None, table_areas=None,
@ -33,6 +34,7 @@ class BaseParser(object):
self.flag_size = flag_size self.flag_size = flag_size
self.rootname = None
self.t_bbox = None self.t_bbox = None
# For plotting details of parsing algorithms # For plotting details of parsing algorithms
@ -79,7 +81,6 @@ class BaseParser(object):
table.order = table_idx + 1 table.order = table_idx + 1
return table return table
@staticmethod @staticmethod
def _reduce_index(t, idx, shift_text): def _reduce_index(t, idx, shift_text):
"""Reduces index of a text object if it lies within a spanning """Reduces index of a text object if it lies within a spanning
@ -112,4 +113,3 @@ class BaseParser(object):
for r_idx, c_idx, text in indices: for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].text = text table.cells[r_idx][c_idx].text = text
return pos_errors return pos_errors

View File

@ -2,15 +2,9 @@
from __future__ import division from __future__ import division
import os import os
import sys
import copy import copy
import locale
import logging
import warnings import warnings
import subprocess
import numpy as np
import pandas as pd
from .base import BaseParser from .base import BaseParser
from ..utils import ( from ..utils import (
@ -21,8 +15,6 @@ from ..utils import (
segments_in_bbox, segments_in_bbox,
text_in_bbox, text_in_bbox,
merge_close_lines, merge_close_lines,
get_table_index,
compute_accuracy,
) )
from ..image_processing import ( from ..image_processing import (
adaptive_threshold, adaptive_threshold,
@ -32,9 +24,6 @@ from ..image_processing import (
) )
logger = logging.getLogger("camelot")
class Lattice(BaseParser): class Lattice(BaseParser):
"""Lattice method of parsing looks for lines between text """Lattice method of parsing looks for lines between text
to parse the table. to parse the table.
@ -322,13 +311,8 @@ class Lattice(BaseParser):
return table return table
def extract_tables(self, filename, suppress_stdout=False): def extract_tables(self, filename):
# FRHTODO: move extract table core to the base class
rootname = os.path.basename(self.rootname) rootname = os.path.basename(self.rootname)
if not suppress_stdout:
logger.info(
"Processing {rootname}".format(rootname=rootname))
if not self.horizontal_text: if not self.horizontal_text:
if self.images: if self.images:
warnings.warn( warnings.warn(

View File

@ -2,19 +2,13 @@
from __future__ import division from __future__ import division
import os import os
import logging
import warnings import warnings
import numpy as np import numpy as np
import pandas as pd
from .base import BaseParser from .base import BaseParser
from ..core import TextEdges from ..core import TextEdges
from ..utils import (text_in_bbox, compute_accuracy, from ..utils import (text_in_bbox)
compute_whitespace)
logger = logging.getLogger("camelot")
class Stream(BaseParser): class Stream(BaseParser):
@ -432,11 +426,7 @@ class Stream(BaseParser):
return table return table
def extract_tables(self, filename, suppress_stdout=False): def extract_tables(self, filename):
if not suppress_stdout:
logger.info("Processing {}".format(
os.path.basename(self.rootname)))
if not self.horizontal_text: if not self.horizontal_text:
if self.images: if self.images:
warnings.warn( warnings.warn(

View File

@ -1044,14 +1044,14 @@ def compare_tables(left, right):
differences.append( differences.append(
"{diff_rows} {more_fewer} rows".format( "{diff_rows} {more_fewer} rows".format(
diff_rows=abs(diff_rows), diff_rows=abs(diff_rows),
more_fewer='more' if diff_rows>0 else 'fewer' more_fewer='more' if diff_rows > 0 else 'fewer'
) )
) )
if (diff_cols): if (diff_cols):
differences.append( differences.append(
"{diff_cols} {more_fewer} columns".format( "{diff_cols} {more_fewer} columns".format(
diff_cols=abs(diff_cols), diff_cols=abs(diff_cols),
more_fewer='more' if diff_cols>0 else 'fewer' more_fewer='more' if diff_cols > 0 else 'fewer'
) )
) )
if differences: if differences: