More linting, refactor

pull/153/head
Frh 2020-04-19 14:42:18 -07:00
parent 50f11867af
commit c27a8026d6
5 changed files with 30 additions and 33 deletions

View File

@@ -25,7 +25,7 @@ PARSERS = {
}
class PDFHandler(object):
class PDFHandler():
"""Handles all operations like temp directory creation, splitting
file into single page PDFs, parsing each PDF and then removing the
temp directory.
@@ -201,8 +201,8 @@ class PDFHandler(object):
page_idx,
layout_kwargs=layout_kwargs
)
parser._generate_layout(source_file, layout, dimensions,
page_idx, layout_kwargs)
parser.prepare_page_parse(source_file, layout, dimensions,
page_idx, layout_kwargs)
rootname = os.path.basename(parser.rootname)
if not suppress_stdout:
logger.info(

View File

@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import os
import warnings
from ..utils import (
get_text_objects,
@@ -40,8 +41,8 @@ class BaseParser(object):
# For plotting details of parsing algorithms
self.debug_info = {}
def _generate_layout(self, filename, layout, dimensions,
page_idx, layout_kwargs):
def prepare_page_parse(self, filename, layout, dimensions,
page_idx, layout_kwargs):
self.filename = filename
self.layout_kwargs = layout_kwargs
self.layout = layout
@@ -59,6 +60,22 @@ class BaseParser(object):
self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename)
def _document_has_no_text(self):
    """Report whether the page lacks horizontal text, warning if so.

    Returns
    -------
    bool
        False when ``self.horizontal_text`` is non-empty. True
        otherwise, after emitting a warning: an "image-based" notice
        when ``self.images`` is truthy, else a "No tables found" notice.

    """
    # Guard clause: text is present, so there is nothing to report.
    if self.horizontal_text:
        return False

    page_name = os.path.basename(self.rootname)
    if self.images:
        message = (
            "{rootname} is image-based, "
            "camelot only works on text-based pages."
        ).format(rootname=page_name)
    else:
        message = "No tables found on {rootname}".format(rootname=page_name)
    warnings.warn(message)
    return True
"""Initialize new table object, ready to be populated
Parameters

View File

@@ -3,7 +3,6 @@
from __future__ import division
import os
import copy
import warnings
from .base import BaseParser
@@ -312,18 +311,7 @@ class Lattice(BaseParser):
return table
def extract_tables(self, filename):
rootname = os.path.basename(self.rootname)
if not self.horizontal_text:
if self.images:
warnings.warn(
"{rootname} is image-based, "
"camelot only works on text-based pages."
.format(rootname=rootname)
)
else:
warnings.warn(
"No tables found on {rootname}".format(rootname=rootname)
)
if self._document_has_no_text():
return []
self._generate_table_bbox()

View File

@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
from __future__ import division
import os
import warnings
import numpy as np
@@ -427,18 +426,7 @@ class Stream(BaseParser):
return table
def extract_tables(self, filename):
if not self.horizontal_text:
if self.images:
warnings.warn(
"{} is image-based, camelot only works on"
" text-based pages.".format(
os.path.basename(self.rootname))
)
else:
warnings.warn(
"No tables found on {}".format(
os.path.basename(self.rootname))
)
if self._document_has_no_text():
return []
# Identify plausible areas within the doc where tables lie,

View File

@@ -1107,11 +1107,15 @@ def compare_tables(left, right):
diff_df = diff_df.append(lrow, ignore_index=True)
diff_df = diff_df.append(srow, ignore_index=True)
diff_df.insert(0, 'Table', [name_table1, name_table2])
print(f"Row {index} differs:")
print("Row {index} differs:".format(index=index))
print(diff_df.values)
break
else:
print(f"Row {index} unique to {name_table1}: {lrow}")
print("Row {index} unique to {name_table1}: {lrow}".format(
index=index,
name_table1=name_table1,
lrow=lrow
))
break
else:
print("Tables have different shapes")