More linting, refactor

pull/153/head
Frh 2020-04-19 14:42:18 -07:00
parent 50f11867af
commit c27a8026d6
5 changed files with 30 additions and 33 deletions

View File

@ -25,7 +25,7 @@ PARSERS = {
} }
class PDFHandler(object): class PDFHandler():
"""Handles all operations like temp directory creation, splitting """Handles all operations like temp directory creation, splitting
file into single page PDFs, parsing each PDF and then removing the file into single page PDFs, parsing each PDF and then removing the
temp directory. temp directory.
@ -201,8 +201,8 @@ class PDFHandler(object):
page_idx, page_idx,
layout_kwargs=layout_kwargs layout_kwargs=layout_kwargs
) )
parser._generate_layout(source_file, layout, dimensions, parser.prepare_page_parse(source_file, layout, dimensions,
page_idx, layout_kwargs) page_idx, layout_kwargs)
rootname = os.path.basename(parser.rootname) rootname = os.path.basename(parser.rootname)
if not suppress_stdout: if not suppress_stdout:
logger.info( logger.info(

View File

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
import warnings
from ..utils import ( from ..utils import (
get_text_objects, get_text_objects,
@ -40,8 +41,8 @@ class BaseParser(object):
# For plotting details of parsing algorithms # For plotting details of parsing algorithms
self.debug_info = {} self.debug_info = {}
def _generate_layout(self, filename, layout, dimensions, def prepare_page_parse(self, filename, layout, dimensions,
page_idx, layout_kwargs): page_idx, layout_kwargs):
self.filename = filename self.filename = filename
self.layout_kwargs = layout_kwargs self.layout_kwargs = layout_kwargs
self.layout = layout self.layout = layout
@ -59,6 +60,22 @@ class BaseParser(object):
self.pdf_width, self.pdf_height = self.dimensions self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename) self.rootname, __ = os.path.splitext(self.filename)
def _document_has_no_text(self):
if not self.horizontal_text:
rootname = os.path.basename(self.rootname)
if self.images:
warnings.warn(
"{rootname} is image-based, "
"camelot only works on text-based pages."
.format(rootname=rootname)
)
else:
warnings.warn(
"No tables found on {rootname}".format(rootname=rootname)
)
return True
return False
"""Initialize new table object, ready to be populated """Initialize new table object, ready to be populated
Parameters Parameters

View File

@ -3,7 +3,6 @@
from __future__ import division from __future__ import division
import os import os
import copy import copy
import warnings
from .base import BaseParser from .base import BaseParser
@ -312,18 +311,7 @@ class Lattice(BaseParser):
return table return table
def extract_tables(self, filename): def extract_tables(self, filename):
rootname = os.path.basename(self.rootname) if self._document_has_no_text():
if not self.horizontal_text:
if self.images:
warnings.warn(
"{rootname} is image-based, "
"camelot only works on text-based pages."
.format(rootname=rootname)
)
else:
warnings.warn(
"No tables found on {rootname}".format(rootname=rootname)
)
return [] return []
self._generate_table_bbox() self._generate_table_bbox()

View File

@ -1,7 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import division from __future__ import division
import os
import warnings import warnings
import numpy as np import numpy as np
@ -427,18 +426,7 @@ class Stream(BaseParser):
return table return table
def extract_tables(self, filename): def extract_tables(self, filename):
if not self.horizontal_text: if self._document_has_no_text():
if self.images:
warnings.warn(
"{} is image-based, camelot only works on"
" text-based pages.".format(
os.path.basename(self.rootname))
)
else:
warnings.warn(
"No tables found on {}".format(
os.path.basename(self.rootname))
)
return [] return []
# Identify plausible areas within the doc where tables lie, # Identify plausible areas within the doc where tables lie,

View File

@ -1107,11 +1107,15 @@ def compare_tables(left, right):
diff_df = diff_df.append(lrow, ignore_index=True) diff_df = diff_df.append(lrow, ignore_index=True)
diff_df = diff_df.append(srow, ignore_index=True) diff_df = diff_df.append(srow, ignore_index=True)
diff_df.insert(0, 'Table', [name_table1, name_table2]) diff_df.insert(0, 'Table', [name_table1, name_table2])
print(f"Row {index} differs:") print("Row {index} differs:".format(index=index))
print(diff_df.values) print(diff_df.values)
break break
else: else:
print(f"Row {index} unique to {name_table1}: {lrow}") print("Row {index} unique to {name_table1}: {lrow}".format(
index=index,
name_table1=name_table1,
lrow=lrow
))
break break
else: else:
print("Tables have different shapes") print("Tables have different shapes")