More linting, refactor
parent
20f18b478f
commit
f37ed50fed
|
|
@ -25,7 +25,7 @@ PARSERS = {
|
|||
}
|
||||
|
||||
|
||||
class PDFHandler(object):
|
||||
class PDFHandler():
|
||||
"""Handles all operations like temp directory creation, splitting
|
||||
file into single page PDFs, parsing each PDF and then removing the
|
||||
temp directory.
|
||||
|
|
@ -201,8 +201,8 @@ class PDFHandler(object):
|
|||
page_idx,
|
||||
layout_kwargs=layout_kwargs
|
||||
)
|
||||
parser._generate_layout(source_file, layout, dimensions,
|
||||
page_idx, layout_kwargs)
|
||||
parser.prepare_page_parse(source_file, layout, dimensions,
|
||||
page_idx, layout_kwargs)
|
||||
rootname = os.path.basename(parser.rootname)
|
||||
if not suppress_stdout:
|
||||
logger.info(
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import warnings
|
||||
|
||||
from ..utils import (
|
||||
get_text_objects,
|
||||
|
|
@ -40,8 +41,8 @@ class BaseParser(object):
|
|||
# For plotting details of parsing algorithms
|
||||
self.debug_info = {}
|
||||
|
||||
def _generate_layout(self, filename, layout, dimensions,
|
||||
page_idx, layout_kwargs):
|
||||
def prepare_page_parse(self, filename, layout, dimensions,
|
||||
page_idx, layout_kwargs):
|
||||
self.filename = filename
|
||||
self.layout_kwargs = layout_kwargs
|
||||
self.layout = layout
|
||||
|
|
@ -59,6 +60,22 @@ class BaseParser(object):
|
|||
self.pdf_width, self.pdf_height = self.dimensions
|
||||
self.rootname, __ = os.path.splitext(self.filename)
|
||||
|
||||
def _document_has_no_text(self):
    """Report whether the current page has no horizontal text to parse.

    When ``self.horizontal_text`` is empty, a warning is emitted that
    names the page (the basename of ``self.rootname``). The wording
    depends on whether ``self.images`` is non-empty.

    Returns
    -------
    bool
        True if ``self.horizontal_text`` is empty (a warning has been
        issued), False otherwise.

    """
    # Guard clause: text is present, nothing to warn about.
    if self.horizontal_text:
        return False

    rootname = os.path.basename(self.rootname)
    if self.images:
        # No text but images present: report the page as image-based.
        message = (
            "{rootname} is image-based, "
            "camelot only works on text-based pages."
            .format(rootname=rootname)
        )
    else:
        message = "No tables found on {rootname}".format(rootname=rootname)

    warnings.warn(message)
    return True
|
||||
|
||||
"""Initialize new table object, ready to be populated
|
||||
|
||||
Parameters
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@
|
|||
from __future__ import division
|
||||
import os
|
||||
import copy
|
||||
import warnings
|
||||
|
||||
|
||||
from .base import BaseParser
|
||||
|
|
@ -312,18 +311,7 @@ class Lattice(BaseParser):
|
|||
return table
|
||||
|
||||
def extract_tables(self, filename):
|
||||
rootname = os.path.basename(self.rootname)
|
||||
if not self.horizontal_text:
|
||||
if self.images:
|
||||
warnings.warn(
|
||||
"{rootname} is image-based, "
|
||||
"camelot only works on text-based pages."
|
||||
.format(rootname=rootname)
|
||||
)
|
||||
else:
|
||||
warnings.warn(
|
||||
"No tables found on {rootname}".format(rootname=rootname)
|
||||
)
|
||||
if self._document_has_no_text():
|
||||
return []
|
||||
|
||||
self._generate_table_bbox()
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import division
|
||||
import os
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
|
@ -427,18 +426,7 @@ class Stream(BaseParser):
|
|||
return table
|
||||
|
||||
def extract_tables(self, filename):
|
||||
if not self.horizontal_text:
|
||||
if self.images:
|
||||
warnings.warn(
|
||||
"{} is image-based, camelot only works on"
|
||||
" text-based pages.".format(
|
||||
os.path.basename(self.rootname))
|
||||
)
|
||||
else:
|
||||
warnings.warn(
|
||||
"No tables found on {}".format(
|
||||
os.path.basename(self.rootname))
|
||||
)
|
||||
if self._document_has_no_text():
|
||||
return []
|
||||
|
||||
# Identify plausible areas within the doc where tables lie,
|
||||
|
|
|
|||
|
|
@ -1107,11 +1107,15 @@ def compare_tables(left, right):
|
|||
diff_df = diff_df.append(lrow, ignore_index=True)
|
||||
diff_df = diff_df.append(srow, ignore_index=True)
|
||||
diff_df.insert(0, 'Table', [name_table1, name_table2])
|
||||
print(f"Row {index} differs:")
|
||||
print("Row {index} differs:".format(index=index))
|
||||
print(diff_df.values)
|
||||
break
|
||||
else:
|
||||
print(f"Row {index} unique to {name_table1}: {lrow}")
|
||||
print("Row {index} unique to {name_table1}: {lrow}".format(
|
||||
index=index,
|
||||
name_table1=name_table1,
|
||||
lrow=lrow
|
||||
))
|
||||
break
|
||||
else:
|
||||
print("Tables have different shapes")
|
||||
|
|
|
|||
Loading…
Reference in New Issue