More linting, refactor
parent
50f11867af
commit
c27a8026d6
|
|
@ -25,7 +25,7 @@ PARSERS = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class PDFHandler(object):
|
class PDFHandler():
|
||||||
"""Handles all operations like temp directory creation, splitting
|
"""Handles all operations like temp directory creation, splitting
|
||||||
file into single page PDFs, parsing each PDF and then removing the
|
file into single page PDFs, parsing each PDF and then removing the
|
||||||
temp directory.
|
temp directory.
|
||||||
|
|
@ -201,8 +201,8 @@ class PDFHandler(object):
|
||||||
page_idx,
|
page_idx,
|
||||||
layout_kwargs=layout_kwargs
|
layout_kwargs=layout_kwargs
|
||||||
)
|
)
|
||||||
parser._generate_layout(source_file, layout, dimensions,
|
parser.prepare_page_parse(source_file, layout, dimensions,
|
||||||
page_idx, layout_kwargs)
|
page_idx, layout_kwargs)
|
||||||
rootname = os.path.basename(parser.rootname)
|
rootname = os.path.basename(parser.rootname)
|
||||||
if not suppress_stdout:
|
if not suppress_stdout:
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import warnings
|
||||||
|
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
get_text_objects,
|
get_text_objects,
|
||||||
|
|
@ -40,8 +41,8 @@ class BaseParser(object):
|
||||||
# For plotting details of parsing algorithms
|
# For plotting details of parsing algorithms
|
||||||
self.debug_info = {}
|
self.debug_info = {}
|
||||||
|
|
||||||
def _generate_layout(self, filename, layout, dimensions,
|
def prepare_page_parse(self, filename, layout, dimensions,
|
||||||
page_idx, layout_kwargs):
|
page_idx, layout_kwargs):
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
self.layout_kwargs = layout_kwargs
|
self.layout_kwargs = layout_kwargs
|
||||||
self.layout = layout
|
self.layout = layout
|
||||||
|
|
@ -59,6 +60,22 @@ class BaseParser(object):
|
||||||
self.pdf_width, self.pdf_height = self.dimensions
|
self.pdf_width, self.pdf_height = self.dimensions
|
||||||
self.rootname, __ = os.path.splitext(self.filename)
|
self.rootname, __ = os.path.splitext(self.filename)
|
||||||
|
|
||||||
|
def _document_has_no_text(self):
|
||||||
|
if not self.horizontal_text:
|
||||||
|
rootname = os.path.basename(self.rootname)
|
||||||
|
if self.images:
|
||||||
|
warnings.warn(
|
||||||
|
"{rootname} is image-based, "
|
||||||
|
"camelot only works on text-based pages."
|
||||||
|
.format(rootname=rootname)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
warnings.warn(
|
||||||
|
"No tables found on {rootname}".format(rootname=rootname)
|
||||||
|
)
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
"""Initialize new table object, ready to be populated
|
"""Initialize new table object, ready to be populated
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,6 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
import copy
|
import copy
|
||||||
import warnings
|
|
||||||
|
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
|
|
@ -312,18 +311,7 @@ class Lattice(BaseParser):
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename):
|
def extract_tables(self, filename):
|
||||||
rootname = os.path.basename(self.rootname)
|
if self._document_has_no_text():
|
||||||
if not self.horizontal_text:
|
|
||||||
if self.images:
|
|
||||||
warnings.warn(
|
|
||||||
"{rootname} is image-based, "
|
|
||||||
"camelot only works on text-based pages."
|
|
||||||
.format(rootname=rootname)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
warnings.warn(
|
|
||||||
"No tables found on {rootname}".format(rootname=rootname)
|
|
||||||
)
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
self._generate_table_bbox()
|
self._generate_table_bbox()
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
@ -427,18 +426,7 @@ class Stream(BaseParser):
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename):
|
def extract_tables(self, filename):
|
||||||
if not self.horizontal_text:
|
if self._document_has_no_text():
|
||||||
if self.images:
|
|
||||||
warnings.warn(
|
|
||||||
"{} is image-based, camelot only works on"
|
|
||||||
" text-based pages.".format(
|
|
||||||
os.path.basename(self.rootname))
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
warnings.warn(
|
|
||||||
"No tables found on {}".format(
|
|
||||||
os.path.basename(self.rootname))
|
|
||||||
)
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Identify plausible areas within the doc where tables lie,
|
# Identify plausible areas within the doc where tables lie,
|
||||||
|
|
|
||||||
|
|
@ -1107,11 +1107,15 @@ def compare_tables(left, right):
|
||||||
diff_df = diff_df.append(lrow, ignore_index=True)
|
diff_df = diff_df.append(lrow, ignore_index=True)
|
||||||
diff_df = diff_df.append(srow, ignore_index=True)
|
diff_df = diff_df.append(srow, ignore_index=True)
|
||||||
diff_df.insert(0, 'Table', [name_table1, name_table2])
|
diff_df.insert(0, 'Table', [name_table1, name_table2])
|
||||||
print(f"Row {index} differs:")
|
print("Row {index} differs:".format(index=index))
|
||||||
print(diff_df.values)
|
print(diff_df.values)
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
print(f"Row {index} unique to {name_table1}: {lrow}")
|
print("Row {index} unique to {name_table1}: {lrow}".format(
|
||||||
|
index=index,
|
||||||
|
name_table1=name_table1,
|
||||||
|
lrow=lrow
|
||||||
|
))
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
print("Tables have different shapes")
|
print("Tables have different shapes")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue