Moved duplicated common code to base objects
* Move table initialization common areas to BaseParser
* Stop relying on intermediate file name for source page index
* Create table comparison utility function to help in debugging
* Generate pdf as images in stream mode plots
* Fix pylint errors

pull/127/head
|
|
@ -10,6 +10,11 @@ from operator import itemgetter
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from .utils import (
|
||||||
|
compute_accuracy,
|
||||||
|
compute_whitespace,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# minimum number of vertical textline intersections for a textedge
|
# minimum number of vertical textline intersections for a textedge
|
||||||
# to be considered valid
|
# to be considered valid
|
||||||
|
|
@ -479,6 +484,9 @@ class Table(object):
|
||||||
self.whitespace = 0
|
self.whitespace = 0
|
||||||
self.order = None
|
self.order = None
|
||||||
self.page = None
|
self.page = None
|
||||||
|
self.flavor = None # Flavor of the parser that generated the table
|
||||||
|
self.pdf_size = None # Dimensions of the original PDF page
|
||||||
|
self.debug_info = None # Field holding debug data
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "<{} shape={}>".format(self.__class__.__name__, self.shape)
|
return "<{} shape={}>".format(self.__class__.__name__, self.shape)
|
||||||
|
|
@ -513,6 +521,17 @@ class Table(object):
|
||||||
}
|
}
|
||||||
return report
|
return report
|
||||||
|
|
||||||
|
def fill_data(self, parser):
|
||||||
|
self.flavor = parser.id
|
||||||
|
self.debug_info = parser.debug_info
|
||||||
|
data = self.data
|
||||||
|
self.df = pd.DataFrame(data)
|
||||||
|
self.shape = self.df.shape
|
||||||
|
|
||||||
|
self.whitespace = compute_whitespace(data)
|
||||||
|
|
||||||
|
self.pdf_size = (parser.pdf_width, parser.pdf_height)
|
||||||
|
|
||||||
def set_all_edges(self):
|
def set_all_edges(self):
|
||||||
"""Sets all table edges to True.
|
"""Sets all table edges to True.
|
||||||
"""
|
"""
|
||||||
|
|
@ -747,6 +766,7 @@ class Table(object):
|
||||||
"encoding": "utf-8",
|
"encoding": "utf-8",
|
||||||
}
|
}
|
||||||
kw.update(kwargs)
|
kw.update(kwargs)
|
||||||
|
# pylint: disable=abstract-class-instantiated
|
||||||
writer = pd.ExcelWriter(path)
|
writer = pd.ExcelWriter(path)
|
||||||
self.df.to_excel(writer, **kw)
|
self.df.to_excel(writer, **kw)
|
||||||
writer.save()
|
writer.save()
|
||||||
|
|
@ -874,6 +894,7 @@ class TableList(object):
|
||||||
self._compress_dir(**kwargs)
|
self._compress_dir(**kwargs)
|
||||||
elif f == "excel":
|
elif f == "excel":
|
||||||
filepath = os.path.join(dirname, basename)
|
filepath = os.path.join(dirname, basename)
|
||||||
|
# pylint: disable=abstract-class-instantiated
|
||||||
writer = pd.ExcelWriter(filepath)
|
writer = pd.ExcelWriter(filepath)
|
||||||
for table in self._tables:
|
for table in self._tables:
|
||||||
sheet_name = "page-{}-table-{}".format(table.page, table.order)
|
sheet_name = "page-{}-table-{}".format(table.page, table.order)
|
||||||
|
|
|
||||||
|
|
@ -101,26 +101,32 @@ class PDFHandler(object):
|
||||||
temp : str
|
temp : str
|
||||||
Tmp directory.
|
Tmp directory.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
fpath : str
|
||||||
|
The path of the single page PDF created.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
|
||||||
with open(filepath, "rb") as fileobj:
|
with open(filepath, "rb") as fileobj:
|
||||||
infile = PdfFileReader(fileobj, strict=False)
|
infile = PdfFileReader(fileobj, strict=False)
|
||||||
if infile.isEncrypted:
|
if infile.isEncrypted:
|
||||||
infile.decrypt(self.password)
|
infile.decrypt(self.password)
|
||||||
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
|
|
||||||
froot, fext = os.path.splitext(fpath)
|
froot, fext = os.path.splitext(fpath)
|
||||||
p = infile.getPage(page - 1)
|
p = infile.getPage(page - 1)
|
||||||
outfile = PdfFileWriter()
|
outfile = PdfFileWriter()
|
||||||
outfile.addPage(p)
|
outfile.addPage(p)
|
||||||
with open(fpath, "wb") as f:
|
with open(fpath, "wb") as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
layout, dim = get_page_layout(fpath)
|
layout, __ = get_page_layout(fpath)
|
||||||
# fix rotated PDF
|
# fix rotated PDF
|
||||||
chars = get_text_objects(layout, ltype="char")
|
chars = get_text_objects(layout, ltype="char")
|
||||||
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
|
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
|
||||||
vertical_text = get_text_objects(layout, ltype="vertical_text")
|
vertical_text = get_text_objects(layout, ltype="vertical_text")
|
||||||
rotation = get_rotation(chars, horizontal_text, vertical_text)
|
rotation = get_rotation(chars, horizontal_text, vertical_text)
|
||||||
if rotation != "":
|
if rotation != "":
|
||||||
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
|
fpath_new = "".join(
|
||||||
|
[froot.replace("page", "p"), "_rotated", fext])
|
||||||
os.rename(fpath, fpath_new)
|
os.rename(fpath, fpath_new)
|
||||||
infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
|
infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
|
||||||
if infile.isEncrypted:
|
if infile.isEncrypted:
|
||||||
|
|
@ -134,9 +140,11 @@ class PDFHandler(object):
|
||||||
outfile.addPage(p)
|
outfile.addPage(p)
|
||||||
with open(fpath, "wb") as f:
|
with open(fpath, "wb") as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
|
return fpath
|
||||||
|
|
||||||
def parse(
|
def parse(
|
||||||
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
|
self, flavor="lattice", suppress_stdout=False,
|
||||||
|
layout_kwargs={}, **kwargs
|
||||||
):
|
):
|
||||||
"""Extracts tables by calling parser.get_tables on all single
|
"""Extracts tables by calling parser.get_tables on all single
|
||||||
page PDFs.
|
page PDFs.
|
||||||
|
|
@ -149,7 +157,7 @@ class PDFHandler(object):
|
||||||
suppress_stdout : str (default: False)
|
suppress_stdout : str (default: False)
|
||||||
Suppress logs and warnings.
|
Suppress logs and warnings.
|
||||||
layout_kwargs : dict, optional (default: {})
|
layout_kwargs : dict, optional (default: {})
|
||||||
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
|
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs. # noqa
|
||||||
kwargs : dict
|
kwargs : dict
|
||||||
See camelot.read_pdf kwargs.
|
See camelot.read_pdf kwargs.
|
||||||
|
|
||||||
|
|
@ -161,15 +169,22 @@ class PDFHandler(object):
|
||||||
"""
|
"""
|
||||||
tables = []
|
tables = []
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
for p in self.pages:
|
parser = \
|
||||||
self._save_page(self.filepath, p, tempdir)
|
Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
|
||||||
pages = [
|
|
||||||
os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
|
# For each of the pages we need to parse, generate a single page
|
||||||
]
|
# .pdf in a temporary folder.
|
||||||
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
|
for page_idx in self.pages:
|
||||||
for p in pages:
|
single_page_pdf_file = self._save_page(
|
||||||
|
self.filepath,
|
||||||
|
page_idx,
|
||||||
|
tempdir
|
||||||
|
)
|
||||||
t = parser.extract_tables(
|
t = parser.extract_tables(
|
||||||
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
|
single_page_pdf_file,
|
||||||
|
page_idx,
|
||||||
|
suppress_stdout=suppress_stdout,
|
||||||
|
layout_kwargs=layout_kwargs
|
||||||
)
|
)
|
||||||
tables.extend(t)
|
tables.extend(t)
|
||||||
return TableList(sorted(tables))
|
return TableList(sorted(tables))
|
||||||
|
|
|
||||||
|
|
@ -2,11 +2,13 @@
|
||||||
|
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
|
|
||||||
import cv2
|
from cv2 import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
def adaptive_threshold(
|
||||||
|
imagename, process_background=False, blocksize=15, c=-2
|
||||||
|
):
|
||||||
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
|
@ -19,12 +21,12 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
||||||
Size of a pixel neighborhood that is used to calculate a
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
threshold value for the pixel: 3, 5, 7, and so on.
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
|
|
||||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
|
||||||
c : int, optional (default: -2)
|
c : int, optional (default: -2)
|
||||||
Constant subtracted from the mean or weighted mean.
|
Constant subtracted from the mean or weighted mean.
|
||||||
Normally, it is positive but may be zero or negative as well.
|
Normally, it is positive but may be zero or negative as well.
|
||||||
|
|
||||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
@ -39,7 +41,9 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
||||||
|
|
||||||
if process_background:
|
if process_background:
|
||||||
threshold = cv2.adaptiveThreshold(
|
threshold = cv2.adaptiveThreshold(
|
||||||
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
|
gray, 255,
|
||||||
|
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
||||||
|
cv2.THRESH_BINARY, blocksize, c
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
threshold = cv2.adaptiveThreshold(
|
threshold = cv2.adaptiveThreshold(
|
||||||
|
|
@ -54,7 +58,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
||||||
|
|
||||||
|
|
||||||
def find_lines(
|
def find_lines(
|
||||||
threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
|
threshold, regions=None, direction="horizontal",
|
||||||
|
line_scale=15, iterations=0
|
||||||
):
|
):
|
||||||
"""Finds horizontal and vertical lines by applying morphological
|
"""Finds horizontal and vertical lines by applying morphological
|
||||||
transformations on an image.
|
transformations on an image.
|
||||||
|
|
@ -78,7 +83,7 @@ def find_lines(
|
||||||
iterations : int, optional (default: 0)
|
iterations : int, optional (default: 0)
|
||||||
Number of times for erosion/dilation is applied.
|
Number of times for erosion/dilation is applied.
|
||||||
|
|
||||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. # noqa
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
@ -100,13 +105,14 @@ def find_lines(
|
||||||
size = threshold.shape[1] // line_scale
|
size = threshold.shape[1] // line_scale
|
||||||
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
||||||
elif direction is None:
|
elif direction is None:
|
||||||
raise ValueError("Specify direction as either 'vertical' or 'horizontal'")
|
raise ValueError("Specify direction as either 'vertical' "
|
||||||
|
"or 'horizontal'")
|
||||||
|
|
||||||
if regions is not None:
|
if regions is not None:
|
||||||
region_mask = np.zeros(threshold.shape)
|
region_mask = np.zeros(threshold.shape)
|
||||||
for region in regions:
|
for region in regions:
|
||||||
x, y, w, h = region
|
x, y, w, h = region
|
||||||
region_mask[y : y + h, x : x + w] = 1
|
region_mask[y: y + h, x: x + w] = 1
|
||||||
threshold = np.multiply(threshold, region_mask)
|
threshold = np.multiply(threshold, region_mask)
|
||||||
|
|
||||||
threshold = cv2.erode(threshold, el)
|
threshold = cv2.erode(threshold, el)
|
||||||
|
|
@ -115,12 +121,16 @@ def find_lines(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
_, contours, _ = cv2.findContours(
|
_, contours, _ = cv2.findContours(
|
||||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
threshold.astype(np.uint8),
|
||||||
|
cv2.RETR_EXTERNAL,
|
||||||
|
cv2.CHAIN_APPROX_SIMPLE
|
||||||
)
|
)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# for opencv backward compatibility
|
# for opencv backward compatibility
|
||||||
contours, _ = cv2.findContours(
|
contours, _ = cv2.findContours(
|
||||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
threshold.astype(np.uint8),
|
||||||
|
cv2.RETR_EXTERNAL,
|
||||||
|
cv2.CHAIN_APPROX_SIMPLE
|
||||||
)
|
)
|
||||||
|
|
||||||
for c in contours:
|
for c in contours:
|
||||||
|
|
@ -202,7 +212,7 @@ def find_joints(contours, vertical, horizontal):
|
||||||
tables = {}
|
tables = {}
|
||||||
for c in contours:
|
for c in contours:
|
||||||
x, y, w, h = c
|
x, y, w, h = c
|
||||||
roi = joints[y : y + h, x : x + w]
|
roi = joints[y: y + h, x: x + w]
|
||||||
try:
|
try:
|
||||||
__, jc, __ = cv2.findContours(
|
__, jc, __ = cv2.findContours(
|
||||||
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
|
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
|
||||||
|
|
|
||||||
|
|
@ -2,19 +2,94 @@
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from ..utils import get_page_layout, get_text_objects
|
from ..utils import (
|
||||||
|
get_page_layout,
|
||||||
|
get_text_objects
|
||||||
|
)
|
||||||
|
from ..core import Table
|
||||||
|
|
||||||
|
from ..image_processing import (
|
||||||
|
adaptive_threshold,
|
||||||
|
find_lines,
|
||||||
|
find_contours,
|
||||||
|
find_joints
|
||||||
|
)
|
||||||
|
|
||||||
|
# Pylint can't detect contents of cv2
|
||||||
|
from cv2 import imread # pylint: disable=no-name-in-module
|
||||||
|
|
||||||
|
|
||||||
class BaseParser(object):
|
class BaseParser(object):
|
||||||
"""Defines a base parser.
|
"""Defines a base parser.
|
||||||
"""
|
"""
|
||||||
|
def __init__(self, parser_id):
|
||||||
|
self.imagename = None
|
||||||
|
self.pdf_image = None
|
||||||
|
self.id = parser_id
|
||||||
|
|
||||||
def _generate_layout(self, filename, layout_kwargs):
|
# For plotting details of parsing algorithms
|
||||||
|
self.debug_info = {}
|
||||||
|
|
||||||
|
def _generate_layout(self, filename, page_idx, layout_kwargs):
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
self.layout_kwargs = layout_kwargs
|
self.layout_kwargs = layout_kwargs
|
||||||
self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
|
self.layout, self.dimensions = get_page_layout(
|
||||||
|
filename,
|
||||||
|
**layout_kwargs
|
||||||
|
)
|
||||||
self.images = get_text_objects(self.layout, ltype="image")
|
self.images = get_text_objects(self.layout, ltype="image")
|
||||||
self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
|
self.horizontal_text = get_text_objects(
|
||||||
self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
|
self.layout,
|
||||||
|
ltype="horizontal_text"
|
||||||
|
)
|
||||||
|
self.vertical_text = get_text_objects(
|
||||||
|
self.layout,
|
||||||
|
ltype="vertical_text"
|
||||||
|
)
|
||||||
self.pdf_width, self.pdf_height = self.dimensions
|
self.pdf_width, self.pdf_height = self.dimensions
|
||||||
self.rootname, __ = os.path.splitext(self.filename)
|
self.rootname, __ = os.path.splitext(self.filename)
|
||||||
|
|
||||||
|
self.page = page_idx
|
||||||
|
|
||||||
|
def generate_image(self):
|
||||||
|
if self.pdf_image is None:
|
||||||
|
self._generate_image_file()
|
||||||
|
self.pdf_image = imread(self.imagename)
|
||||||
|
|
||||||
|
def _generate_image_file(self):
|
||||||
|
if self.imagename:
|
||||||
|
return
|
||||||
|
from ..ext.ghostscript import Ghostscript
|
||||||
|
|
||||||
|
self.imagename = "".join([self.rootname, ".png"])
|
||||||
|
gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
|
||||||
|
self.imagename, self.filename
|
||||||
|
)
|
||||||
|
gs_call = gs_call.encode().split()
|
||||||
|
null = open(os.devnull, "wb")
|
||||||
|
Ghostscript(*gs_call, stdout=null)
|
||||||
|
# with Ghostscript(*gs_call, stdout=null) as gs:
|
||||||
|
# pass
|
||||||
|
null.close()
|
||||||
|
|
||||||
|
"""Initialize new table object, ready to be populated
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
table_idx : int
|
||||||
|
Index of this table within the pdf page analyzed
|
||||||
|
cols : list
|
||||||
|
list of coordinate boundaries tuples (left, right)
|
||||||
|
rows : list
|
||||||
|
list of coordinate boundaries tuples (bottom, top)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
t : camelot.core.Table
|
||||||
|
|
||||||
|
"""
|
||||||
|
def _initialize_new_table(self, table_idx, cols, rows):
|
||||||
|
table = Table(cols, rows)
|
||||||
|
table.page = self.page
|
||||||
|
table.order = table_idx + 1
|
||||||
|
return table
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,6 @@ import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..core import Table
|
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
scale_image,
|
scale_image,
|
||||||
scale_pdf,
|
scale_pdf,
|
||||||
|
|
@ -22,7 +21,6 @@ from ..utils import (
|
||||||
merge_close_lines,
|
merge_close_lines,
|
||||||
get_table_index,
|
get_table_index,
|
||||||
compute_accuracy,
|
compute_accuracy,
|
||||||
compute_whitespace,
|
|
||||||
)
|
)
|
||||||
from ..image_processing import (
|
from ..image_processing import (
|
||||||
adaptive_threshold,
|
adaptive_threshold,
|
||||||
|
|
@ -80,7 +78,7 @@ class Lattice(BaseParser):
|
||||||
Size of a pixel neighborhood that is used to calculate a
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
threshold value for the pixel: 3, 5, 7, and so on.
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
|
|
||||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
|
||||||
threshold_constant : int, optional (default: -2)
|
threshold_constant : int, optional (default: -2)
|
||||||
Constant subtracted from the mean or weighted mean.
|
Constant subtracted from the mean or weighted mean.
|
||||||
Normally, it is positive but may be zero or negative as well.
|
Normally, it is positive but may be zero or negative as well.
|
||||||
|
|
@ -114,6 +112,7 @@ class Lattice(BaseParser):
|
||||||
resolution=300,
|
resolution=300,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
|
super().__init__("lattice")
|
||||||
self.table_regions = table_regions
|
self.table_regions = table_regions
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
self.process_background = process_background
|
self.process_background = process_background
|
||||||
|
|
@ -208,19 +207,6 @@ class Lattice(BaseParser):
|
||||||
t.cells[i][j].text = t.cells[i - 1][j].text
|
t.cells[i][j].text = t.cells[i - 1][j].text
|
||||||
return t
|
return t
|
||||||
|
|
||||||
def _generate_image(self):
|
|
||||||
from ..ext.ghostscript import Ghostscript
|
|
||||||
|
|
||||||
self.imagename = "".join([self.rootname, ".png"])
|
|
||||||
gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
|
|
||||||
self.imagename, self.filename
|
|
||||||
)
|
|
||||||
gs_call = gs_call.encode().split()
|
|
||||||
null = open(os.devnull, "wb")
|
|
||||||
with Ghostscript(*gs_call, stdout=null) as gs:
|
|
||||||
pass
|
|
||||||
null.close()
|
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
def scale_areas(areas):
|
def scale_areas(areas):
|
||||||
scaled_areas = []
|
scaled_areas = []
|
||||||
|
|
@ -234,20 +220,21 @@ class Lattice(BaseParser):
|
||||||
scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
|
scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
|
||||||
return scaled_areas
|
return scaled_areas
|
||||||
|
|
||||||
self.image, self.threshold = adaptive_threshold(
|
self.pdf_image, self.threshold = adaptive_threshold(
|
||||||
self.imagename,
|
self.imagename,
|
||||||
process_background=self.process_background,
|
process_background=self.process_background,
|
||||||
blocksize=self.threshold_blocksize,
|
blocksize=self.threshold_blocksize,
|
||||||
c=self.threshold_constant,
|
c=self.threshold_constant,
|
||||||
)
|
)
|
||||||
|
|
||||||
image_width = self.image.shape[1]
|
image_width = self.pdf_image.shape[1]
|
||||||
image_height = self.image.shape[0]
|
image_height = self.pdf_image.shape[0]
|
||||||
image_width_scaler = image_width / float(self.pdf_width)
|
image_width_scaler = image_width / float(self.pdf_width)
|
||||||
image_height_scaler = image_height / float(self.pdf_height)
|
image_height_scaler = image_height / float(self.pdf_height)
|
||||||
pdf_width_scaler = self.pdf_width / float(image_width)
|
pdf_width_scaler = self.pdf_width / float(image_width)
|
||||||
pdf_height_scaler = self.pdf_height / float(image_height)
|
pdf_height_scaler = self.pdf_height / float(image_height)
|
||||||
image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
|
image_scalers = (image_width_scaler,
|
||||||
|
image_height_scaler, self.pdf_height)
|
||||||
pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
|
pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
|
||||||
|
|
||||||
if self.table_areas is None:
|
if self.table_areas is None:
|
||||||
|
|
@ -291,7 +278,11 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
|
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
|
||||||
|
|
||||||
self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
|
[
|
||||||
|
self.table_bbox,
|
||||||
|
self.vertical_segments,
|
||||||
|
self.horizontal_segments
|
||||||
|
] = scale_image(
|
||||||
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
|
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -315,7 +306,10 @@ class Lattice(BaseParser):
|
||||||
rows.extend([tk[1], tk[3]])
|
rows.extend([tk[1], tk[3]])
|
||||||
# sort horizontal and vertical segments
|
# sort horizontal and vertical segments
|
||||||
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
|
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
|
||||||
rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
|
rows = merge_close_lines(
|
||||||
|
sorted(rows, reverse=True),
|
||||||
|
line_tol=self.line_tol
|
||||||
|
)
|
||||||
# make grid using x and y coord of shortlisted rows and cols
|
# make grid using x and y coord of shortlisted rows and cols
|
||||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||||
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
|
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
|
||||||
|
|
@ -328,7 +322,7 @@ class Lattice(BaseParser):
|
||||||
if v_s is None or h_s is None:
|
if v_s is None or h_s is None:
|
||||||
raise ValueError("No segments found on {}".format(self.rootname))
|
raise ValueError("No segments found on {}".format(self.rootname))
|
||||||
|
|
||||||
table = Table(cols, rows)
|
table = self._initialize_new_table(table_idx, cols, rows)
|
||||||
# set table edges to True using ver+hor lines
|
# set table edges to True using ver+hor lines
|
||||||
table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
|
table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
|
||||||
# set table border edges to True
|
# set table border edges to True
|
||||||
|
|
@ -359,48 +353,44 @@ class Lattice(BaseParser):
|
||||||
accuracy = compute_accuracy([[100, pos_errors]])
|
accuracy = compute_accuracy([[100, pos_errors]])
|
||||||
|
|
||||||
if self.copy_text is not None:
|
if self.copy_text is not None:
|
||||||
table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)
|
table = Lattice._copy_spanning_text(
|
||||||
|
table,
|
||||||
|
copy_text=self.copy_text
|
||||||
|
)
|
||||||
|
|
||||||
data = table.data
|
table.fill_data(self)
|
||||||
table.df = pd.DataFrame(data)
|
|
||||||
table.shape = table.df.shape
|
|
||||||
|
|
||||||
whitespace = compute_whitespace(data)
|
|
||||||
table.flavor = "lattice"
|
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
table.whitespace = whitespace
|
|
||||||
table.order = table_idx + 1
|
|
||||||
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
|
|
||||||
|
|
||||||
# for plotting
|
# for plotting
|
||||||
_text = []
|
_text = []
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
table._text = _text
|
table._text = _text
|
||||||
table._image = (self.image, self.table_bbox_unscaled)
|
table._image = (self.pdf_image, self.table_bbox_unscaled)
|
||||||
table._segments = (self.vertical_segments, self.horizontal_segments)
|
table._segments = (self.vertical_segments, self.horizontal_segments)
|
||||||
table._textedges = None
|
table._textedges = None
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
|
def extract_tables(self, filename, page_idx=1, suppress_stdout=False,
|
||||||
self._generate_layout(filename, layout_kwargs)
|
layout_kwargs={}):
|
||||||
|
self._generate_layout(filename, page_idx, layout_kwargs)
|
||||||
if not suppress_stdout:
|
if not suppress_stdout:
|
||||||
logger.info("Processing {}".format(os.path.basename(self.rootname)))
|
logger.info(f"Processing {os.path.basename(self.rootname)}")
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
if self.images:
|
if self.images:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"{} is image-based, camelot only works on"
|
f"{os.path.basename(self.rootname)} is image-based, "
|
||||||
" text-based pages.".format(os.path.basename(self.rootname))
|
"camelot only works on text-based pages."
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"No tables found on {}".format(os.path.basename(self.rootname))
|
f"No tables found on {os.path.basename(self.rootname)}"
|
||||||
)
|
)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
self._generate_image()
|
self._generate_image_file()
|
||||||
self._generate_table_bbox()
|
self._generate_table_bbox()
|
||||||
|
|
||||||
_tables = []
|
_tables = []
|
||||||
|
|
@ -408,8 +398,10 @@ class Lattice(BaseParser):
|
||||||
for table_idx, tk in enumerate(
|
for table_idx, tk in enumerate(
|
||||||
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
||||||
):
|
):
|
||||||
cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
|
cols, rows, v_s, h_s = self._generate_columns_and_rows(
|
||||||
table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
table_idx, tk)
|
||||||
|
table = self._generate_table(
|
||||||
|
table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
||||||
table._bbox = tk
|
table._bbox = tk
|
||||||
_tables.append(table)
|
_tables.append(table)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..core import TextEdges, Table
|
from ..core import TextEdges
|
||||||
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
||||||
compute_whitespace)
|
compute_whitespace)
|
||||||
|
|
||||||
|
|
@ -69,6 +69,7 @@ class Stream(BaseParser):
|
||||||
column_tol=0,
|
column_tol=0,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
|
super().__init__("stream")
|
||||||
self.table_regions = table_regions
|
self.table_regions = table_regions
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
|
|
@ -120,21 +121,26 @@ class Stream(BaseParser):
|
||||||
Two-dimensional list of text objects grouped into rows.
|
Two-dimensional list of text objects grouped into rows.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
row_y = 0
|
row_y = None
|
||||||
rows = []
|
rows = []
|
||||||
temp = []
|
temp = []
|
||||||
for t in text:
|
non_empty_text = [t for t in text if t.get_text().strip()]
|
||||||
|
for t in non_empty_text:
|
||||||
# is checking for upright necessary?
|
# is checking for upright necessary?
|
||||||
# if t.get_text().strip() and all([obj.upright for obj in t._objs
|
# if t.get_text().strip() and all([obj.upright \
|
||||||
|
# for obj in t._objs
|
||||||
# if type(obj) is LTChar]):
|
# if type(obj) is LTChar]):
|
||||||
if t.get_text().strip():
|
if row_y is not None and \
|
||||||
if not np.isclose(row_y, t.y0, atol=row_tol):
|
not np.isclose(row_y, t.y0, atol=row_tol) and \
|
||||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
0.5 * (t.y1 + t.y0) < row_y:
|
||||||
temp = []
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
row_y = t.y0
|
temp = []
|
||||||
temp.append(t)
|
# We update the row's bottom as we go, to be forgiving if there
|
||||||
|
# is a gradual change across multiple columns.
|
||||||
|
row_y = t.y0
|
||||||
|
|
||||||
|
temp.append(t)
|
||||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
__ = rows.pop(0) # TODO: hacky
|
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
@ -278,7 +284,7 @@ class Stream(BaseParser):
|
||||||
def _nurminen_table_detection(self, textlines):
|
def _nurminen_table_detection(self, textlines):
|
||||||
"""A general implementation of the table detection algorithm
|
"""A general implementation of the table detection algorithm
|
||||||
described by Anssi Nurminen's master's thesis.
|
described by Anssi Nurminen's master's thesis.
|
||||||
Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
|
Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 # noqa
|
||||||
|
|
||||||
Assumes that tables are situated relatively far apart
|
Assumes that tables are situated relatively far apart
|
||||||
vertically.
|
vertically.
|
||||||
|
|
@ -378,12 +384,29 @@ class Stream(BaseParser):
|
||||||
"No tables found in table area {}"
|
"No tables found in table area {}"
|
||||||
.format(table_idx + 1)
|
.format(table_idx + 1)
|
||||||
)
|
)
|
||||||
cols = [
|
|
||||||
(t.x0, t.x1) for r in rows_grouped if len(r) == ncols
|
# Identify rows which contain the mode of the number of columns
|
||||||
for t in r
|
full_rows = list(filter(
|
||||||
|
lambda row: len(row) == ncols,
|
||||||
|
rows_grouped))
|
||||||
|
cells_on_full_rows_xrange = [
|
||||||
|
(t.x0, t.x1) for r in full_rows for t in r
|
||||||
]
|
]
|
||||||
cols = self._merge_columns(sorted(cols),
|
# TODO: fixme / make a decision on this
|
||||||
|
# plausible_rows = list(filter(
|
||||||
|
# lambda row: len(row) <= ncols*1.2 and len(row) >= ncols*.8,
|
||||||
|
# rows_grouped))
|
||||||
|
# plausible_cells_xrange = [
|
||||||
|
# (t.x0, t.x1) for r in plausible_rows for t in r
|
||||||
|
# ]
|
||||||
|
# self.debug_info['plausible_rows'] = plausible_rows
|
||||||
|
|
||||||
|
# Identify column boundaries based on the contents of these rows
|
||||||
|
cols = self._merge_columns(sorted(cells_on_full_rows_xrange),
|
||||||
column_tol=self.column_tol)
|
column_tol=self.column_tol)
|
||||||
|
# cols = self._merge_columns(sorted(plausible_cells_xrange),
|
||||||
|
# column_tol=self.column_tol)
|
||||||
|
|
||||||
inner_text = []
|
inner_text = []
|
||||||
for i in range(1, len(cols)):
|
for i in range(1, len(cols)):
|
||||||
left = cols[i - 1][1]
|
left = cols[i - 1][1]
|
||||||
|
|
@ -409,7 +432,7 @@ class Stream(BaseParser):
|
||||||
return cols, rows
|
return cols, rows
|
||||||
|
|
||||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||||
table = Table(cols, rows)
|
table = self._initialize_new_table(table_idx, cols, rows)
|
||||||
table = table.set_all_edges()
|
table = table.set_all_edges()
|
||||||
|
|
||||||
pos_errors = []
|
pos_errors = []
|
||||||
|
|
@ -431,31 +454,25 @@ class Stream(BaseParser):
|
||||||
table.cells[r_idx][c_idx].text = text
|
table.cells[r_idx][c_idx].text = text
|
||||||
accuracy = compute_accuracy([[100, pos_errors]])
|
accuracy = compute_accuracy([[100, pos_errors]])
|
||||||
|
|
||||||
data = table.data
|
table.fill_data(self)
|
||||||
table.df = pd.DataFrame(data)
|
|
||||||
table.shape = table.df.shape
|
|
||||||
|
|
||||||
whitespace = compute_whitespace(data)
|
|
||||||
table.flavor = "stream"
|
|
||||||
table.accuracy = accuracy
|
table.accuracy = accuracy
|
||||||
table.whitespace = whitespace
|
|
||||||
table.order = table_idx + 1
|
|
||||||
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
|
|
||||||
|
|
||||||
# for plotting
|
# for plotting
|
||||||
_text = []
|
_text = []
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
table._text = _text
|
table._text = _text
|
||||||
table._image = None
|
self.generate_image()
|
||||||
|
table._image = (self.pdf_image, self.table_bbox)
|
||||||
table._segments = None
|
table._segments = None
|
||||||
table._textedges = self.textedges
|
table._textedges = self.textedges
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, filename, suppress_stdout=False,
|
def extract_tables(self, filename, page_idx=1, suppress_stdout=False,
|
||||||
layout_kwargs={}):
|
layout_kwargs={}):
|
||||||
self._generate_layout(filename, layout_kwargs)
|
self._generate_layout(filename, page_idx, layout_kwargs)
|
||||||
if not suppress_stdout:
|
if not suppress_stdout:
|
||||||
logger.info("Processing {}".format(
|
logger.info("Processing {}".format(
|
||||||
os.path.basename(self.rootname)))
|
os.path.basename(self.rootname)))
|
||||||
|
|
@ -474,6 +491,8 @@ class Stream(BaseParser):
|
||||||
)
|
)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# Identify plausible areas within the doc where tables lie,
|
||||||
|
# populate table_bbox keys with these areas.
|
||||||
self._generate_table_bbox()
|
self._generate_table_bbox()
|
||||||
|
|
||||||
_tables = []
|
_tables = []
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,7 @@ class PlotMethods(object):
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Lattice flavor does not support kind='{}'".format(kind)
|
"Lattice flavor does not support kind='{}'".format(kind)
|
||||||
)
|
)
|
||||||
elif table.flavor == "stream" and kind in ["joint", "line"]:
|
elif table.flavor == "stream" and kind in ["line"]:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Stream flavor does not support kind='{}'".format(kind)
|
"Stream flavor does not support kind='{}'".format(kind)
|
||||||
)
|
)
|
||||||
|
|
@ -64,9 +64,18 @@ class PlotMethods(object):
|
||||||
for t in table._text:
|
for t in table._text:
|
||||||
xs.extend([t[0], t[2]])
|
xs.extend([t[0], t[2]])
|
||||||
ys.extend([t[1], t[3]])
|
ys.extend([t[1], t[3]])
|
||||||
ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1]))
|
ax.add_patch(
|
||||||
|
patches.Rectangle(
|
||||||
|
(t[0], t[1]),
|
||||||
|
t[2] - t[0],
|
||||||
|
t[3] - t[1],
|
||||||
|
alpha=0.5
|
||||||
|
)
|
||||||
|
)
|
||||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
|
img, __ = table._image
|
||||||
|
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
def grid(self, table):
|
def grid(self, table):
|
||||||
|
|
@ -94,6 +103,9 @@ class PlotMethods(object):
|
||||||
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
|
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
|
||||||
if cell.bottom:
|
if cell.bottom:
|
||||||
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
|
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
|
||||||
|
|
||||||
|
img, __ = table._image
|
||||||
|
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
def contour(self, table):
|
def contour(self, table):
|
||||||
|
|
@ -109,12 +121,8 @@ class PlotMethods(object):
|
||||||
fig : matplotlib.fig.Figure
|
fig : matplotlib.fig.Figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
try:
|
img, table_bbox = table._image
|
||||||
img, table_bbox = table._image
|
_FOR_LATTICE = table.flavor == "lattice"
|
||||||
_FOR_LATTICE = True
|
|
||||||
except TypeError:
|
|
||||||
img, table_bbox = (None, {table._bbox: None})
|
|
||||||
_FOR_LATTICE = False
|
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(111, aspect="equal")
|
ax = fig.add_subplot(111, aspect="equal")
|
||||||
|
|
||||||
|
|
@ -132,7 +140,8 @@ class PlotMethods(object):
|
||||||
for t in table_bbox.keys():
|
for t in table_bbox.keys():
|
||||||
ax.add_patch(
|
ax.add_patch(
|
||||||
patches.Rectangle(
|
patches.Rectangle(
|
||||||
(t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red"
|
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
|
||||||
|
fill=False, color="red"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if not _FOR_LATTICE:
|
if not _FOR_LATTICE:
|
||||||
|
|
@ -143,6 +152,8 @@ class PlotMethods(object):
|
||||||
|
|
||||||
if _FOR_LATTICE:
|
if _FOR_LATTICE:
|
||||||
ax.imshow(img)
|
ax.imshow(img)
|
||||||
|
else:
|
||||||
|
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
def textedge(self, table):
|
def textedge(self, table):
|
||||||
|
|
@ -164,7 +175,11 @@ class PlotMethods(object):
|
||||||
xs.extend([t[0], t[2]])
|
xs.extend([t[0], t[2]])
|
||||||
ys.extend([t[1], t[3]])
|
ys.extend([t[1], t[3]])
|
||||||
ax.add_patch(
|
ax.add_patch(
|
||||||
patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue")
|
patches.Rectangle(
|
||||||
|
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
|
||||||
|
color="blue",
|
||||||
|
alpha=0.5
|
||||||
|
)
|
||||||
)
|
)
|
||||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
|
|
@ -172,6 +187,8 @@ class PlotMethods(object):
|
||||||
for te in table._textedges:
|
for te in table._textedges:
|
||||||
ax.plot([te.x, te.x], [te.y0, te.y1])
|
ax.plot([te.x, te.x], [te.y0, te.y1])
|
||||||
|
|
||||||
|
img, __ = table._image
|
||||||
|
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
def joint(self, table):
|
def joint(self, table):
|
||||||
|
|
@ -220,4 +237,8 @@ class PlotMethods(object):
|
||||||
ax.plot([v[0], v[2]], [v[1], v[3]])
|
ax.plot([v[0], v[2]], [v[1], v[3]])
|
||||||
for h in horizontal:
|
for h in horizontal:
|
||||||
ax.plot([h[0], h[2]], [h[1], h[3]])
|
ax.plot([h[0], h[2]], [h[1], h[3]])
|
||||||
|
|
||||||
|
img, __ = table._image
|
||||||
|
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||||
|
|
||||||
return fig
|
return fig
|
||||||
|
|
|
||||||
122
camelot/utils.py
|
|
@ -13,6 +13,7 @@ from itertools import groupby
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
from pdfminer.pdfparser import PDFParser
|
from pdfminer.pdfparser import PDFParser
|
||||||
from pdfminer.pdfdocument import PDFDocument
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
from pdfminer.pdfpage import PDFPage
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
|
@ -30,6 +31,9 @@ from pdfminer.layout import (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# pylint: disable=import-error
|
||||||
|
# PyLint will evaluate both branches, and will necessarily complain about one
|
||||||
|
# of them.
|
||||||
PY3 = sys.version_info[0] >= 3
|
PY3 = sys.version_info[0] >= 3
|
||||||
if PY3:
|
if PY3:
|
||||||
from urllib.request import urlopen
|
from urllib.request import urlopen
|
||||||
|
|
@ -310,7 +314,8 @@ def get_rotation(chars, horizontal_text, vertical_text):
|
||||||
if hlen < vlen:
|
if hlen < vlen:
|
||||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
|
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
|
||||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
|
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
|
||||||
rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise"
|
rotation = "anticlockwise" if clockwise < anticlockwise \
|
||||||
|
else "clockwise"
|
||||||
return rotation
|
return rotation
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -341,12 +346,16 @@ def segments_in_bbox(bbox, v_segments, h_segments):
|
||||||
v_s = [
|
v_s = [
|
||||||
v
|
v
|
||||||
for v in v_segments
|
for v in v_segments
|
||||||
if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2
|
if v[1] > lb[1] - 2 and
|
||||||
|
v[3] < rt[1] + 2 and
|
||||||
|
lb[0] - 2 <= v[0] <= rt[0] + 2
|
||||||
]
|
]
|
||||||
h_s = [
|
h_s = [
|
||||||
h
|
h
|
||||||
for h in h_segments
|
for h in h_segments
|
||||||
if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2
|
if h[0] > lb[0] - 2 and
|
||||||
|
h[2] < rt[0] + 2 and
|
||||||
|
lb[1] - 2 <= h[1] <= rt[1] + 2
|
||||||
]
|
]
|
||||||
return v_s, h_s
|
return v_s, h_s
|
||||||
|
|
||||||
|
|
@ -464,10 +473,10 @@ def flag_font_size(textline, direction, strip_text=""):
|
||||||
for t in textline
|
for t in textline
|
||||||
if not isinstance(t, LTAnno)
|
if not isinstance(t, LTAnno)
|
||||||
]
|
]
|
||||||
l = [np.round(size, decimals=6) for text, size in d]
|
text_sizes = [np.round(size, decimals=6) for text, size in d]
|
||||||
if len(set(l)) > 1:
|
if len(set(text_sizes)) > 1:
|
||||||
flist = []
|
flist = []
|
||||||
min_size = min(l)
|
min_size = min(text_sizes)
|
||||||
for key, chars in groupby(d, itemgetter(1)):
|
for key, chars in groupby(d, itemgetter(1)):
|
||||||
if key == min_size:
|
if key == min_size:
|
||||||
fchars = [t[0] for t in chars]
|
fchars = [t[0] for t in chars]
|
||||||
|
|
@ -511,7 +520,6 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
|
||||||
of row/column and text is the an lttextline substring.
|
of row/column and text is the an lttextline substring.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
idx = 0
|
|
||||||
cut_text = []
|
cut_text = []
|
||||||
bbox = textline.bbox
|
bbox = textline.bbox
|
||||||
try:
|
try:
|
||||||
|
|
@ -528,7 +536,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
|
||||||
]
|
]
|
||||||
r = r_idx[0]
|
r = r_idx[0]
|
||||||
x_cuts = [
|
x_cuts = [
|
||||||
(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
|
(c, table.cells[r][c].x2)
|
||||||
|
for c in x_overlap
|
||||||
|
if table.cells[r][c].right
|
||||||
]
|
]
|
||||||
if not x_cuts:
|
if not x_cuts:
|
||||||
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
|
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
|
||||||
|
|
@ -561,7 +571,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
|
||||||
]
|
]
|
||||||
c = c_idx[0]
|
c = c_idx[0]
|
||||||
y_cuts = [
|
y_cuts = [
|
||||||
(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
|
(r, table.cells[r][c].y1)
|
||||||
|
for r in y_overlap
|
||||||
|
if table.cells[r][c].bottom
|
||||||
]
|
]
|
||||||
if not y_cuts:
|
if not y_cuts:
|
||||||
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
|
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
|
||||||
|
|
@ -644,9 +656,8 @@ def get_table_index(
|
||||||
"""
|
"""
|
||||||
r_idx, c_idx = [-1] * 2
|
r_idx, c_idx = [-1] * 2
|
||||||
for r in range(len(table.rows)):
|
for r in range(len(table.rows)):
|
||||||
if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[
|
if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and \
|
||||||
r
|
(t.y0 + t.y1) / 2.0 > table.rows[r][1]:
|
||||||
][1]:
|
|
||||||
lt_col_overlap = []
|
lt_col_overlap = []
|
||||||
for c in table.cols:
|
for c in table.cols:
|
||||||
if c[0] <= t.x1 and c[1] >= t.x0:
|
if c[0] <= t.x1 and c[1] >= t.x0:
|
||||||
|
|
@ -681,7 +692,9 @@ def get_table_index(
|
||||||
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
|
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
|
||||||
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
|
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
|
||||||
charea = X * Y
|
charea = X * Y
|
||||||
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
|
error = (
|
||||||
|
(X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))
|
||||||
|
) / charea
|
||||||
|
|
||||||
if split_text:
|
if split_text:
|
||||||
return (
|
return (
|
||||||
|
|
@ -697,13 +710,16 @@ def get_table_index(
|
||||||
(
|
(
|
||||||
r_idx,
|
r_idx,
|
||||||
c_idx,
|
c_idx,
|
||||||
flag_font_size(t._objs, direction, strip_text=strip_text),
|
flag_font_size(t._objs,
|
||||||
|
direction,
|
||||||
|
strip_text=strip_text),
|
||||||
)
|
)
|
||||||
],
|
],
|
||||||
error,
|
error,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
|
return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], \
|
||||||
|
error
|
||||||
|
|
||||||
|
|
||||||
def compute_accuracy(error_weights):
|
def compute_accuracy(error_weights):
|
||||||
|
|
@ -751,7 +767,6 @@ def compute_whitespace(d):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
whitespace = 0
|
whitespace = 0
|
||||||
r_nempty_cells, c_nempty_cells = [], []
|
|
||||||
for i in d:
|
for i in d:
|
||||||
for j in i:
|
for j in i:
|
||||||
if j.strip() == "":
|
if j.strip() == "":
|
||||||
|
|
@ -852,3 +867,78 @@ def get_text_objects(layout, ltype="char", t=None):
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
pass
|
pass
|
||||||
return t
|
return t
|
||||||
|
|
||||||
|
|
||||||
|
def compare_tables(left, right):
    """Compare two tables and display differences in a human readable form.

    This is a debugging helper: the report is printed to stdout and nothing
    is returned. A summary of the shape difference is printed first. Then,
    if the tables have the same number of rows, they are compared column by
    column; otherwise, if they have the same number of columns, they are
    compared row by row. Only the first difference found is reported.
    Columns and rows are matched by position, not by label.

    Parameters
    ----------
    left : pandas.DataFrame
    right : pandas.DataFrame

    """
    diff_cols = right.shape[1] - left.shape[1]
    diff_rows = right.shape[0] - left.shape[0]
    differences = []
    if diff_rows:
        differences.append(
            f"{abs(diff_rows)} "
            f"{'more' if diff_rows > 0 else 'fewer'} rows"
        )
    if diff_cols:
        differences.append(
            f"{abs(diff_cols)} "
            f"{'more' if diff_cols > 0 else 'fewer'} columns"
        )
    if differences:
        differences_str = " and ".join(differences)
        print(f"Right has {differences_str} than left "
              f"[{right.shape[0]},{right.shape[1]}] vs "
              f"[{left.shape[0]},{left.shape[1]}]")

    table1, table2 = left, right
    name_table1, name_table2 = "left", "right"
    if not diff_rows:
        # Same number of rows: compare columns since they're of the same length
        if diff_cols > 0:
            # Use the longest table as a reference
            table1, table2 = table2, table1
            name_table1, name_table2 = name_table2, name_table1
        for i in range(table1.shape[1]):
            lcol = table1.iloc[:, i]
            if i >= table2.shape[1]:
                print(f"Column {i} unique to {name_table1}: {lcol}")
                break
            # Drop the row labels so the comparison is purely positional and
            # the element-wise `==` below cannot fail on mismatched indexes.
            lcol = lcol.reset_index(drop=True)
            scol = table2.iloc[:, i].reset_index(drop=True)
            if not lcol.equals(scol):
                diff_df = pd.DataFrame({
                    name_table1: lcol,
                    name_table2: scol,
                    "Match": lcol == scol,
                })
                print(
                    f"Column {i} different:\n"
                    f"{diff_df}"
                )
                break
    elif not diff_cols:
        # Same number of cols: compare rows since they're of the same length
        if diff_rows > 0:
            # Use the longest table as a reference
            table1, table2 = table2, table1
            name_table1, name_table2 = name_table2, name_table1
        for i in range(table1.shape[0]):
            lrow = table1.iloc[i]
            if i >= table2.shape[0]:
                print(f"Row {i} unique to {name_table1}: {lrow}")
                break
            srow = table2.iloc[i]
            # Compare by position: ignore the column labels carried by
            # each row Series.
            if not lrow.reset_index(drop=True).equals(
                    srow.reset_index(drop=True)):
                diff_df = pd.DataFrame([lrow.tolist(), srow.tolist()])
                diff_df.insert(0, "Table", [name_table1, name_table2])
                print(f"Row {i} differs:")
                print(diff_df.values)
                break
|
||||||
|
|
|
||||||
|
|
@ -838,7 +838,7 @@ data_stream_two_tables_1 = [
|
||||||
"2,330 .9",
|
"2,330 .9",
|
||||||
],
|
],
|
||||||
[
|
[
|
||||||
"Violent crime . . . . . . . .\n . .\n . .\n . .\n" \
|
"Violent crime . . . . . . . .\n . .\n . .\n . .\n"
|
||||||
" . .\n . .",
|
" . .\n . .",
|
||||||
"467 .9",
|
"467 .9",
|
||||||
"69 .1",
|
"69 .1",
|
||||||
|
|
@ -1503,15 +1503,8 @@ data_stream_table_areas = [
|
||||||
]
|
]
|
||||||
|
|
||||||
data_stream_columns = [
|
data_stream_columns = [
|
||||||
[
|
["Clave \nEntidad", "Nombre Entidad", "Clave \nMunicipio",
|
||||||
"Clave",
|
"Nombre Municipio", "Clave \nLocalidad", "Nombre Localidad"],
|
||||||
"Nombre Entidad",
|
|
||||||
"Clave",
|
|
||||||
"Nombre Municipio",
|
|
||||||
"Clave",
|
|
||||||
"Nombre Localidad",
|
|
||||||
],
|
|
||||||
["Entidad", "", "Municipio", "", "Localidad", ""],
|
|
||||||
["01", "Aguascalientes", "001", "Aguascalientes", "0094", "Granja Adelita"],
|
["01", "Aguascalientes", "001", "Aguascalientes", "0094", "Granja Adelita"],
|
||||||
["01", "Aguascalientes", "001", "Aguascalientes", "0096", "Agua Azul"],
|
["01", "Aguascalientes", "001", "Aguascalientes", "0096", "Agua Azul"],
|
||||||
["01", "Aguascalientes", "001", "Aguascalientes", "0100", "Rancho Alegre"],
|
["01", "Aguascalientes", "001", "Aguascalientes", "0100", "Rancho Alegre"],
|
||||||
|
|
@ -2732,11 +2725,9 @@ data_stream_vertical_headers = [
|
||||||
['', '', '', '', '', '', '', '', '', '', '', 'Congress-',
|
['', '', '', '', '', '', '', '', '', '', '', 'Congress-',
|
||||||
'Senator 36th', 'Rep106th', '', 'Reg. of', '', 'Road', '', '',
|
'Senator 36th', 'Rep106th', '', 'Reg. of', '', 'Road', '', '',
|
||||||
'Distri', 'Dist', '', '', 'Dist'],
|
'Distri', 'Dist', '', '', 'Dist'],
|
||||||
['', '', '', '', '', '', '', '', '', '', '1st Dist', '', 'Dist.',
|
['', '', '', '', '', 'Governor', '', '', 'U.S. Senator', '',
|
||||||
'Dist.', '', 'Deeds', '', 'Commission', '', 'District #1',
|
'1st Dist', '', 'Dist.', 'Dist.', '', 'Deeds', '', 'Commission',
|
||||||
'ct #2', '#3', 'Dist #4', '', '#5'],
|
'', 'District #1', 'ct #2', '#3', 'Dist #4', '', '#5'],
|
||||||
['', '', '', '', '', 'Governor', '', '', 'U.S. Senator', '', '',
|
|
||||||
'', '', '', '', '', '', '', '', '', '', '', '', '', ''],
|
|
||||||
['', 'Number of Registered voters', 'Poll Book Totals',
|
['', 'Number of Registered voters', 'Poll Book Totals',
|
||||||
'Brian Calley', 'Patrick Colbeck', 'Jim Hines', 'Bill Schuette',
|
'Brian Calley', 'Patrick Colbeck', 'Jim Hines', 'Bill Schuette',
|
||||||
'John James', 'Sandy Pensler', '', 'Jack Bergman', '',
|
'John James', 'Sandy Pensler', '', 'Jack Bergman', '',
|
||||||
|
|
|
||||||
|
Before Width: | Height: | Size: 8.2 KiB After Width: | Height: | Size: 48 KiB |
|
Before Width: | Height: | Size: 6.7 KiB After Width: | Height: | Size: 47 KiB |
|
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 16 KiB |
|
After Width: | Height: | Size: 48 KiB |
|
Before Width: | Height: | Size: 8.9 KiB After Width: | Height: | Size: 71 KiB |
|
Before Width: | Height: | Size: 17 KiB After Width: | Height: | Size: 113 KiB |
|
|
@ -9,10 +9,12 @@ from pandas.testing import assert_frame_equal
|
||||||
|
|
||||||
import camelot
|
import camelot
|
||||||
from camelot.core import Table, TableList
|
from camelot.core import Table, TableList
|
||||||
|
from camelot.utils import compare_tables
|
||||||
from camelot.__version__ import generate_version
|
from camelot.__version__ import generate_version
|
||||||
|
|
||||||
from .data import *
|
from .data import *
|
||||||
|
|
||||||
|
|
||||||
import pdfminer
|
import pdfminer
|
||||||
|
|
||||||
# The version of PDFMiner has an impact on some of the tests. Unfortunately,
|
# The version of PDFMiner has an impact on some of the tests. Unfortunately,
|
||||||
|
|
@ -48,9 +50,11 @@ def test_password():
|
||||||
|
|
||||||
filename = os.path.join(testdir, "health_protected.pdf")
|
filename = os.path.join(testdir, "health_protected.pdf")
|
||||||
tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
|
tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
|
||||||
|
assert len(tables) == 1
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
|
tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
|
||||||
|
assert len(tables) == 1
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -59,6 +63,7 @@ def test_stream():
|
||||||
|
|
||||||
filename = os.path.join(testdir, "health.pdf")
|
filename = os.path.join(testdir, "health.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="stream")
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
|
assert len(tables) == 1
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -79,6 +84,7 @@ def test_stream_table_rotated():
|
||||||
|
|
||||||
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="stream")
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
|
assert len(tables) == 1
|
||||||
result_without_first_row = pd.DataFrame(
|
result_without_first_row = pd.DataFrame(
|
||||||
tables[0].df.drop(tables[0].df.columns[0], axis=1).values)
|
tables[0].df.drop(tables[0].df.columns[0], axis=1).values)
|
||||||
assert_frame_equal(df, result_without_first_row)
|
assert_frame_equal(df, result_without_first_row)
|
||||||
|
|
@ -275,9 +281,9 @@ def test_repr():
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert (
|
assert \
|
||||||
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
repr(tables[0].cells[0][0]) == \
|
||||||
)
|
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
|
|
||||||
|
|
||||||
def test_pages():
|
def test_pages():
|
||||||
|
|
@ -285,22 +291,23 @@ def test_pages():
|
||||||
tables = camelot.read_pdf(url)
|
tables = camelot.read_pdf(url)
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert (
|
assert \
|
||||||
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
repr(tables[0].cells[0][0]) == \
|
||||||
)
|
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
|
|
||||||
tables = camelot.read_pdf(url, pages="1-end")
|
tables = camelot.read_pdf(url, pages="1-end")
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert (
|
assert \
|
||||||
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
repr(tables[0].cells[0][0]) == \
|
||||||
)
|
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
|
|
||||||
tables = camelot.read_pdf(url, pages="all")
|
tables = camelot.read_pdf(url, pages="all")
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert (
|
assert (
|
||||||
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
repr(tables[0].cells[0][0]) ==
|
||||||
|
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -310,7 +317,8 @@ def test_url():
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert (
|
assert (
|
||||||
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
repr(tables[0].cells[0][0]) ==
|
||||||
|
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,13 @@ def test_grid_plot():
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], kind='grid')
|
return camelot.plot(tables[0], kind='grid')
|
||||||
|
|
||||||
|
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True)
def test_stream_grid_plot():
    """Plot the grid of the first stream-flavor table found in foo.pdf."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor="stream")[0]
    return camelot.plot(first_table, kind='grid')
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(
|
||||||
baseline_dir="files/baseline_plots", remove_text=True)
|
baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
|
|
|
||||||