Refactor base classes and improve plotting
Move common code to a base class to reduce duplication. Stream plots display the PDF background for better context.
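As a rough usage sketch of the behaviour this PR targets (using the existing public `camelot.read_pdf` / `camelot.plot` API; the file name is only a placeholder):

```python
import camelot

# Parse a page with the Stream flavor (placeholder file name).
tables = camelot.read_pdf("example.pdf", flavor="stream", pages="1")

# Plot kinds such as "grid", "contour" and "textedge" now draw their
# overlays on top of a rendered image of the PDF page instead of a
# blank canvas, which makes the detected areas easier to interpret.
fig = camelot.plot(tables[0], kind="grid")
fig.show()
```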
@@ -10,6 +10,15 @@ from operator import itemgetter
 import numpy as np
 import pandas as pd

+from cv2 import cv2
+
+from .utils import (
+    build_file_path_in_temp_dir,
+    compute_accuracy,
+    compute_whitespace,
+    export_pdf_as_png
+)


 # minimum number of vertical textline intersections for a textedge
 # to be considered valid
@@ -159,7 +168,10 @@ class TextEdges(object):
         # get vertical textedges that intersect maximum number of
         # times with horizontal textlines
         relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
-        return self._textedges[relevant_align]
+        return list(filter(
+            lambda te: te.is_valid,
+            self._textedges[relevant_align])
+        )

     def get_table_areas(self, textlines, relevant_textedges):
         """Returns a dict of interesting table areas on the PDF page
@@ -179,7 +191,6 @@ class TextEdges(object):

         table_areas = {}
         for te in relevant_textedges:
-            if te.is_valid:
             if not table_areas:
                 table_areas[(te.x, te.y0, te.x, te.y1)] = None
             else:
@@ -225,7 +236,8 @@ class TextEdges(object):
                         max(found[3], tl.y1),
                     )
                     table_areas[updated_area] = None
-        average_textline_height = sum_textline_height / float(len(textlines))
+        average_textline_height = sum_textline_height / \
+            float(len(textlines))

         # add some padding to table areas
         table_areas_padded = {}
@@ -339,6 +351,8 @@ class Table(object):
         Accuracy with which text was assigned to the cell.
     whitespace : float
         Percentage of whitespace in the table.
+    filename : str
+        Path of the original PDF
     order : int
         Table number on PDF page.
     page : int
@@ -356,8 +370,15 @@ class Table(object):
         self.shape = (0, 0)
         self.accuracy = 0
         self.whitespace = 0
+        self.filename = None
         self.order = None
         self.page = None
+        self.flavor = None  # Flavor of the parser that generated the table
+        self.pdf_size = None  # Dimensions of the original PDF page
+        self.debug_info = None  # Field holding debug data
+
+        self._image = None
+        self._image_path = None  # Temporary file to hold an image of the pdf

     def __repr__(self):
         return "<{} shape={}>".format(self.__class__.__name__, self.shape)
@@ -392,6 +413,32 @@ class Table(object):
         }
         return report

+    def record_metadata(self, parser):
+        """Record data about the origin of the table
+        """
+        self.flavor = parser.id
+        self.filename = parser.filename
+        self.debug_info = parser.debug_info
+        data = self.data
+        self.df = pd.DataFrame(data)
+        self.shape = self.df.shape
+
+        self.whitespace = compute_whitespace(data)
+        self.pdf_size = (parser.pdf_width, parser.pdf_height)
+
+    def get_pdf_image(self):
+        """Compute pdf image and cache it
+        """
+        if self._image is None:
+            if self._image_path is None:
+                self._image_path = build_file_path_in_temp_dir(
+                    os.path.basename(self.filename),
+                    ".png"
+                )
+                export_pdf_as_png(self.filename, self._image_path)
+            self._image = cv2.imread(self._image_path)
+        return self._image
+
     def set_all_edges(self):
         """Sets all table edges to True.
         """
@@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
 from .core import TableList
 from .parsers import Stream, Lattice
 from .utils import (
-    TemporaryDirectory,
+    build_file_path_in_temp_dir,
     get_page_layout,
     get_text_objects,
     get_rotation,
@@ -16,6 +16,11 @@ from .utils import (
     download_url,
 )

+PARSERS = {
+    "lattice": Lattice,
+    "stream": Stream
+}
+

 class PDFHandler(object):
     """Handles all operations like temp directory creation, splitting
@@ -89,31 +94,47 @@ class PDFHandler(object):
                 P.extend(range(p["start"], p["end"] + 1))
             return sorted(set(P))

-    def _save_page(self, filepath, page, temp):
-        """Saves specified page from PDF into a temporary directory.
+    def _read_pdf_page(self, page=1, layout_kwargs=None):
+        """Saves specified page from PDF into a temporary directory. Removes
+        password protection and normalizes rotation.

         Parameters
         ----------
-        filepath : str
-            Filepath or URL of the PDF file.
         page : int
             Page number.
-        temp : str
-            Tmp directory.
+        layout_kwargs : dict, optional (default: {})
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.  # noqa
+
+        Returns
+        -------
+        layout : object
+
+        dimensions : tuple
+            The dimensions of the pdf page
+
+        filepath : str
+            The path of the single page PDF - either the original, or a
+            normalized version.

         """
-        with open(filepath, "rb") as fileobj:
+        layout_kwargs = layout_kwargs or {}
+        with open(self.filepath, "rb") as fileobj:
+            # Normalize the pdf file, but skip if it's not encrypted or has
+            # only one page.
             infile = PdfFileReader(fileobj, strict=False)
             if infile.isEncrypted:
                 infile.decrypt(self.password)
-            fpath = os.path.join(temp, "page-{0}.pdf".format(page))
+            fpath = build_file_path_in_temp_dir(
+                "page-{page}.pdf".format(page=page))
             froot, fext = os.path.splitext(fpath)
             p = infile.getPage(page - 1)
             outfile = PdfFileWriter()
             outfile.addPage(p)
             with open(fpath, "wb") as f:
                 outfile.write(f)
-            layout, __ = get_page_layout(fpath)
+            layout, dimensions = get_page_layout(
+                fpath, **layout_kwargs)
             # fix rotated PDF
             chars = get_text_objects(layout, ltype="char")
             horizontal_text = get_text_objects(layout, ltype="horizontal_text")
@@ -121,12 +142,7 @@ class PDFHandler(object):
             rotation = get_rotation(chars, horizontal_text, vertical_text)
             if rotation != "":
                 fpath_new = "".join(
-                    [
-                        froot.replace("page", "p"),
-                        "_rotated",
-                        fext
-                    ]
-                )
+                    [froot.replace("page", "p"), "_rotated", fext])
                 os.rename(fpath, fpath_new)
                 infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
                 if infile.isEncrypted:
@@ -140,10 +156,13 @@ class PDFHandler(object):
                 outfile.addPage(p)
                 with open(fpath, "wb") as f:
                     outfile.write(f)
+                layout, dimensions = get_page_layout(
+                    fpath, **layout_kwargs)
+        return layout, dimensions, fpath

     def parse(
-        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None,
-        **kwargs
+        self, flavor="lattice", suppress_stdout=False,
+        layout_kwargs=None, **kwargs
     ):
         """Extracts tables by calling parser.get_tables on all single
         page PDFs.
@@ -168,19 +187,22 @@ class PDFHandler(object):
         """
         layout_kwargs = layout_kwargs or {}
         tables = []
-        with TemporaryDirectory() as tempdir:
-            for p in self.pages:
-                self._save_page(self.filepath, p, tempdir)
-            pages = [
-                os.path.join(tempdir, "page-{0}.pdf".format(p))
-                for p in self.pages
-            ]
-            parser = Lattice(**kwargs) \
-                if flavor == "lattice" else Stream(**kwargs)
-            for p in pages:
-                t = parser.extract_tables(
-                    p, suppress_stdout=suppress_stdout,
-                    layout_kwargs=layout_kwargs
-                )
-                tables.extend(t)
+        parser_obj = PARSERS[flavor]
+        parser = parser_obj(**kwargs)
+
+        # Read the layouts/dimensions of each of the pages we need to
+        # parse. This might require creating a temporary .pdf.
+        for page_idx in self.pages:
+            layout, dimensions, source_file = self._read_pdf_page(
+                page_idx,
+                layout_kwargs=layout_kwargs
+            )
+            parser._generate_layout(source_file, layout, dimensions,
+                                    page_idx, layout_kwargs)
+            t = parser.extract_tables(
+                source_file,
+                suppress_stdout=suppress_stdout
+            )
+            tables.extend(t)
         return TableList(sorted(tables))
@@ -2,20 +2,28 @@

 import os

-from ..utils import get_page_layout, get_text_objects
+from ..utils import (
+    get_text_objects
+)
+from ..core import Table


 class BaseParser(object):
     """Defines a base parser.
     """
+    def __init__(self, parser_id):
+        self.id = parser_id

-    def _generate_layout(self, filename, layout_kwargs):
+        # For plotting details of parsing algorithms
+        self.debug_info = {}
+
+    def _generate_layout(self, filename, layout, dimensions,
+                         page_idx, layout_kwargs):
         self.filename = filename
         self.layout_kwargs = layout_kwargs
-        self.layout, self.dimensions = get_page_layout(
-            filename,
-            **layout_kwargs
-        )
+        self.layout = layout
+        self.dimensions = dimensions
+        self.page = page_idx
         self.images = get_text_objects(self.layout, ltype="image")
         self.horizontal_text = get_text_objects(
             self.layout,
@@ -27,3 +35,25 @@ class BaseParser(object):
         )
         self.pdf_width, self.pdf_height = self.dimensions
         self.rootname, __ = os.path.splitext(self.filename)
+
+    """Initialize new table object, ready to be populated
+
+    Parameters
+    ----------
+    table_idx : int
+        Index of this table within the pdf page analyzed
+    cols : list
+        list of coordinate boundaries tuples (left, right)
+    rows : list
+        list of coordinate boundaries tuples (bottom, top)
+
+    Returns
+    -------
+    table : camelot.core.Table
+
+    """
+    def _initialize_new_table(self, table_idx, cols, rows):
+        table = Table(cols, rows)
+        table.page = self.page
+        table.order = table_idx + 1
+        return table
@@ -2,15 +2,20 @@

 from __future__ import division
 import os
+import sys
 import copy
+import locale
 import logging
 import warnings
+import subprocess

+import numpy as np
 import pandas as pd

 from .base import BaseParser
-from ..core import Table
 from ..utils import (
+    build_file_path_in_temp_dir,
+    export_pdf_as_png,
     scale_image,
     scale_pdf,
     segments_in_bbox,
@@ -18,7 +23,6 @@ from ..utils import (
     merge_close_lines,
     get_table_index,
     compute_accuracy,
-    compute_whitespace,
 )
 from ..image_processing import (
     adaptive_threshold,
@@ -110,13 +114,13 @@ class Lattice(BaseParser):
         resolution=300,
         **kwargs
     ):
-        shift_text = shift_text or ["l", "t"]
+        super().__init__("lattice")
         self.table_regions = table_regions
         self.table_areas = table_areas
         self.process_background = process_background
         self.line_scale = line_scale
         self.copy_text = copy_text
-        self.shift_text = shift_text
+        self.shift_text = shift_text or ["l", "t"]
         self.split_text = split_text
         self.flag_size = flag_size
         self.strip_text = strip_text
@@ -126,6 +130,8 @@ class Lattice(BaseParser):
         self.threshold_constant = threshold_constant
         self.iterations = iterations
         self.resolution = resolution
+        self.image_path = None
+        self.pdf_image = None

     @staticmethod
     def _reduce_index(t, idx, shift_text):
@@ -205,18 +211,6 @@ class Lattice(BaseParser):
                         t.cells[i][j].text = t.cells[i - 1][j].text
         return t

-    def _generate_image(self):
-        from ..ext.ghostscript import Ghostscript
-
-        self.imagename = "".join([self.rootname, ".png"])
-        gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
-            self.imagename, self.filename
-        )
-        gs_call = gs_call.encode().split()
-        null = open(os.devnull, "wb")
-        Ghostscript(*gs_call, stdout=null)
-        null.close()
-
     def _generate_table_bbox(self):
         def scale_areas(areas):
             scaled_areas = []
@@ -230,15 +224,20 @@ class Lattice(BaseParser):
                 scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
             return scaled_areas

-        self.image, self.threshold = adaptive_threshold(
-            self.imagename,
+        self.image_path = build_file_path_in_temp_dir(
+            os.path.basename(self.filename),
+            ".png"
+        )
+        export_pdf_as_png(self.filename, self.image_path)
+        self.pdf_image, self.threshold = adaptive_threshold(
+            self.image_path,
             process_background=self.process_background,
             blocksize=self.threshold_blocksize,
             c=self.threshold_constant,
         )

-        image_width = self.image.shape[1]
-        image_height = self.image.shape[0]
+        image_width = self.pdf_image.shape[1]
+        image_height = self.pdf_image.shape[0]
         image_width_scaler = image_width / float(self.pdf_width)
         image_height_scaler = image_height / float(self.pdf_height)
         pdf_width_scaler = self.pdf_width / float(image_width)
@@ -332,7 +331,7 @@ class Lattice(BaseParser):
         if v_s is None or h_s is None:
             raise ValueError("No segments found on {}".format(self.rootname))

-        table = Table(cols, rows)
+        table = self._initialize_new_table(table_idx, cols, rows)
         # set table edges to True using ver+hor lines
         table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
         # set table border edges to True
@@ -360,6 +359,7 @@ class Lattice(BaseParser):
                 )
                 for r_idx, c_idx, text in indices:
                     table.cells[r_idx][c_idx].text = text
+        # FRHTODO
         accuracy = compute_accuracy([[100, pos_errors]])

         if self.copy_text is not None:
@@ -368,39 +368,27 @@ class Lattice(BaseParser):
                 copy_text=self.copy_text
             )

-        data = table.data
-        table.df = pd.DataFrame(data)
-        table.shape = table.df.shape
-
-        whitespace = compute_whitespace(data)
-        table.flavor = "lattice"
+        table.record_metadata(self)
         table.accuracy = accuracy
-        table.whitespace = whitespace
-        table.order = table_idx + 1
-        table.page = int(os.path.basename(self.rootname).replace("page-", ""))

         # for plotting
         _text = []
         _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
         _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
         table._text = _text
-        table._image = (self.image, self.table_bbox_unscaled)
+        table._image = self.pdf_image  # Reuse the image used for calc
+        table._bbox_unscaled = self.table_bbox_unscaled
         table._segments = (self.vertical_segments, self.horizontal_segments)
         table._textedges = None

         return table

-    def extract_tables(
-        self,
-        filename,
-        suppress_stdout=False,
-        layout_kwargs=None
-    ):
-        layout_kwargs = layout_kwargs or {}
-        self._generate_layout(filename, layout_kwargs)
+    def extract_tables(self, filename, suppress_stdout=False):
+        # FRHTODO: move extract table core to the base class
         rootname = os.path.basename(self.rootname)
         if not suppress_stdout:
-            logger.info("Processing {rootname}".format(rootname=rootname))
+            logger.info(
+                "Processing {rootname}".format(rootname=rootname))

         if not self.horizontal_text:
             if self.images:
@@ -415,7 +403,6 @@ class Lattice(BaseParser):
             )
             return []

-        self._generate_image()
         self._generate_table_bbox()

         _tables = []
@@ -9,7 +9,7 @@ import numpy as np
 import pandas as pd

 from .base import BaseParser
-from ..core import TextEdges, Table
+from ..core import TextEdges
 from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
                      compute_whitespace)

@@ -69,11 +69,9 @@ class Stream(BaseParser):
         column_tol=0,
         **kwargs
     ):
+        super().__init__("stream")
         self.table_regions = table_regions
         self.table_areas = table_areas
-        self.table_bbox = None
-        self.t_bbox = None
-        self.textedges = []
         self.columns = columns
         self._validate_columns()
         self.split_text = split_text
@@ -191,7 +189,8 @@ class Stream(BaseParser):

     @staticmethod
     def _join_rows(rows_grouped, text_y_max, text_y_min):
-        """Makes row coordinates continuous.
+        """Makes row coordinates continuous. For the row to "touch"
+        we split the existing gap between them in half.

         Parameters
         ----------
@@ -206,18 +205,20 @@ class Stream(BaseParser):
             List of continuous row y-coordinate tuples.

         """
-        row_mids = [
-            sum((t.y0 + t.y1) / 2 for t in r) / len(r) if len(r) > 0 else 0
+        row_boundaries = [
+            [
+                max(t.y1 for t in r),
+                min(t.y0 for t in r)
+            ]
             for r in rows_grouped
         ]
-        rows = [
-            (row_mids[i] + row_mids[i - 1]) / 2
-            for i in range(1, len(row_mids))
-        ]
-        rows.insert(0, text_y_max)
-        rows.append(text_y_min)
-        rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
-        return rows
+        for i in range(0, len(row_boundaries)-1):
+            top_row = row_boundaries[i]
+            bottom_row = row_boundaries[i+1]
+            top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
+        row_boundaries[0][0] = text_y_max
+        row_boundaries[-1][1] = text_y_min
+        return row_boundaries

     @staticmethod
     def _add_columns(cols, text, row_tol):
@@ -414,7 +415,7 @@ class Stream(BaseParser):
         return cols, rows

     def _generate_table(self, table_idx, cols, rows, **kwargs):
-        table = Table(cols, rows)
+        table = self._initialize_new_table(table_idx, cols, rows)
         table = table.set_all_edges()

         pos_errors = []
@@ -436,32 +437,22 @@ class Stream(BaseParser):
                 table.cells[r_idx][c_idx].text = text
         accuracy = compute_accuracy([[100, pos_errors]])

-        data = table.data
-        table.df = pd.DataFrame(data)
-        table.shape = table.df.shape
-
-        whitespace = compute_whitespace(data)
-        table.flavor = "stream"
+        table.record_metadata(self)
         table.accuracy = accuracy
-        table.whitespace = whitespace
-        table.order = table_idx + 1
-        table.page = int(os.path.basename(self.rootname).replace("page-", ""))

         # for plotting
         _text = []
         _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
         _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
         table._text = _text
-        table._image = None
+        table._bbox = self.table_bbox
         table._segments = None
         table._textedges = self.textedges

         return table

-    def extract_tables(self, filename, suppress_stdout=False,
-                       layout_kwargs=None):
-        layout_kwargs = layout_kwargs or {}
-        self._generate_layout(filename, layout_kwargs)
+    def extract_tables(self, filename, suppress_stdout=False):
         if not suppress_stdout:
             logger.info("Processing {}".format(
                 os.path.basename(self.rootname)))
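For clarity, a small standalone sketch of the `_join_rows` boundary logic above (the coordinates are hypothetical, not taken from the PR): each detected row keeps its own top/bottom extent, and the vertical gap between adjacent rows is split in half so the boundaries touch:

```python
# Hypothetical (top, bottom) y-extents of three detected row groups,
# listed top to bottom in PDF coordinates, with gaps between them.
row_boundaries = [[700, 680], [670, 650], [630, 610]]

# Split each gap in half so adjacent rows share a boundary.
for i in range(len(row_boundaries) - 1):
    top_row, bottom_row = row_boundaries[i], row_boundaries[i + 1]
    top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2

print(row_boundaries)  # [[700, 675.0], [675.0, 640.0], [640.0, 610]]
```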
@@ -68,11 +68,14 @@ class PlotMethods(object):
                 patches.Rectangle(
                     (t[0], t[1]),
                     t[2] - t[0],
-                    t[3] - t[1]
+                    t[3] - t[1],
+                    alpha=0.5
                 )
             )
         ax.set_xlim(min(xs) - 10, max(xs) + 10)
         ax.set_ylim(min(ys) - 10, max(ys) + 10)
+        img = table.get_pdf_image()
+        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
         return fig

     def grid(self, table):
@@ -100,6 +103,9 @@ class PlotMethods(object):
                     ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
                 if cell.bottom:
                     ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
+
+        img = table.get_pdf_image()
+        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
         return fig

     def contour(self, table):
@@ -115,12 +121,13 @@ class PlotMethods(object):
         fig : matplotlib.fig.Figure

         """
-        try:
-            img, table_bbox = table._image
-            _FOR_LATTICE = True
-        except TypeError:
-            img, table_bbox = (None, {table._bbox: None})
-            _FOR_LATTICE = False
+        img = table.get_pdf_image()
+        _FOR_LATTICE = table.flavor == "lattice"
+        if _FOR_LATTICE:
+            table_bbox = table._bbox_unscaled
+        else:
+            table_bbox = {table._bbox: None}
         fig = plt.figure()
         ax = fig.add_subplot(111, aspect="equal")
@@ -150,6 +157,8 @@ class PlotMethods(object):

         if _FOR_LATTICE:
             ax.imshow(img)
+        else:
+            ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
         return fig

     def textedge(self, table):
@@ -173,7 +182,8 @@ class PlotMethods(object):
             ax.add_patch(
                 patches.Rectangle(
                     (t[0], t[1]), t[2] - t[0], t[3] - t[1],
-                    color="blue"
+                    color="blue",
+                    alpha=0.5
                 )
             )
         ax.set_xlim(min(xs) - 10, max(xs) + 10)
@@ -182,6 +192,8 @@ class PlotMethods(object):
         for te in table._textedges:
             ax.plot([te.x, te.x], [te.y0, te.y1])

+        img = table.get_pdf_image()
+        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
         return fig

     def joint(self, table):
@@ -197,7 +209,8 @@ class PlotMethods(object):
         fig : matplotlib.fig.Figure

         """
-        img, table_bbox = table._image
+        img = table.get_pdf_image()
+        table_bbox = table._bbox_unscaled
         fig = plt.figure()
         ax = fig.add_subplot(111, aspect="equal")
         x_coord = []
@@ -230,4 +243,7 @@ class PlotMethods(object):
             ax.plot([v[0], v[2]], [v[1], v[3]])
         for h in horizontal:
             ax.plot([h[0], h[2]], [h[1], h[3]])
+
+        img = table.get_pdf_image()
+        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
         return fig
camelot/utils.py (234 changed lines)

@@ -3,6 +3,7 @@ from __future__ import division

 import re
 import os
+import atexit
 import sys
 import random
 import shutil
@@ -13,6 +14,7 @@ from itertools import groupby
 from operator import itemgetter

 import numpy as np
+import pandas as pd
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@@ -29,6 +31,7 @@ from pdfminer.layout import (
     LTImage,
 )

+from .ext.ghostscript import Ghostscript

 # pylint: disable=import-error
 # PyLint will evaluate both branches, and will necessarily complain about one
@@ -150,13 +153,40 @@ def remove_extra(kwargs, flavor="lattice"):


 # https://stackoverflow.com/a/22726782
+# and https://stackoverflow.com/questions/10965479
 class TemporaryDirectory(object):
     def __enter__(self):
         self.name = tempfile.mkdtemp()
+        # Only delete the temporary directory upon
+        # program exit.
+        atexit.register(shutil.rmtree, self.name)
         return self.name

     def __exit__(self, exc_type, exc_value, traceback):
-        shutil.rmtree(self.name)
+        pass
+
+
+def build_file_path_in_temp_dir(filename, extension=None):
+    """Generates a new path within a temporary directory
+
+    Parameters
+    ----------
+    filename : str
+    extension : str
+
+    Returns
+    -------
+    file_path_in_temporary_dir : str
+
+    """
+    with TemporaryDirectory() as temp_dir:
+        if extension:
+            filename = filename + extension
+        path = os.path.join(
+            temp_dir,
+            filename
+        )
+    return path


 def translate(x1, x2):
@@ -387,6 +417,117 @@ def text_in_bbox(bbox, text):
     return t_bbox


+def bbox_from_text(textlines):
+    """Returns the smallest bbox containing all the text objects passed as
+    a parameters.
+
+    Parameters
+    ----------
+    textlines : List of PDFMiner text objects.
+
+    Returns
+    -------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
+        space.
+
+    """
+    if len(textlines) == 0:
+        return None
+    bbox = (
+        textlines[0].x0,
+        textlines[0].y0,
+        textlines[0].x1,
+        textlines[0].y1
+    )
+
+    for tl in textlines[1:]:
+        bbox = (
+            min(bbox[0], tl.x0),
+            min(bbox[1], tl.y0),
+            max(bbox[2], tl.x1),
+            max(bbox[3], tl.y1)
+        )
+    return bbox
+
+
+def find_columns_coordinates(tls):
+    """Given a list of text objects, guess columns boundaries and returns a
+    list of x-coordinates for split points between columns.
+
+    Parameters
+    ----------
+    tls : list of PDFMiner text object.
+
+    Returns
+    -------
+    cols_anchors : list
+        List of x-coordinates for columns.
+
+    """
+    # Make a list of disjunct cols boundaries across the textlines
+    # that comprise the table.
+    # [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
+    cols_bounds = []
+    tls.sort(key=lambda tl: tl.x0)
+    for tl in tls:
+        if (not cols_bounds) or cols_bounds[-1][1] < tl.x0:
+            cols_bounds.append([tl.x0, tl.x1])
+        else:
+            cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
+
+    # From the row boundaries, identify splits by getting the mid points
+    # between the boundaries.
+    # Row boundaries: [ a ]  [b]  [  c  ]
+    # Splits:         |    |    |        |
+    cols_anchors = list(map(
+        lambda idx: (cols_bounds[idx-1][1] + cols_bounds[idx][0]) / 2.0,
+        range(1, len(cols_bounds)-1)
+    ))
+    cols_anchors.insert(0, cols_bounds[0][0])
+    cols_anchors.append(cols_bounds[-1][1])
+    return cols_anchors
+
+
+def distance_tl_to_bbox(tl, bbox):
+    """Returns a tuple corresponding to the horizontal and vertical gaps
+    between a textline and a bbox.
+
+    Parameters
+    ----------
+    tl : PDFMiner text object.
+    bbox : tuple (x0, y0, x1, y1)
+
+    Returns
+    -------
+    distance : tuple
+        Tuple (horizontal distance, vertical distance)
+
+    """
+    v_distance, h_distance = None, None
+    if tl.x1 <= bbox[0]:
+        # tl to the left
+        h_distance = bbox[0] - tl.x1
+    elif bbox[2] <= tl.x0:
+        # tl to the right
+        h_distance = tl.x0 - bbox[2]
+    else:
+        # textline overlaps vertically
+        h_distance = 0
+
+    if tl.y1 <= bbox[1]:
+        # tl below
+        v_distance = bbox[1] - tl.y1
+    elif bbox[3] <= tl.y0:
+        # tl above
+        v_distance = tl.y0 - bbox[3]
+    else:
+        # tl overlaps horizontally
+        v_distance = 0
+    return (h_distance, v_distance)
+
+
 def merge_close_lines(ar, line_tol=2):
     """Merges lines which are within a tolerance by calculating a
     moving mean, based on their x or y axis projections.
@@ -867,3 +1008,94 @@ def get_text_objects(layout, ltype="char", t=None):
         except AttributeError:
             pass
     return t
+
+
+def export_pdf_as_png(pdf_path, destination_path):
+    """Generate an image from a pdf.
+
+    Parameters
+    ----------
+    pdf_path : str
+    destination_path : str
+    """
+    gs_call = f"-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"
+    gs_call = gs_call.encode().split()
+    null = open(os.devnull, "wb")
+    Ghostscript(*gs_call, stdout=null)
+    null.close()
+
+
+def compare_tables(left, right):
+    """Compare two tables and displays differences in a human readable form.
+
+    Parameters
+    ----------
+    left : data frame
+    right : data frame
+    """
+    diff_cols = right.shape[1]-left.shape[1]
+    diff_rows = right.shape[0]-left.shape[0]
+    differences = []
+    if (diff_rows):
+        differences.append(
+            f"{abs(diff_rows)} "
+            f"{'more' if diff_rows>0 else 'fewer'} rows"
+        )
+    if (diff_cols):
+        differences.append(
+            f"{abs(diff_cols)} "
+            f"{'more' if diff_cols>0 else 'fewer'} columns"
+        )
+    if differences:
+        differences_str = " and ".join(differences)
+        print(f"Right has {differences_str} than left "
+              f"[{right.shape[0]},{right.shape[1]}] vs "
+              f"[{left.shape[0]},{left.shape[1]}]")
+
+    table1, table2 = [left, right]
+    name_table1, name_table2 = ["left", "right"]
+    if not diff_rows:
+        # Same number of rows: compare columns since they're of the same length
+        if diff_cols > 0:
+            # Use the longest table as a reference
+            table1, table2 = table2, table1
+            name_table1, name_table2 = name_table2, name_table1
+        for i, col in enumerate(table1.columns):
+            lcol = table1.iloc[:, i]
+            if col in table2:
+                scol = table2.iloc[:, i]
+                if not lcol.equals(scol):
+                    diff_df = pd.DataFrame()
+                    diff_df[name_table1] = scol
+                    diff_df[name_table2] = lcol
+                    diff_df["Match"] = lcol == scol
+                    print(
+                        f"Column {i} different:\n"
+                        f"{diff_df}"
+                    )
+                    break
+            else:
+                print("Column {i} unique to {name_table1}: {lcol}")
+                break
+    elif not diff_cols:
+        # Same number of cols: compare rows since they're of the same length
+        if diff_rows > 0:
+            # Use the longest table as a reference
+            table1, table2 = table2, table1
+            name_table1, name_table2 = name_table2, name_table1
+        for index, lrow in table1.iterrows():
+            if index < table2.shape[1]:
+                srow = table2.loc[index, :]
+                if not lrow.equals(srow):
+                    diff_df = pd.DataFrame()
+                    diff_df = diff_df.append(lrow, ignore_index=True)
+                    diff_df = diff_df.append(srow, ignore_index=True)
+                    diff_df.insert(0, 'Table', [name_table1, name_table2])
+                    print(f"Row {index} differs:")
+                    print(diff_df.values)
+                    break
+            else:
+                print(f"Row {index} unique to {name_table1}: {lrow}")
+                break
+    else:
+        print("Tables have different shapes")
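A brief sketch of how the new temp-file helper is meant to behave (the page name below is only an illustration): the returned path sits in a fresh temporary directory that now survives the `with` block, because `TemporaryDirectory.__exit__` no longer deletes it and cleanup is instead registered with `atexit`:

```python
from camelot.utils import build_file_path_in_temp_dir

# Build a path for a hypothetical single-page PDF render; only the path
# is produced here, not the file itself.
png_path = build_file_path_in_temp_dir("page-1", extension=".png")
print(png_path)  # e.g. /tmp/tmpab12cd34/page-1.png

# The directory persists until interpreter exit, so Table.get_pdf_image()
# and the Lattice parser can write to and re-read it; shutil.rmtree runs
# later via the atexit hook registered in TemporaryDirectory.__enter__.
```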
Six image files changed (binary): 8.2 KiB → 48 KiB, 6.7 KiB → 47 KiB, 14 KiB → 15 KiB, 9.7 KiB → 49 KiB, 8.9 KiB → 71 KiB, 19 KiB → 113 KiB.