[ENH] Add OCR and better joint detection

* Add iterations for dilation

* Add OCRLattice and OCRStream

* Add debug
pull/2/head
Vinayak Mehta 2017-04-18 18:25:47 +05:30 committed by GitHub
parent dd909e2b53
commit 4da754ddcb
8 changed files with 411 additions and 156 deletions

View File

@ -1,3 +1,6 @@
from itertools import groupby
from operator import itemgetter
import cv2 import cv2
import numpy as np import numpy as np
@ -44,7 +47,7 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
return img, threshold return img, threshold
def find_lines(threshold, direction='horizontal', scale=15): def find_lines(threshold, direction='horizontal', scale=15, iterations=2):
"""Finds horizontal and vertical lines by applying morphological """Finds horizontal and vertical lines by applying morphological
transformations on an image. transformations on an image.
@ -62,6 +65,10 @@ def find_lines(threshold, direction='horizontal', scale=15):
for morph transform. for morph transform.
(optional, default: 15) (optional, default: 15)
iterations : int
Number of iterations for dilation.
(optional, default: 2)
Returns Returns
------- -------
dmask : object dmask : object
@ -85,10 +92,9 @@ def find_lines(threshold, direction='horizontal', scale=15):
raise ValueError("Specify direction as either 'vertical' or" raise ValueError("Specify direction as either 'vertical' or"
" 'horizontal'") " 'horizontal'")
threshold = cv2.erode(threshold, el, (-1, -1)) threshold = cv2.erode(threshold, el)
threshold = cv2.dilate(threshold, el, (-1, -1)) threshold = cv2.dilate(threshold, el)
dmask = cv2.dilate(threshold, el, iterations=iterations)
dmask = threshold # findContours modifies source image
try: try:
_, contours, _ = cv2.findContours( _, contours, _ = cv2.findContours(
@ -191,3 +197,32 @@ def find_table_joints(contours, vertical, horizontal):
tables[(x, y + h, x + w, y)] = joint_coords tables[(x, y + h, x + w, y)] = joint_coords
return tables return tables
def find_cuts(threshold, line_threshold=100):
    """Finds cuts on the y-axis from the row-wise projection of an image.

    Sums the thresholded image along each row, collects the rows whose
    sum is below line_threshold, and reduces every run of consecutive
    such rows to its middle row index.

    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.

    line_threshold : int
        Maximum intensity of projections on y-axis.
        (optional, default: 100)

    Returns
    -------
    y_cuts : list
        List of cuts on y-axis, sorted in descending order.
    """
    y_proj = np.sum(threshold, axis=1)
    y_proj_less = np.where(y_proj < line_threshold)[0]

    y_cuts = []
    # Consecutive row indices share the same (position - value) offset,
    # so grouping on that offset splits the indices into contiguous runs.
    # The key takes a single pair argument because tuple-unpacking lambda
    # parameters are not valid on Python 3.
    for _, run in groupby(enumerate(y_proj_less),
                          lambda pair: pair[0] - pair[1]):
        run_rows = [row for _, row in run]
        # Floor division keeps the cut an integer row index on both
        # Python 2 and Python 3.
        y_cuts.append((run_rows[0] + run_rows[-1]) // 2)

    return sorted(y_cuts, reverse=True)

View File

@ -12,7 +12,7 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
from .table import Table from .table import Table
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox, from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
merge_close_values, get_table_index, get_score, count_empty, merge_close_values, get_table_index, get_score, count_empty,
encode_list, get_text_objects, get_page_layout) encode_list, get_text_objects, get_page_layout, remove_empty)
__all__ = ['Lattice'] __all__ = ['Lattice']
@ -131,20 +131,20 @@ class Lattice:
direction. direction.
(optional, default: None) (optional, default: None)
headers : list
List of strings where each string is a csv header for a table.
(optional, default: None)
mtol : list mtol : list
List of ints specifying m-tolerance parameters. List of ints specifying m-tolerance parameters.
(optional, default: [2]) (optional, default: [2])
blocksize: int jtol : list
List of ints specifying j-tolerance parameters.
(optional, default: [2])
blocksize : int
Size of a pixel neighborhood that is used to calculate a Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on. threshold value for the pixel: 3, 5, 7, and so on.
(optional, default: 15) (optional, default: 15)
threshold_constant: float threshold_constant : float
Constant subtracted from the mean or weighted mean Constant subtracted from the mean or weighted mean
(see the details below). Normally, it is positive but may be (see the details below). Normally, it is positive but may be
zero or negative as well. zero or negative as well.
@ -155,6 +155,10 @@ class Lattice:
element for image processing. element for image processing.
(optional, default: 15) (optional, default: 15)
iterations : int
Number of iterations for dilation.
(optional, default: 2)
invert : bool invert : bool
Whether or not to invert the image. Useful when pdfs have Whether or not to invert the image. Useful when pdfs have
tables with lines in background. tables with lines in background.
@ -187,19 +191,20 @@ class Lattice:
of detected contours, lines, joints and the table generated. of detected contours, lines, joints and the table generated.
(optional, default: None) (optional, default: None)
""" """
def __init__(self, table_area=None, fill=None, headers=None, mtol=[2], def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2],
blocksize=15, threshold_constant=-2, scale=15, invert=False, blocksize=15, threshold_constant=-2, scale=15, iterations=2,
margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True, invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
shift_text=['l', 't'], debug=None): flag_size=True, shift_text=['l', 't'], debug=None):
self.method = 'lattice' self.method = 'lattice'
self.table_area = table_area self.table_area = table_area
self.fill = fill self.fill = fill
self.headers = headers
self.mtol = mtol self.mtol = mtol
self.jtol = jtol
self.blocksize = blocksize self.blocksize = blocksize
self.threshold_constant = threshold_constant self.threshold_constant = threshold_constant
self.scale = scale self.scale = scale
self.iterations = iterations
self.invert = invert self.invert = invert
self.char_margin, self.line_margin, self.word_margin = margins self.char_margin, self.line_margin, self.word_margin = margins
self.split_text = split_text self.split_text = split_text
@ -257,17 +262,14 @@ class Lattice:
factors_pdf = (sc_x_pdf, sc_y_pdf, img_y) factors_pdf = (sc_x_pdf, sc_y_pdf, img_y)
vmask, v_segments = find_lines(threshold, direction='vertical', vmask, v_segments = find_lines(threshold, direction='vertical',
scale=self.scale) scale=self.scale, iterations=self.iterations)
hmask, h_segments = find_lines(threshold, direction='horizontal', hmask, h_segments = find_lines(threshold, direction='horizontal',
scale=self.scale) scale=self.scale, iterations=self.iterations)
if self.table_area is not None: if self.table_area is not None:
if self.fill is not None: if self.fill is not None:
if len(self.table_area) != len(self.fill): if len(self.table_area) != len(self.fill):
raise ValueError("Length of fill should be equal to table_area.") raise ValueError("Length of table area and fill should be equal.")
if self.headers is not None:
if len(self.table_area) != len(self.headers):
raise ValueError("Length of headers should be equal to table_area.")
areas = [] areas = []
for area in self.table_area: for area in self.table_area:
@ -288,6 +290,11 @@ class Lattice:
else: else:
mtolerance = copy.deepcopy(self.mtol) mtolerance = copy.deepcopy(self.mtol)
if len(self.jtol) == 1 and self.jtol[0] == 2:
jtolerance = copy.deepcopy(self.jtol) * len(table_bbox)
else:
jtolerance = copy.deepcopy(self.jtol)
if self.debug: if self.debug:
self.debug_images = (img, table_bbox) self.debug_images = (img, table_bbox)
@ -326,18 +333,9 @@ class Lattice:
rows = [(rows[i], rows[i + 1]) rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)] for i in range(0, len(rows) - 1)]
if self.headers is not None and self.headers[table_no] != [""]:
self.headers[table_no] = self.headers[table_no].split(',')
if len(self.headers[table_no]) != len(cols):
logger.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols)))
while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('')
table = Table(cols, rows) table = Table(cols, rows)
# set table edges to True using ver+hor lines # set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s) table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no])
nouse = table.nocont_ / (len(v_s) + len(h_s)) nouse = table.nocont_ / (len(v_s) + len(h_s))
table_data['line_p'] = 100 * (1 - nouse) table_data['line_p'] = 100 * (1 - nouse)
# set spanning cells to True # set spanning cells to True
@ -351,27 +349,27 @@ class Lattice:
assignment_errors = [] assignment_errors = []
table_data['split_text'] = [] table_data['split_text'] = []
table_data['superscript'] = [] table_data['superscript'] = []
for direction in t_bbox: for direction in ['vertical', 'horizontal']:
for t in t_bbox[direction]: for t in t_bbox[direction]:
indices, error = get_table_index( indices, error = get_table_index(
table, t, direction, split_text=self.split_text, table, t, direction, split_text=self.split_text,
flag_size=self.flag_size) flag_size=self.flag_size)
assignment_errors.append(error) if indices[:2] != (-1, -1):
indices = _reduce_index(table, indices, shift_text=self.shift_text,) assignment_errors.append(error)
if len(indices) > 1: indices = _reduce_index(table, indices, shift_text=self.shift_text)
table_data['split_text'].append(indices) if len(indices) > 1:
for r_idx, c_idx, text in indices: table_data['split_text'].append(indices)
if all(s in text for s in ['<s>', '</s>']): for r_idx, c_idx, text in indices:
table_data['superscript'].append((r_idx, c_idx, text)) if all(s in text for s in ['<s>', '</s>']):
table.cells[r_idx][c_idx].add_text(text) table_data['superscript'].append((r_idx, c_idx, text))
table.cells[r_idx][c_idx].add_text(text)
score = get_score([[100, assignment_errors]]) score = get_score([[100, assignment_errors]])
table_data['score'] = score table_data['score'] = score
if self.fill is not None: if self.fill is not None:
table = _fill_spanning(table, fill=self.fill[table_no]) table = _fill_spanning(table, fill=self.fill[table_no])
ar = table.get_list() ar = table.get_list()
if self.headers is not None and self.headers[table_no] != ['']: ar = remove_empty(ar)
ar.insert(0, self.headers[table_no])
ar = encode_list(ar) ar = encode_list(ar)
table_data['data'] = ar table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)

View File

@ -7,19 +7,18 @@ from PIL import Image
from .table import Table from .table import Table
from .imgproc import (adaptive_threshold, find_lines, find_table_contours, from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
find_table_joints) find_table_joints, find_cuts)
from .utils import merge_close_values, encode_list from .utils import merge_close_values, encode_list, remove_empty
class OCR: class OCRLattice:
"""Uses optical character recognition to get text out of image based pdfs. """Lattice, but for images.
Currently works only on pdfs with lines.
Parameters Parameters
---------- ----------
table_area : list table_area : list
List of strings of the form x1,y1,x2,y2 where List of strings of the form x1,y1,x2,y2 where
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
coordinate space, denoting table areas to analyze. coordinate space, denoting table areas to analyze.
(optional, default: None) (optional, default: None)
@ -27,12 +26,12 @@ class OCR:
List of ints specifying m-tolerance parameters. List of ints specifying m-tolerance parameters.
(optional, default: [2]) (optional, default: [2])
blocksize: int blocksize : int
Size of a pixel neighborhood that is used to calculate a Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on. threshold value for the pixel: 3, 5, 7, and so on.
(optional, default: 15) (optional, default: 15)
threshold_constant: float threshold_constant : float
Constant subtracted from the mean or weighted mean Constant subtracted from the mean or weighted mean
(see the details below). Normally, it is positive but may be (see the details below). Normally, it is positive but may be
zero or negative as well. zero or negative as well.
@ -51,6 +50,10 @@ class OCR:
element for image processing. element for image processing.
(optional, default: 15) (optional, default: 15)
iterations : int
Number of iterations for dilation.
(optional, default: 2)
debug : string debug : string
{'contour', 'line', 'joint', 'table'} {'contour', 'line', 'joint', 'table'}
Set to one of the above values to generate a matplotlib plot Set to one of the above values to generate a matplotlib plot
@ -58,9 +61,9 @@ class OCR:
(optional, default: None) (optional, default: None)
""" """
def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2, def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
dpi=300, lang="eng", scale=15, debug=None): dpi=300, lang="eng", scale=15, iterations=2, debug=None):
self.method = 'ocr' self.method = 'ocrl'
self.table_area = table_area self.table_area = table_area
self.mtol = mtol self.mtol = mtol
self.blocksize = blocksize self.blocksize = blocksize
@ -69,11 +72,13 @@ class OCR:
self.dpi = dpi self.dpi = dpi
self.lang = lang self.lang = lang
self.scale = scale self.scale = scale
self.iterations = iterations
self.debug = debug self.debug = debug
def get_tables(self, pdfname): def get_tables(self, pdfname):
if self.tool is None: if self.tool is None:
return None return None
bname, __ = os.path.splitext(pdfname) bname, __ = os.path.splitext(pdfname)
imagename = ''.join([bname, '.png']) imagename = ''.join([bname, '.png'])
@ -91,9 +96,9 @@ class OCR:
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize, img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
c=self.threshold_constant) c=self.threshold_constant)
vmask, v_segments = find_lines(threshold, direction='vertical', vmask, v_segments = find_lines(threshold, direction='vertical',
scale=self.scale) scale=self.scale, iterations=self.iterations)
hmask, h_segments = find_lines(threshold, direction='horizontal', hmask, h_segments = find_lines(threshold, direction='horizontal',
scale=self.scale) scale=self.scale, iterations=self.iterations)
if self.table_area is not None: if self.table_area is not None:
areas = [] areas = []
@ -154,6 +159,7 @@ class OCR:
ar = table.get_list() ar = table.get_list()
ar.reverse() ar.reverse()
ar = encode_list(ar) ar = encode_list(ar)
ar = remove_empty(ar)
table_data['data'] = ar table_data['data'] = ar
tables['table-{0}'.format(table_no + 1)] = table_data tables['table-{0}'.format(table_no + 1)] = table_data
table_no += 1 table_no += 1
@ -163,3 +169,141 @@ class OCR:
return None return None
return page return page
class OCRStream:
    """Stream, but for images.

    Parameters
    ----------
    table_area : list
        List of strings of the form x1,y1,x2,y2 where
        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
        coordinate space, denoting table areas to analyze.
        (optional, default: None)

    columns : list
        List of strings where each string is comma-separated values of
        x-coordinates in OpenCV's coordinate space.
        (optional, default: None)

    blocksize : int
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        (optional, default: 15)

    threshold_constant : float
        Constant subtracted from the mean or weighted mean
        (see the details below). Normally, it is positive but may be
        zero or negative as well.
        (optional, default: -2)

    line_threshold : int
        Maximum intensity of projections on y-axis.
        (optional, default: 100)

    dpi : int
        Dots per inch.
        (optional, default: 300)

    lang : string
        Language to be used for OCR.
        (optional, default: 'eng')

    debug : bool
        When set, get_tables only stores the page image in
        debug_images and returns None.
        (optional, default: False)
    """
    def __init__(self, table_area=None, columns=None, blocksize=15,
                 threshold_constant=-2, line_threshold=100, dpi=300, lang="eng",
                 debug=False):

        self.method = 'ocrs'
        self.table_area = table_area
        self.columns = columns
        self.blocksize = blocksize
        self.threshold_constant = threshold_constant
        self.line_threshold = line_threshold
        # get_available_tools() can come back empty when no OCR backend
        # is installed; fall back to None so that get_tables can bail out
        # through its `self.tool is None` guard instead of failing here
        # with an IndexError.
        available_tools = pyocr.get_available_tools()
        self.tool = available_tools[0] if available_tools else None
        self.dpi = dpi
        self.lang = lang
        self.debug = debug

    def get_tables(self, pdfname):
        """Renders a pdf page to an image and runs OCR on every cell.

        Parameters
        ----------
        pdfname : string
            Path to the single page pdf file.

        Returns
        -------
        page : dict
            Dict with the pdf's base name (without extension) as key
            and a dict of tables as value. None is returned when no
            OCR tool is available or when debug is set.

        Raises
        ------
        ValueError
            If table_area and columns are both given with different
            lengths.
        NotImplementedError
            If a table area is processed while columns is None.
        """
        if self.tool is None:
            return None

        bname, __ = os.path.splitext(pdfname)
        imagename = ''.join([bname, '.png'])

        gs_call = [
            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
            pdfname
        ]
        if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
            gs_call.insert(0, "gs")
        else:
            gs_call.insert(0, "gsc")
        # Close the devnull handle once ghostscript has finished instead
        # of leaking it.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(gs_call, stdout=devnull, stderr=subprocess.STDOUT)

        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
                                            c=self.threshold_constant)
        height, width = threshold.shape
        if self.debug:
            self.debug_images = img
            return None

        if self.table_area is not None:
            if self.columns is not None:
                if len(self.table_area) != len(self.columns):
                    raise ValueError("Length of table area and columns should be equal.")

            table_bbox = {}
            for area in self.table_area:
                x1, y1, x2, y2 = [int(v) for v in area.split(",")]
                table_bbox[(x1, y1, x2, y2)] = None
        else:
            # no area given, treat the whole page as one table
            table_bbox = {(0, 0, width, height): None}

        page = {}
        tables = {}
        table_no = 0
        # process table areas ordered by their y1 coordinate
        for k in sorted(table_bbox.keys(), key=lambda x: x[1]):
            if self.columns is None:
                # column detection is not available, columns are required
                raise NotImplementedError

            table_data = {}
            table_image = threshold[k[1]:k[3], k[0]:k[2]]

            # column separators, bounded by the area's left and right
            # edges and shifted so that they are relative to table_image
            cols = [float(c) for c in self.columns[table_no].split(',')]
            cols.insert(0, k[0])
            cols.append(k[2])
            cols = [(cols[i] - k[0], cols[i + 1] - k[0])
                    for i in range(0, len(cols) - 1)]

            y_cuts = find_cuts(table_image, line_threshold=self.line_threshold)
            rows = [(y_cuts[i], y_cuts[i + 1])
                    for i in range(0, len(y_cuts) - 1)]

            table = Table(cols, rows)
            for cell_row in table.cells:
                for cell in cell_row:
                    x1 = int(cell.x1)
                    y1 = int(cell.y1)
                    x2 = int(cell.x2)
                    y2 = int(cell.y2)
                    cell.image = table_image[y1:y2, x1:x2]
                    cell_image = Image.fromarray(cell.image)
                    text = self.tool.image_to_string(
                        cell_image,
                        lang=self.lang,
                        builder=pyocr.builders.TextBuilder()
                    )
                    cell.add_text(text)

            ar = table.get_list()
            ar.reverse()
            ar = encode_list(ar)
            ar = remove_empty(ar)
            table_data['data'] = ar
            tables['table-{0}'.format(table_no + 1)] = table_data
            table_no += 1
        page[os.path.basename(bname)] = tables

        return page

View File

@ -141,11 +141,14 @@ class Pdf:
if self.extractor.method == 'stream': if self.extractor.method == 'stream':
self.debug = self.extractor.debug self.debug = self.extractor.debug
self.debug_text = [] self.debug_text = []
elif self.extractor.method in ['lattice', 'ocr']: elif self.extractor.method in ['lattice', 'ocrl']:
self.debug = self.extractor.debug self.debug = self.extractor.debug
self.debug_images = [] self.debug_images = []
self.debug_segments = [] self.debug_segments = []
self.debug_tables = [] self.debug_tables = []
elif self.extractor.method == 'ocrs':
self.debug = self.extractor.debug
self.debug_images = []
for p in pages: for p in pages:
table = self.extractor.get_tables(p) table = self.extractor.get_tables(p)
if table is not None: if table is not None:
@ -157,6 +160,8 @@ class Pdf:
self.debug_images.append(self.extractor.debug_images) self.debug_images.append(self.extractor.debug_images)
self.debug_segments.append(self.extractor.debug_segments) self.debug_segments.append(self.extractor.debug_segments)
self.debug_tables.append(self.extractor.debug_tables) self.debug_tables.append(self.extractor.debug_tables)
elif self.extractor.method == 'ocrs':
self.debug_images.append(self.extractor.debug_images)
if self.clean: if self.clean:
self.remove_tempdir() self.remove_tempdir()
return tables return tables
@ -175,7 +180,7 @@ class Pdf:
import matplotlib.patches as patches import matplotlib.patches as patches
if self.debug is True: if self.debug is True:
try: if hasattr(self, 'debug_text'):
for text in self.debug_text: for text in self.debug_text:
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal') ax = fig.add_subplot(111, aspect='equal')
@ -193,8 +198,10 @@ class Pdf:
ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10)
plt.show() plt.show()
except AttributeError: elif hasattr(self, 'debug_images'):
raise ValueError("This option only be used with Stream.") for img in self.debug_images:
plt.imshow(img)
plt.show()
elif self.debug == 'contour': elif self.debug == 'contour':
try: try:
for img, table_bbox in self.debug_images: for img, table_bbox in self.debug_images:

View File

@ -236,10 +236,6 @@ class Stream:
x-coordinates in PDFMiner's coordinate space. x-coordinates in PDFMiner's coordinate space.
(optional, default: None) (optional, default: None)
headers : list
List of strings where each string is a csv header for a table.
(optional, default: None)
ytol : list ytol : list
List of ints specifying the y-tolerance parameters. List of ints specifying the y-tolerance parameters.
(optional, default: [2]) (optional, default: [2])
@ -268,14 +264,13 @@ class Stream:
LTTextLineHorizontals in order to select table_area, columns. LTTextLineHorizontals in order to select table_area, columns.
(optional, default: False) (optional, default: False)
""" """
def __init__(self, table_area=None, columns=None, headers=None, def __init__(self, table_area=None, columns=None, ytol=[2], mtol=[0],
ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1), margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
split_text=False, flag_size=True, debug=False): debug=False):
self.method = 'stream' self.method = 'stream'
self.table_area = table_area self.table_area = table_area
self.columns = columns self.columns = columns
self.headers = headers
self.ytol = ytol self.ytol = ytol
self.mtol = mtol self.mtol = mtol
self.char_margin, self.line_margin, self.word_margin = margins self.char_margin, self.line_margin, self.word_margin = margins
@ -312,14 +307,12 @@ class Stream:
self.debug_text = [] self.debug_text = []
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh]) self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv]) self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
return None
if self.table_area is not None: if self.table_area is not None:
if self.columns is not None: if self.columns is not None:
if len(self.table_area) != len(self.columns): if len(self.table_area) != len(self.columns):
raise ValueError("Length of columns should be equal to table_area.") raise ValueError("Length of table area and columns should be equal.")
if self.headers is not None:
if len(self.table_area) != len(self.headers):
raise ValueError("Length of headers should be equal to table_area.")
table_bbox = {} table_bbox = {}
for area in self.table_area: for area in self.table_area:
@ -336,6 +329,7 @@ class Stream:
ytolerance = copy.deepcopy(self.ytol) * len(table_bbox) ytolerance = copy.deepcopy(self.ytol) * len(table_bbox)
else: else:
ytolerance = copy.deepcopy(self.ytol) ytolerance = copy.deepcopy(self.ytol)
if len(self.mtol) == 1 and self.mtol[0] == 0: if len(self.mtol) == 1 and self.mtol[0] == 0:
mtolerance = copy.deepcopy(self.mtol) * len(table_bbox) mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
else: else:
@ -374,7 +368,7 @@ class Stream:
guess = True guess = True
ncols = max(set(elements), key=elements.count) ncols = max(set(elements), key=elements.count)
len_non_mode = len(filter(lambda x: x != ncols, elements)) len_non_mode = len(filter(lambda x: x != ncols, elements))
if ncols == 1 and not self.debug: if ncols == 1:
# no tables detected # no tables detected
logger.warning("{}: Only one column was detected, the pdf" logger.warning("{}: Only one column was detected, the pdf"
" may have no tables.".format( " may have no tables.".format(
@ -396,15 +390,6 @@ class Stream:
cols = _add_columns(cols, inner_text, ytolerance[table_no]) cols = _add_columns(cols, inner_text, ytolerance[table_no])
cols = _join_columns(cols, text_x_min, text_x_max) cols = _join_columns(cols, text_x_min, text_x_max)
if self.headers is not None and self.headers[table_no] != [""]:
self.headers[table_no] = self.headers[table_no].split(',')
if len(self.headers[table_no]) != len(cols):
logger.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols)))
while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('')
table = Table(cols, rows) table = Table(cols, rows)
table = table.set_all_edges() table = table.set_all_edges()
assignment_errors = [] assignment_errors = []
@ -429,8 +414,6 @@ class Stream:
table_data['score'] = score table_data['score'] = score
ar = table.get_list() ar = table.get_list()
if self.headers is not None and self.headers[table_no] != ['']:
ar.insert(0, self.headers[table_no])
ar = encode_list(ar) ar = encode_list(ar)
table_data['data'] = ar table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)

View File

@ -188,38 +188,32 @@ class Table:
bound = self.cells[r][c].get_bounded_edges() bound = self.cells[r][c].get_bounded_edges()
if bound == 4: if bound == 4:
continue continue
elif bound == 3: elif bound == 3:
if not self.cells[r][c].left: if not self.cells[r][c].left:
if (self.cells[r][c].right and if (self.cells[r][c].right and
self.cells[r][c].top and self.cells[r][c].top and
self.cells[r][c].bottom): self.cells[r][c].bottom):
self.cells[r][c].spanning_h = True self.cells[r][c].spanning_h = True
elif not self.cells[r][c].right: elif not self.cells[r][c].right:
if (self.cells[r][c].left and if (self.cells[r][c].left and
self.cells[r][c].top and self.cells[r][c].top and
self.cells[r][c].bottom): self.cells[r][c].bottom):
self.cells[r][c].spanning_h = True self.cells[r][c].spanning_h = True
elif not self.cells[r][c].top: elif not self.cells[r][c].top:
if (self.cells[r][c].left and if (self.cells[r][c].left and
self.cells[r][c].right and self.cells[r][c].right and
self.cells[r][c].bottom): self.cells[r][c].bottom):
self.cells[r][c].spanning_v = True self.cells[r][c].spanning_v = True
elif not self.cells[r][c].bottom: elif not self.cells[r][c].bottom:
if (self.cells[r][c].left and if (self.cells[r][c].left and
self.cells[r][c].right and self.cells[r][c].right and
self.cells[r][c].top): self.cells[r][c].top):
self.cells[r][c].spanning_v = True self.cells[r][c].spanning_v = True
elif bound == 2: elif bound == 2:
if self.cells[r][c].left and self.cells[r][c].right: if self.cells[r][c].left and self.cells[r][c].right:
if (not self.cells[r][c].top and if (not self.cells[r][c].top and
not self.cells[r][c].bottom): not self.cells[r][c].bottom):
self.cells[r][c].spanning_v = True self.cells[r][c].spanning_v = True
elif self.cells[r][c].top and self.cells[r][c].bottom: elif self.cells[r][c].top and self.cells[r][c].bottom:
if (not self.cells[r][c].left and if (not self.cells[r][c].left and
not self.cells[r][c].right): not self.cells[r][c].right):

View File

@ -426,40 +426,43 @@ def split_textline(table, textline, direction, flag_size=True):
idx = 0 idx = 0
cut_text = [] cut_text = []
bbox = textline.bbox bbox = textline.bbox
if direction == 'horizontal' and not textline.is_empty(): try:
x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]] if direction == 'horizontal' and not textline.is_empty():
r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]] x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
r = r_idx[0] r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right] r = r_idx[0]
if not x_cuts: x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)] if not x_cuts:
for obj in textline._objs: x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
row = table.rows[r] for obj in textline._objs:
for cut in x_cuts: row = table.rows[r]
if isinstance(obj, LTChar): for cut in x_cuts:
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and if isinstance(obj, LTChar):
(obj.x0 + obj.x1) / 2 <= cut[1]): if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
(obj.x0 + obj.x1) / 2 <= cut[1]):
cut_text.append((r, cut[0], obj))
break
elif isinstance(obj, LTAnno):
cut_text.append((r, cut[0], obj)) cut_text.append((r, cut[0], obj))
break elif direction == 'vertical' and not textline.is_empty():
elif isinstance(obj, LTAnno): y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
cut_text.append((r, cut[0], obj)) c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
elif direction == 'vertical' and not textline.is_empty(): c = c_idx[0]
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]] y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]] if not y_cuts:
c = c_idx[0] y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom] for obj in textline._objs:
if not y_cuts: col = table.cols[c]
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)] for cut in y_cuts:
for obj in textline._objs: if isinstance(obj, LTChar):
col = table.cols[c] if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
for cut in y_cuts: (obj.y0 + obj.y1) / 2 >= cut[1]):
if isinstance(obj, LTChar): cut_text.append((cut[0], c, obj))
if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and break
(obj.y0 + obj.y1) / 2 >= cut[1]): elif isinstance(obj, LTAnno):
cut_text.append((cut[0], c, obj)) cut_text.append((cut[0], c, obj))
break except IndexError:
elif isinstance(obj, LTAnno): return [(-1, -1, textline.get_text())]
cut_text.append((cut[0], c, obj))
grouped_chars = [] grouped_chars = []
for key, chars in groupby(cut_text, itemgetter(0, 1)): for key, chars in groupby(cut_text, itemgetter(0, 1)):
if flag_size: if flag_size:

View File

@ -18,7 +18,7 @@ from PyPDF2 import PdfFileReader
from camelot.pdf import Pdf from camelot.pdf import Pdf
from camelot.lattice import Lattice from camelot.lattice import Lattice
from camelot.stream import Stream from camelot.stream import Stream
from camelot.ocr import OCR from camelot.ocr import OCRLattice, OCRStream
from camelot import utils from camelot import utils
@ -54,7 +54,8 @@ options:
camelot methods: camelot methods:
lattice Looks for lines between data. lattice Looks for lines between data.
stream Looks for spaces between data. stream Looks for spaces between data.
ocr Looks for lines in image based pdfs. ocrl Lattice, but for images.
ocrs Stream, but for images.
See 'camelot <method> -h' for more information on a specific method. See 'camelot <method> -h' for more information on a specific method.
""" """
@ -63,20 +64,22 @@ lattice_doc = """
Lattice method looks for lines between text to form a table. Lattice method looks for lines between text to form a table.
usage: usage:
camelot lattice [-t <tarea>...] [-F <fill>...] [-H <header>...] camelot lattice [-t <tarea>...] [-F <fill>...] [-m <mtol>...]
[-m <mtol>...] [options] [--] <file> [-j <jtol>...] [options] [--] <file>
options: options:
-t, --tarea <tarea> Specific table areas to analyze. -t, --tarea <tarea> Specific table areas to analyze.
-F, --fill <fill> Fill data in horizontal and/or vertical spanning -F, --fill <fill> Fill data in horizontal and/or vertical spanning
cells. Example: -F h, -F v, -F hv cells. Example: -F h, -F v, -F hv
-H, --header <header> Specify header for each table.
-m, --mtol <mtol> Tolerance to account for when merging lines -m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2] which are very close. [default: 2]
-j, --jtol <jtol> Tolerance to account for when matching line endings
with intersections. [default: 2]
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15] -b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
-c, --constant <constant> See adaptive threshold doc. [default: -2] -C, --constant <constant> See adaptive threshold doc. [default: -2]
-s, --scale <scale> Scaling factor. Large scaling factor leads to -s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15] smaller lines being detected. [default: 15]
-I, --iterations <iterations> Number of iterations for dilation. [default: 2]
-i, --invert Invert pdf image to make sure that lines are -i, --invert Invert pdf image to make sure that lines are
in foreground. in foreground.
-T, --shift_text <shift_text> Specify where the text in a spanning cell -T, --shift_text <shift_text> Specify where the text in a spanning cell
@ -89,41 +92,61 @@ stream_doc = """
Stream method looks for whitespaces between text to form a table. Stream method looks for whitespaces between text to form a table.
usage: usage:
camelot stream [-t <tarea>...] [-c <columns>...] [-H <header>...] camelot stream [-t <tarea>...] [-c <columns>...] [-m <mtol>...]
[-y <ytol>...] [-m <mtol>...] [options] [--] <file> [-y <ytol>...] [options] [--] <file>
options: options:
-t, --tarea <tarea> Specific table areas to analyze. -t, --tarea <tarea> Specific table areas to analyze.
-c, --columns <columns> Comma-separated list of column x-coordinates. -c, --columns <columns> Comma-separated list of column x-coordinates.
Example: -c 10.1,20.2,30.3 Example: -c 10.1,20.2,30.3
-H, --header <header> Specify header for each table.
-y, --ytol <ytol> Tolerance to account for when grouping rows
together. [default: 2]
-m, --mtol <mtol> Tolerance to account for when merging columns -m, --mtol <mtol> Tolerance to account for when merging columns
together. [default: 0] together. [default: 0]
-y, --ytol <ytol> Tolerance to account for when grouping rows
together. [default: 2]
-d, --debug Debug by visualizing textboxes. -d, --debug Debug by visualizing textboxes.
""" """
ocr_doc = """ ocrl_doc = """
OCR method looks for lines in image based pdfs. Lattice, but for images.
usage: usage:
camelot ocr [-t <tarea>] [-m <mtol>] [options] [--] <file> camelot ocrl [-t <tarea>...] [-m <mtol>...] [options] [--] <file>
options: options:
-t, --tarea <tarea> Specific table areas to analyze. -t, --tarea <tarea> Specific table areas to analyze.
-m, --mtol <mtol> Tolerance to account for when merging lines -m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2] which are very close. [default: 2]
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15] -b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
-c, --constant <constant> See adaptive threshold doc. [default: -2] -C, --constant <constant> See adaptive threshold doc. [default: -2]
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR. -D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
[default: 300] [default: 300]
-l, --lang <lang> Specify language to be used for OCR. [default: eng] -l, --lang <lang> Specify language to be used for OCR. [default: eng]
-s, --scale <scale> Scaling factor. Large scaling factor leads to -s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15] smaller lines being detected. [default: 15]
-d, --debug <debug> Debug by visualizing pdf geometry. -I, --iterations <iterations> Number of iterations for dilation. [default: 2]
(contour,line,joint,table) Example: -d table -d, --debug <debug> Debug by visualizing pdf geometry.
(contour,line,joint,table) Example: -d table
"""
ocrs_doc = """
Stream, but for images.
usage:
camelot ocrs [-t <tarea>...] [-c <columns>...] [options] [--] <file>
options:
-t, --tarea <tarea> Specific table areas to analyze.
-c, --columns <columns> Comma-separated list of column x-coordinates.
Example: -c 10.1,20.2,30.3
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
-C, --constant <constant> See adaptive threshold doc. [default: -2]
-N, --line-threshold <line_threshold> Maximum intensity of projections on y-axis.
[default: 100]
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
[default: 300]
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
-d, --debug Debug by visualizing image.
""" """
@ -351,8 +374,10 @@ if __name__ == '__main__':
args.update(docopt(lattice_doc, argv=argv)) args.update(docopt(lattice_doc, argv=argv))
elif args['<method>'] == 'stream': elif args['<method>'] == 'stream':
args.update(docopt(stream_doc, argv=argv)) args.update(docopt(stream_doc, argv=argv))
elif args['<method>'] == 'ocr': elif args['<method>'] == 'ocrl':
args.update(docopt(ocr_doc, argv=argv)) args.update(docopt(ocrl_doc, argv=argv))
elif args['<method>'] == 'ocrs':
args.update(docopt(ocrs_doc, argv=argv))
filename = args['<file>'] filename = args['<file>']
filedir = os.path.dirname(args['<file>']) filedir = os.path.dirname(args['<file>'])
@ -392,11 +417,12 @@ if __name__ == '__main__':
kwargs = { kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None, 'table_area': args['--tarea'] if args['--tarea'] else None,
'fill': args['--fill'] if args['--fill'] else None, 'fill': args['--fill'] if args['--fill'] else None,
'headers': args['--header'] if args['--header'] else None,
'mtol': [int(m) for m in args['--mtol']], 'mtol': [int(m) for m in args['--mtol']],
'jtol': [int(j) for j in args['--jtol']],
'blocksize': int(args['--blocksize']), 'blocksize': int(args['--blocksize']),
'threshold_constant': float(args['--constant']), 'threshold_constant': float(args['--constant']),
'scale': int(args['--scale']), 'scale': int(args['--scale']),
'iterations': int(args['--iterations']),
'invert': args['--invert'], 'invert': args['--invert'],
'margins': margins, 'margins': margins,
'split_text': args['--split_text'], 'split_text': args['--split_text'],
@ -462,7 +488,6 @@ if __name__ == '__main__':
kwargs = { kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None, 'table_area': args['--tarea'] if args['--tarea'] else None,
'columns': args['--columns'] if args['--columns'] else None, 'columns': args['--columns'] if args['--columns'] else None,
'headers': args['--header'] if args['--header'] else None,
'ytol': [int(y) for y in args['--ytol']], 'ytol': [int(y) for y in args['--ytol']],
'mtol': [int(m) for m in args['--mtol']], 'mtol': [int(m) for m in args['--mtol']],
'margins': margins, 'margins': margins,
@ -522,7 +547,7 @@ if __name__ == '__main__':
except Exception as e: except Exception as e:
logger.exception(e.message, exc_info=True) logger.exception(e.message, exc_info=True)
sys.exit() sys.exit()
elif args['<method>'] == 'ocr': elif args['<method>'] == 'ocrl':
try: try:
kwargs = { kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None, 'table_area': args['--tarea'] if args['--tarea'] else None,
@ -532,9 +557,75 @@ if __name__ == '__main__':
'dpi': int(args['--dpi']), 'dpi': int(args['--dpi']),
'lang': args['--lang'], 'lang': args['--lang'],
'scale': int(args['--scale']), 'scale': int(args['--scale']),
'iterations': int(args['--iterations']),
'debug': args['--debug'] 'debug': args['--debug']
} }
manager = Pdf(OCR(**kwargs), filename, pagenos=p, clean=True, manager = Pdf(OCRLattice(**kwargs), filename, pagenos=p, clean=True,
parallel=args['--parallel'])
data = manager.extract()
processing_time = time.time() - start_time
logger.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']:
if args['--output']:
pngname = os.path.join(args['--output'], os.path.basename(pngname))
plot_type = args['--plot'].split(',')
if 'page' in plot_type:
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
plot_table_barchart(table['r_nempty_cells'],
table['c_nempty_cells'],
table['empty_p'],
page_number,
table_number)
if 'all' in plot_type:
plot_all_barchart(data, pngname)
if 'rc' in plot_type:
plot_rc_piechart(data, pngname)
if args['--print-stats']:
print_stats(data, processing_time)
if args['--save-stats']:
if args['--output']:
scorename = os.path.join(args['--output'], os.path.basename(scorename))
with open(scorename, 'w') as score_file:
score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
''.join([page_number, '_', table_number]),
table['nrows'],
table['ncols'],
table['empty_p'],
table['line_p'],
table['text_p'],
table['score']))
if args['--debug']:
manager.debug_plot()
except Exception as e:
logger.exception(e.message, exc_info=True)
sys.exit()
elif args['<method>'] == 'ocrs':
try:
kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None,
'columns': args['--columns'] if args['--columns'] else None,
'blocksize': int(args['--blocksize']),
'threshold_constant': float(args['--constant']),
'line_threshold': int(args['--line-threshold']),
'dpi': int(args['--dpi']),
'lang': args['--lang'],
'debug': args['--debug']
}
manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,
parallel=args['--parallel']) parallel=args['--parallel'])
data = manager.extract() data = manager.extract()
@ -588,7 +679,7 @@ if __name__ == '__main__':
logger.exception(e.message, exc_info=True) logger.exception(e.message, exc_info=True)
sys.exit() sys.exit()
if args['--debug']: if args.get('--debug') is not None and args['--debug']:
print("See 'camelot <method> -h' for various parameters you can tweak.") print("See 'camelot <method> -h' for various parameters you can tweak.")
else: else:
output = filedir if args['--output'] is None else args['--output'] output = filedir if args['--output'] is None else args['--output']