diff --git a/camelot/imgproc.py b/camelot/imgproc.py
index eba296b..24abac3 100644
--- a/camelot/imgproc.py
+++ b/camelot/imgproc.py
@@ -1,3 +1,6 @@
+from itertools import groupby
+from operator import itemgetter
+
import cv2
import numpy as np
@@ -44,7 +47,7 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
return img, threshold
-def find_lines(threshold, direction='horizontal', scale=15):
+def find_lines(threshold, direction='horizontal', scale=15, iterations=2):
"""Finds horizontal and vertical lines by applying morphological
transformations on an image.
@@ -62,6 +65,10 @@ def find_lines(threshold, direction='horizontal', scale=15):
for morph transform.
(optional, default: 15)
+ iterations : int
+ Number of iterations for dilation.
+ (optional, default: 2)
+
Returns
-------
dmask : object
@@ -85,10 +92,9 @@ def find_lines(threshold, direction='horizontal', scale=15):
raise ValueError("Specify direction as either 'vertical' or"
" 'horizontal'")
- threshold = cv2.erode(threshold, el, (-1, -1))
- threshold = cv2.dilate(threshold, el, (-1, -1))
-
- dmask = threshold # findContours modifies source image
+ threshold = cv2.erode(threshold, el)
+ threshold = cv2.dilate(threshold, el)
+ dmask = cv2.dilate(threshold, el, iterations=iterations)
try:
_, contours, _ = cv2.findContours(
@@ -190,4 +196,33 @@ def find_table_joints(contours, vertical, horizontal):
joint_coords.append((c1, c2))
tables[(x, y + h, x + w, y)] = joint_coords
- return tables
\ No newline at end of file
+ return tables
+
+
+def find_cuts(threshold, line_threshold=100):
+    """Finds cut positions along the y-axis from runs of low-intensity rows.
+
+    Every image row is summed into a projection on the y-axis. Rows whose
+    projection is below `line_threshold` are collected, grouped into runs of
+    consecutive row numbers, and each run contributes one cut at its
+    midpoint.
+
+    Parameters
+    ----------
+    threshold : object
+        numpy.ndarray representing the thresholded image.
+
+    line_threshold : int
+        Maximum intensity of projections on y-axis.
+        (optional, default: 100)
+
+    Returns
+    -------
+    y_cuts : list
+        List of cuts on y-axis, sorted in descending order.
+    """
+    y_proj = np.sum(threshold, axis=1)
+    y_proj_less = np.where(y_proj < line_threshold)[0]
+    y_cuts = []
+    # (position - row number) is constant inside a run of consecutive row
+    # numbers, so grouping on it splits the rows into contiguous runs.
+    runs = groupby(enumerate(y_proj_less), lambda pair: pair[0] - pair[1])
+    for _, run in runs:
+        # materialise the run: map() is lazy on Python 3 and cannot be indexed
+        rows = list(map(itemgetter(1), run))
+        # floor division keeps the midpoint an integer row index on both
+        # Python 2 and Python 3
+        y_cuts.append((rows[0] + rows[-1]) // 2)
+    return sorted(y_cuts, reverse=True)
\ No newline at end of file
diff --git a/camelot/lattice.py b/camelot/lattice.py
index 2e82222..3a60a73 100644
--- a/camelot/lattice.py
+++ b/camelot/lattice.py
@@ -12,7 +12,7 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
from .table import Table
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
merge_close_values, get_table_index, get_score, count_empty,
- encode_list, get_text_objects, get_page_layout)
+ encode_list, get_text_objects, get_page_layout, remove_empty)
__all__ = ['Lattice']
@@ -131,20 +131,20 @@ class Lattice:
direction.
(optional, default: None)
- headers : list
- List of strings where each string is a csv header for a table.
- (optional, default: None)
-
mtol : list
List of ints specifying m-tolerance parameters.
(optional, default: [2])
- blocksize: int
+ jtol : list
+ List of ints specifying j-tolerance parameters.
+ (optional, default: [2])
+
+ blocksize : int
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
(optional, default: 15)
- threshold_constant: float
+ threshold_constant : float
Constant subtracted from the mean or weighted mean
(see the details below). Normally, it is positive but may be
zero or negative as well.
@@ -155,6 +155,10 @@ class Lattice:
element for image processing.
(optional, default: 15)
+ iterations : int
+ Number of iterations for dilation.
+ (optional, default: 2)
+
invert : bool
Whether or not to invert the image. Useful when pdfs have
tables with lines in background.
@@ -187,19 +191,20 @@ class Lattice:
of detected contours, lines, joints and the table generated.
(optional, default: None)
"""
- def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
- blocksize=15, threshold_constant=-2, scale=15, invert=False,
- margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
- shift_text=['l', 't'], debug=None):
+ def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2],
+ blocksize=15, threshold_constant=-2, scale=15, iterations=2,
+ invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
+ flag_size=True, shift_text=['l', 't'], debug=None):
self.method = 'lattice'
self.table_area = table_area
self.fill = fill
- self.headers = headers
self.mtol = mtol
+ self.jtol = jtol
self.blocksize = blocksize
self.threshold_constant = threshold_constant
self.scale = scale
+ self.iterations = iterations
self.invert = invert
self.char_margin, self.line_margin, self.word_margin = margins
self.split_text = split_text
@@ -257,17 +262,14 @@ class Lattice:
factors_pdf = (sc_x_pdf, sc_y_pdf, img_y)
vmask, v_segments = find_lines(threshold, direction='vertical',
- scale=self.scale)
+ scale=self.scale, iterations=self.iterations)
hmask, h_segments = find_lines(threshold, direction='horizontal',
- scale=self.scale)
+ scale=self.scale, iterations=self.iterations)
if self.table_area is not None:
if self.fill is not None:
if len(self.table_area) != len(self.fill):
- raise ValueError("Length of fill should be equal to table_area.")
- if self.headers is not None:
- if len(self.table_area) != len(self.headers):
- raise ValueError("Length of headers should be equal to table_area.")
+ raise ValueError("Length of table area and fill should be equal.")
areas = []
for area in self.table_area:
@@ -288,6 +290,11 @@ class Lattice:
else:
mtolerance = copy.deepcopy(self.mtol)
+ if len(self.jtol) == 1 and self.jtol[0] == 2:
+ jtolerance = copy.deepcopy(self.jtol) * len(table_bbox)
+ else:
+ jtolerance = copy.deepcopy(self.jtol)
+
if self.debug:
self.debug_images = (img, table_bbox)
@@ -326,18 +333,9 @@ class Lattice:
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
- if self.headers is not None and self.headers[table_no] != [""]:
- self.headers[table_no] = self.headers[table_no].split(',')
- if len(self.headers[table_no]) != len(cols):
- logger.warning("Length of header ({0}) specified for table is not"
- " equal to the number of columns ({1}) detected.".format(
- len(self.headers[table_no]), len(cols)))
- while len(self.headers[table_no]) != len(cols):
- self.headers[table_no].append('')
-
table = Table(cols, rows)
# set table edges to True using ver+hor lines
- table = table.set_edges(v_s, h_s)
+ table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no])
nouse = table.nocont_ / (len(v_s) + len(h_s))
table_data['line_p'] = 100 * (1 - nouse)
# set spanning cells to True
@@ -351,27 +349,27 @@ class Lattice:
assignment_errors = []
table_data['split_text'] = []
table_data['superscript'] = []
- for direction in t_bbox:
+ for direction in ['vertical', 'horizontal']:
for t in t_bbox[direction]:
indices, error = get_table_index(
table, t, direction, split_text=self.split_text,
flag_size=self.flag_size)
- assignment_errors.append(error)
- indices = _reduce_index(table, indices, shift_text=self.shift_text,)
- if len(indices) > 1:
- table_data['split_text'].append(indices)
- for r_idx, c_idx, text in indices:
- if all(s in text for s in ['', '']):
- table_data['superscript'].append((r_idx, c_idx, text))
- table.cells[r_idx][c_idx].add_text(text)
+ if indices[:2] != (-1, -1):
+ assignment_errors.append(error)
+ indices = _reduce_index(table, indices, shift_text=self.shift_text)
+ if len(indices) > 1:
+ table_data['split_text'].append(indices)
+ for r_idx, c_idx, text in indices:
+ if all(s in text for s in ['', '']):
+ table_data['superscript'].append((r_idx, c_idx, text))
+ table.cells[r_idx][c_idx].add_text(text)
score = get_score([[100, assignment_errors]])
table_data['score'] = score
if self.fill is not None:
table = _fill_spanning(table, fill=self.fill[table_no])
ar = table.get_list()
- if self.headers is not None and self.headers[table_no] != ['']:
- ar.insert(0, self.headers[table_no])
+ ar = remove_empty(ar)
ar = encode_list(ar)
table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
diff --git a/camelot/ocr.py b/camelot/ocr.py
index 6d56cdf..4ce2791 100644
--- a/camelot/ocr.py
+++ b/camelot/ocr.py
@@ -7,19 +7,18 @@ from PIL import Image
from .table import Table
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
- find_table_joints)
-from .utils import merge_close_values, encode_list
+ find_table_joints, find_cuts)
+from .utils import merge_close_values, encode_list, remove_empty
-class OCR:
- """Uses optical character recognition to get text out of image based pdfs.
- Currently works only on pdfs with lines.
+class OCRLattice:
+ """Lattice, but for images.
Parameters
----------
table_area : list
List of strings of the form x1,y1,x2,y2 where
- (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
+ (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
coordinate space, denoting table areas to analyze.
(optional, default: None)
@@ -27,12 +26,12 @@ class OCR:
List of ints specifying m-tolerance parameters.
(optional, default: [2])
- blocksize: int
+ blocksize : int
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
(optional, default: 15)
- threshold_constant: float
+ threshold_constant : float
Constant subtracted from the mean or weighted mean
(see the details below). Normally, it is positive but may be
zero or negative as well.
@@ -51,6 +50,10 @@ class OCR:
element for image processing.
(optional, default: 15)
+ iterations : int
+ Number of iterations for dilation.
+ (optional, default: 2)
+
debug : string
{'contour', 'line', 'joint', 'table'}
Set to one of the above values to generate a matplotlib plot
@@ -58,9 +61,9 @@ class OCR:
(optional, default: None)
"""
def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
- dpi=300, lang="eng", scale=15, debug=None):
+ dpi=300, lang="eng", scale=15, iterations=2, debug=None):
- self.method = 'ocr'
+ self.method = 'ocrl'
self.table_area = table_area
self.mtol = mtol
self.blocksize = blocksize
@@ -69,11 +72,13 @@ class OCR:
self.dpi = dpi
self.lang = lang
self.scale = scale
+ self.iterations = iterations
self.debug = debug
def get_tables(self, pdfname):
if self.tool is None:
return None
+
bname, __ = os.path.splitext(pdfname)
imagename = ''.join([bname, '.png'])
@@ -91,9 +96,9 @@ class OCR:
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
c=self.threshold_constant)
vmask, v_segments = find_lines(threshold, direction='vertical',
- scale=self.scale)
+ scale=self.scale, iterations=self.iterations)
hmask, h_segments = find_lines(threshold, direction='horizontal',
- scale=self.scale)
+ scale=self.scale, iterations=self.iterations)
if self.table_area is not None:
areas = []
@@ -154,6 +159,7 @@ class OCR:
ar = table.get_list()
ar.reverse()
ar = encode_list(ar)
+ ar = remove_empty(ar)
table_data['data'] = ar
tables['table-{0}'.format(table_no + 1)] = table_data
table_no += 1
@@ -162,4 +168,142 @@ class OCR:
if self.debug:
return None
+ return page
+
+
+class OCRStream:
+    """Stream, but for images.
+
+    Parameters
+    ----------
+    table_area : list
+        List of strings of the form x1,y1,x2,y2 where
+        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
+        coordinate space, denoting table areas to analyze.
+        (optional, default: None)
+
+    columns : list
+        List of strings where each string is comma-separated values of
+        x-coordinates in OpenCV's coordinate space.
+        (optional, default: None)
+
+    blocksize : int
+        Size of a pixel neighborhood that is used to calculate a
+        threshold value for the pixel: 3, 5, 7, and so on.
+        (optional, default: 15)
+
+    threshold_constant : float
+        Constant subtracted from the mean or weighted mean
+        (see the details below). Normally, it is positive but may be
+        zero or negative as well.
+        (optional, default: -2)
+
+    line_threshold : int
+        Maximum intensity of projections on y-axis.
+        (optional, default: 100)
+
+    dpi : int
+        Dots per inch.
+        (optional, default: 300)
+
+    lang : string
+        Language to be used for OCR.
+        (optional, default: 'eng')
+
+    debug : bool
+        If set, get_tables only stores the image returned by
+        adaptive_threshold in `debug_images` and returns None.
+        (optional, default: False)
+    """
+    def __init__(self, table_area=None, columns=None, blocksize=15,
+                 threshold_constant=-2, line_threshold=100, dpi=300, lang="eng",
+                 debug=False):
+
+        self.method = 'ocrs'
+        self.table_area = table_area
+        self.columns = columns
+        self.blocksize = blocksize
+        self.threshold_constant = threshold_constant
+        self.line_threshold = line_threshold
+        # Fall back to None when no OCR backend is installed instead of
+        # raising IndexError here; get_tables checks for None and bails out.
+        available_tools = pyocr.get_available_tools()
+        self.tool = available_tools[0] if available_tools else None
+        self.dpi = dpi
+        self.lang = lang
+        self.debug = debug
+
+    def get_tables(self, pdfname):
+        """Renders the pdf to a png, cuts it into cells and OCRs each cell.
+
+        Columns come from the user-supplied `columns`, rows from
+        find_cuts on the thresholded table image.
+
+        Parameters
+        ----------
+        pdfname : string
+            Path to the pdf file; the png is written next to it.
+
+        Returns
+        -------
+        page : dict
+            {basename: {'table-N': {'data': list}}}, or None when no OCR
+            tool is available or when debug is set.
+
+        Raises
+        ------
+        ValueError
+            If table_area and columns have different lengths.
+        NotImplementedError
+            If a table is processed while columns is None.
+        """
+        if self.tool is None:
+            return None
+
+        bname, __ = os.path.splitext(pdfname)
+        imagename = ''.join([bname, '.png'])
+
+        gs_call = [
+            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
+            pdfname
+        ]
+        if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
+            gs_call.insert(0, "gs")
+        else:
+            gs_call.insert(0, "gsc")
+        # close the devnull handle once ghostscript is done with it
+        with open(os.devnull, 'w') as devnull:
+            subprocess.call(gs_call, stdout=devnull, stderr=subprocess.STDOUT)
+
+        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
+                                            c=self.threshold_constant)
+        height, width = threshold.shape
+        if self.debug:
+            self.debug_images = img
+            return None
+
+        if self.table_area is not None:
+            if self.columns is not None:
+                if len(self.table_area) != len(self.columns):
+                    raise ValueError("Length of table area and columns should be equal.")
+
+            table_bbox = {}
+            for area in self.table_area:
+                x1, y1, x2, y2 = [int(v) for v in area.split(",")]
+                table_bbox[(x1, y1, x2, y2)] = None
+        else:
+            table_bbox = {(0, 0, width, height): None}
+
+        page = {}
+        tables = {}
+        table_no = 0
+        # NOTE(review): areas are visited in ascending y1 order while
+        # columns[table_no] is taken positionally, so both lists appear to
+        # need the same top-to-bottom order -- confirm this is intended.
+        for k in sorted(table_bbox.keys(), key=lambda x: x[1]):
+            if self.columns is None:
+                raise NotImplementedError
+
+            table_data = {}
+            table_image = threshold[k[1]:k[3], k[0]:k[2]]
+            # column separators plus the area's own left/right edges, shifted
+            # so that x is relative to the cropped table image
+            edges = [k[0]]
+            edges.extend(float(c) for c in self.columns[table_no].split(','))
+            edges.append(k[2])
+            cols = [(left - k[0], right - k[0])
+                    for left, right in zip(edges, edges[1:])]
+            y_cuts = find_cuts(table_image, line_threshold=self.line_threshold)
+            rows = list(zip(y_cuts, y_cuts[1:]))
+            table = Table(cols, rows)
+            for cell_row in table.cells:
+                for cell in cell_row:
+                    x1 = int(cell.x1)
+                    y1 = int(cell.y1)
+                    x2 = int(cell.x2)
+                    y2 = int(cell.y2)
+                    cell.image = table_image[y1:y2, x1:x2]
+                    text = self.tool.image_to_string(
+                        Image.fromarray(cell.image),
+                        lang=self.lang,
+                        builder=pyocr.builders.TextBuilder()
+                    )
+                    cell.add_text(text)
+            ar = table.get_list()
+            ar.reverse()
+            ar = encode_list(ar)
+            ar = remove_empty(ar)
+            table_data['data'] = ar
+            tables['table-{0}'.format(table_no + 1)] = table_data
+            table_no += 1
+        page[os.path.basename(bname)] = tables
+
+        return page
\ No newline at end of file
diff --git a/camelot/pdf.py b/camelot/pdf.py
index f9ca4eb..7c6ae0f 100644
--- a/camelot/pdf.py
+++ b/camelot/pdf.py
@@ -141,11 +141,14 @@ class Pdf:
if self.extractor.method == 'stream':
self.debug = self.extractor.debug
self.debug_text = []
- elif self.extractor.method in ['lattice', 'ocr']:
+ elif self.extractor.method in ['lattice', 'ocrl']:
self.debug = self.extractor.debug
self.debug_images = []
self.debug_segments = []
self.debug_tables = []
+ elif self.extractor.method == 'ocrs':
+ self.debug = self.extractor.debug
+ self.debug_images = []
for p in pages:
table = self.extractor.get_tables(p)
if table is not None:
@@ -157,6 +160,8 @@ class Pdf:
self.debug_images.append(self.extractor.debug_images)
self.debug_segments.append(self.extractor.debug_segments)
self.debug_tables.append(self.extractor.debug_tables)
+ elif self.extractor.method == 'ocrs':
+ self.debug_images.append(self.extractor.debug_images)
if self.clean:
self.remove_tempdir()
return tables
@@ -175,7 +180,7 @@ class Pdf:
import matplotlib.patches as patches
if self.debug is True:
- try:
+ if hasattr(self, 'debug_text'):
for text in self.debug_text:
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
@@ -193,8 +198,10 @@ class Pdf:
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
plt.show()
- except AttributeError:
- raise ValueError("This option only be used with Stream.")
+ elif hasattr(self, 'debug_images'):
+ for img in self.debug_images:
+ plt.imshow(img)
+ plt.show()
elif self.debug == 'contour':
try:
for img, table_bbox in self.debug_images:
diff --git a/camelot/stream.py b/camelot/stream.py
index 95c99d3..96dfedd 100644
--- a/camelot/stream.py
+++ b/camelot/stream.py
@@ -236,10 +236,6 @@ class Stream:
x-coordinates in PDFMiner's coordinate space.
(optional, default: None)
- headers : list
- List of strings where each string is a csv header for a table.
- (optional, default: None)
-
ytol : list
List of ints specifying the y-tolerance parameters.
(optional, default: [2])
@@ -268,14 +264,13 @@ class Stream:
LTTextLineHorizontals in order to select table_area, columns.
(optional, default: False)
"""
- def __init__(self, table_area=None, columns=None, headers=None,
- ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
- split_text=False, flag_size=True, debug=False):
+ def __init__(self, table_area=None, columns=None, ytol=[2], mtol=[0],
+ margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
+ debug=False):
self.method = 'stream'
self.table_area = table_area
self.columns = columns
- self.headers = headers
self.ytol = ytol
self.mtol = mtol
self.char_margin, self.line_margin, self.word_margin = margins
@@ -312,14 +307,12 @@ class Stream:
self.debug_text = []
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
+ return None
if self.table_area is not None:
if self.columns is not None:
if len(self.table_area) != len(self.columns):
- raise ValueError("Length of columns should be equal to table_area.")
- if self.headers is not None:
- if len(self.table_area) != len(self.headers):
- raise ValueError("Length of headers should be equal to table_area.")
+ raise ValueError("Length of table area and columns should be equal.")
table_bbox = {}
for area in self.table_area:
@@ -336,6 +329,7 @@ class Stream:
ytolerance = copy.deepcopy(self.ytol) * len(table_bbox)
else:
ytolerance = copy.deepcopy(self.ytol)
+
if len(self.mtol) == 1 and self.mtol[0] == 0:
mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
else:
@@ -374,7 +368,7 @@ class Stream:
guess = True
ncols = max(set(elements), key=elements.count)
len_non_mode = len(filter(lambda x: x != ncols, elements))
- if ncols == 1 and not self.debug:
+ if ncols == 1:
# no tables detected
logger.warning("{}: Only one column was detected, the pdf"
" may have no tables.".format(
@@ -396,15 +390,6 @@ class Stream:
cols = _add_columns(cols, inner_text, ytolerance[table_no])
cols = _join_columns(cols, text_x_min, text_x_max)
- if self.headers is not None and self.headers[table_no] != [""]:
- self.headers[table_no] = self.headers[table_no].split(',')
- if len(self.headers[table_no]) != len(cols):
- logger.warning("Length of header ({0}) specified for table is not"
- " equal to the number of columns ({1}) detected.".format(
- len(self.headers[table_no]), len(cols)))
- while len(self.headers[table_no]) != len(cols):
- self.headers[table_no].append('')
-
table = Table(cols, rows)
table = table.set_all_edges()
assignment_errors = []
@@ -429,8 +414,6 @@ class Stream:
table_data['score'] = score
ar = table.get_list()
- if self.headers is not None and self.headers[table_no] != ['']:
- ar.insert(0, self.headers[table_no])
ar = encode_list(ar)
table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
diff --git a/camelot/table.py b/camelot/table.py
index 8549300..fc1a45e 100644
--- a/camelot/table.py
+++ b/camelot/table.py
@@ -188,38 +188,32 @@ class Table:
bound = self.cells[r][c].get_bounded_edges()
if bound == 4:
continue
-
elif bound == 3:
if not self.cells[r][c].left:
if (self.cells[r][c].right and
self.cells[r][c].top and
self.cells[r][c].bottom):
self.cells[r][c].spanning_h = True
-
elif not self.cells[r][c].right:
if (self.cells[r][c].left and
self.cells[r][c].top and
self.cells[r][c].bottom):
self.cells[r][c].spanning_h = True
-
elif not self.cells[r][c].top:
if (self.cells[r][c].left and
self.cells[r][c].right and
self.cells[r][c].bottom):
self.cells[r][c].spanning_v = True
-
elif not self.cells[r][c].bottom:
if (self.cells[r][c].left and
self.cells[r][c].right and
self.cells[r][c].top):
self.cells[r][c].spanning_v = True
-
elif bound == 2:
if self.cells[r][c].left and self.cells[r][c].right:
if (not self.cells[r][c].top and
not self.cells[r][c].bottom):
self.cells[r][c].spanning_v = True
-
elif self.cells[r][c].top and self.cells[r][c].bottom:
if (not self.cells[r][c].left and
not self.cells[r][c].right):
diff --git a/camelot/utils.py b/camelot/utils.py
index d128740..c5e958c 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -426,40 +426,43 @@ def split_textline(table, textline, direction, flag_size=True):
idx = 0
cut_text = []
bbox = textline.bbox
- if direction == 'horizontal' and not textline.is_empty():
- x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
- r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
- r = r_idx[0]
- x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
- if not x_cuts:
- x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
- for obj in textline._objs:
- row = table.rows[r]
- for cut in x_cuts:
- if isinstance(obj, LTChar):
- if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
- (obj.x0 + obj.x1) / 2 <= cut[1]):
+ try:
+ if direction == 'horizontal' and not textline.is_empty():
+ x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
+ r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
+ r = r_idx[0]
+ x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
+ if not x_cuts:
+ x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
+ for obj in textline._objs:
+ row = table.rows[r]
+ for cut in x_cuts:
+ if isinstance(obj, LTChar):
+ if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
+ (obj.x0 + obj.x1) / 2 <= cut[1]):
+ cut_text.append((r, cut[0], obj))
+ break
+ elif isinstance(obj, LTAnno):
cut_text.append((r, cut[0], obj))
- break
- elif isinstance(obj, LTAnno):
- cut_text.append((r, cut[0], obj))
- elif direction == 'vertical' and not textline.is_empty():
- y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
- c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
- c = c_idx[0]
- y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
- if not y_cuts:
- y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
- for obj in textline._objs:
- col = table.cols[c]
- for cut in y_cuts:
- if isinstance(obj, LTChar):
- if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
- (obj.y0 + obj.y1) / 2 >= cut[1]):
+ elif direction == 'vertical' and not textline.is_empty():
+ y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
+ c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
+ c = c_idx[0]
+ y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
+ if not y_cuts:
+ y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
+ for obj in textline._objs:
+ col = table.cols[c]
+ for cut in y_cuts:
+ if isinstance(obj, LTChar):
+ if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
+ (obj.y0 + obj.y1) / 2 >= cut[1]):
+ cut_text.append((cut[0], c, obj))
+ break
+ elif isinstance(obj, LTAnno):
cut_text.append((cut[0], c, obj))
- break
- elif isinstance(obj, LTAnno):
- cut_text.append((cut[0], c, obj))
+ except IndexError:
+ return [(-1, -1, textline.get_text())]
grouped_chars = []
for key, chars in groupby(cut_text, itemgetter(0, 1)):
if flag_size:
diff --git a/tools/camelot b/tools/camelot
index ea9d396..54a6de1 100755
--- a/tools/camelot
+++ b/tools/camelot
@@ -18,7 +18,7 @@ from PyPDF2 import PdfFileReader
from camelot.pdf import Pdf
from camelot.lattice import Lattice
from camelot.stream import Stream
-from camelot.ocr import OCR
+from camelot.ocr import OCRLattice, OCRStream
from camelot import utils
@@ -54,7 +54,8 @@ options:
camelot methods:
lattice Looks for lines between data.
stream Looks for spaces between data.
- ocr Looks for lines in image based pdfs.
+ ocrl Lattice, but for images.
+ ocrs Stream, but for images.
See 'camelot -h' for more information on a specific method.
"""
@@ -63,20 +64,22 @@ lattice_doc = """
Lattice method looks for lines between text to form a table.
usage:
- camelot lattice [-t ...] [-F ...] [-H ...]
- [-m ...] [options] [--]
+ camelot lattice [-t ...] [-F ...] [-m ...]
+ [-j ...] [options] [--]
options:
-t, --tarea Specific table areas to analyze.
-F, --fill Fill data in horizontal and/or vertical spanning
cells. Example: -F h, -F v, -F hv
- -H, --header Specify header for each table.
-m, --mtol Tolerance to account for when merging lines
which are very close. [default: 2]
+ -j, --jtol Tolerance to account for when matching line endings
+ with intersections. [default: 2]
-b, --blocksize See adaptive threshold doc. [default: 15]
- -c, --constant See adaptive threshold doc. [default: -2]
+ -C, --constant See adaptive threshold doc. [default: -2]
-s, --scale Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15]
+ -I, --iterations Number of iterations for dilation. [default: 2]
-i, --invert Invert pdf image to make sure that lines are
in foreground.
-T, --shift_text Specify where the text in a spanning cell
@@ -89,41 +92,61 @@ stream_doc = """
Stream method looks for whitespaces between text to form a table.
usage:
- camelot stream [-t ...] [-c ...] [-H ...]
- [-y ...] [-m ...] [options] [--]
+ camelot stream [-t ...] [-c ...] [-m ...]
+ [-y ...] [options] [--]
options:
-t, --tarea Specific table areas to analyze.
-c, --columns Comma-separated list of column x-coordinates.
Example: -c 10.1,20.2,30.3
- -H, --header Specify header for each table.
- -y, --ytol Tolerance to account for when grouping rows
- together. [default: 2]
-m, --mtol Tolerance to account for when merging columns
together. [default: 0]
+ -y, --ytol Tolerance to account for when grouping rows
+ together. [default: 2]
-d, --debug Debug by visualizing textboxes.
"""
-ocr_doc = """
-OCR method looks for lines in image based pdfs.
+ocrl_doc = """
+Lattice, but for images.
usage:
- camelot ocr [-t ] [-m ] [options] [--]
+ camelot ocrl [-t ...] [-m ...] [options] [--]
options:
- -t, --tarea Specific table areas to analyze.
- -m, --mtol Tolerance to account for when merging lines
- which are very close. [default: 2]
- -b, --blocksize See adaptive threshold doc. [default: 15]
- -c, --constant See adaptive threshold doc. [default: -2]
- -D, --dpi Dots per inch, specify image quality to be used for OCR.
- [default: 300]
- -l, --lang Specify language to be used for OCR. [default: eng]
- -s, --scale Scaling factor. Large scaling factor leads to
- smaller lines being detected. [default: 15]
- -d, --debug Debug by visualizing pdf geometry.
- (contour,line,joint,table) Example: -d table
+ -t, --tarea Specific table areas to analyze.
+ -m, --mtol Tolerance to account for when merging lines
+ which are very close. [default: 2]
+ -b, --blocksize See adaptive threshold doc. [default: 15]
+ -C, --constant See adaptive threshold doc. [default: -2]
+ -D, --dpi Dots per inch, specify image quality to be used for OCR.
+ [default: 300]
+ -l, --lang Specify language to be used for OCR. [default: eng]
+ -s, --scale Scaling factor. Large scaling factor leads to
+ smaller lines being detected. [default: 15]
+ -I, --iterations Number of iterations for dilation. [default: 2]
+ -d, --debug Debug by visualizing pdf geometry.
+ (contour,line,joint,table) Example: -d table
+"""
+
+ocrs_doc = """
+Stream, but for images.
+
+usage:
+ camelot ocrs [-t ...] [-c ...] [options] [--]
+
+options:
+ -t, --tarea Specific table areas to analyze.
+ -c, --columns Comma-separated list of column x-coordinates.
+ Example: -c 10.1,20.2,30.3
+ -b, --blocksize See adaptive threshold doc. [default: 15]
+ -C, --constant See adaptive threshold doc. [default: -2]
+ -N, --line-threshold Maximum intensity of projections on y-axis.
+ [default: 100]
+ -D, --dpi Dots per inch, specify image quality to be used for OCR.
+ [default: 300]
+ -l, --lang Specify language to be used for OCR. [default: eng]
+ -d, --debug Debug by visualizing image.
"""
@@ -351,8 +374,10 @@ if __name__ == '__main__':
args.update(docopt(lattice_doc, argv=argv))
elif args[''] == 'stream':
args.update(docopt(stream_doc, argv=argv))
- elif args[''] == 'ocr':
- args.update(docopt(ocr_doc, argv=argv))
+ elif args[''] == 'ocrl':
+ args.update(docopt(ocrl_doc, argv=argv))
+ elif args[''] == 'ocrs':
+ args.update(docopt(ocrs_doc, argv=argv))
filename = args['']
filedir = os.path.dirname(args[''])
@@ -392,11 +417,12 @@ if __name__ == '__main__':
kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None,
'fill': args['--fill'] if args['--fill'] else None,
- 'headers': args['--header'] if args['--header'] else None,
'mtol': [int(m) for m in args['--mtol']],
+ 'jtol': [int(j) for j in args['--jtol']],
'blocksize': int(args['--blocksize']),
'threshold_constant': float(args['--constant']),
'scale': int(args['--scale']),
+ 'iterations': int(args['--iterations']),
'invert': args['--invert'],
'margins': margins,
'split_text': args['--split_text'],
@@ -462,7 +488,6 @@ if __name__ == '__main__':
kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None,
'columns': args['--columns'] if args['--columns'] else None,
- 'headers': args['--header'] if args['--header'] else None,
'ytol': [int(y) for y in args['--ytol']],
'mtol': [int(m) for m in args['--mtol']],
'margins': margins,
@@ -522,7 +547,7 @@ if __name__ == '__main__':
except Exception as e:
logger.exception(e.message, exc_info=True)
sys.exit()
- elif args[''] == 'ocr':
+ elif args[''] == 'ocrl':
try:
kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None,
@@ -532,9 +557,75 @@ if __name__ == '__main__':
'dpi': int(args['--dpi']),
'lang': args['--lang'],
'scale': int(args['--scale']),
+ 'iterations': int(args['--iterations']),
'debug': args['--debug']
}
- manager = Pdf(OCR(**kwargs), filename, pagenos=p, clean=True,
+ manager = Pdf(OCRLattice(**kwargs), filename, pagenos=p, clean=True,
+ parallel=args['--parallel'])
+ data = manager.extract()
+
+ processing_time = time.time() - start_time
+ logger.info("Finished processing in " + str(processing_time) + " seconds")
+
+ if args['--plot']:
+ if args['--output']:
+ pngname = os.path.join(args['--output'], os.path.basename(pngname))
+ plot_type = args['--plot'].split(',')
+ if 'page' in plot_type:
+ for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
+ page = data[page_number]
+ for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
+ table = page[table_number]
+ plot_table_barchart(table['r_nempty_cells'],
+ table['c_nempty_cells'],
+ table['empty_p'],
+ page_number,
+ table_number)
+
+ if 'all' in plot_type:
+ plot_all_barchart(data, pngname)
+
+ if 'rc' in plot_type:
+ plot_rc_piechart(data, pngname)
+
+ if args['--print-stats']:
+ print_stats(data, processing_time)
+
+ if args['--save-stats']:
+ if args['--output']:
+ scorename = os.path.join(args['--output'], os.path.basename(scorename))
+ with open(scorename, 'w') as score_file:
+ score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
+ for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
+ page = data[page_number]
+ for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
+ table = page[table_number]
+ score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
+ ''.join([page_number, '_', table_number]),
+ table['nrows'],
+ table['ncols'],
+ table['empty_p'],
+ table['line_p'],
+ table['text_p'],
+ table['score']))
+ if args['--debug']:
+ manager.debug_plot()
+ except Exception as e:
+ logger.exception(e.message, exc_info=True)
+ sys.exit()
+ elif args[''] == 'ocrs':
+ try:
+ kwargs = {
+ 'table_area': args['--tarea'] if args['--tarea'] else None,
+ 'columns': args['--columns'] if args['--columns'] else None,
+ 'blocksize': int(args['--blocksize']),
+ 'threshold_constant': float(args['--constant']),
+ 'line_threshold': int(args['--line-threshold']),
+ 'dpi': int(args['--dpi']),
+ 'lang': args['--lang'],
+ 'debug': args['--debug']
+ }
+ manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,
parallel=args['--parallel'])
data = manager.extract()
@@ -588,7 +679,7 @@ if __name__ == '__main__':
logger.exception(e.message, exc_info=True)
sys.exit()
- if args['--debug']:
+ if args.get('--debug') is not None and args['--debug']:
print("See 'camelot -h' for various parameters you can tweak.")
else:
output = filedir if args['--output'] is None else args['--output']