Remove ocr
parent
9753889ea2
commit
72c42c74db
|
|
@ -1,3 +1,3 @@
|
||||||
__version__ = '1.2.0'
|
__version__ = '1.2.0'
|
||||||
|
|
||||||
__all__ = ['pdf', 'lattice', 'stream', 'ocr']
|
__all__ = ['pdf', 'lattice', 'stream']
|
||||||
|
|
|
||||||
331
camelot/ocr.py
331
camelot/ocr.py
|
|
@ -1,331 +0,0 @@
|
||||||
import os
|
|
||||||
import copy
|
|
||||||
import logging
|
|
||||||
import subprocess
|
|
||||||
|
|
||||||
import pyocr
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
from .table import Table
|
|
||||||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
|
||||||
find_table_joints, remove_lines, find_cuts)
|
|
||||||
from .utils import merge_close_values, encode_list
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['OCRLattice', 'OCRStream']
|
|
||||||
logger = logging.getLogger('app_logger')
|
|
||||||
|
|
||||||
|
|
||||||
class OCRLattice:
|
|
||||||
"""Lattice, but for images.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
table_area : list
|
|
||||||
List of strings of the form x1,y1,x2,y2 where
|
|
||||||
(x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
|
|
||||||
coordinate space, denoting table areas to analyze.
|
|
||||||
(optional, default: None)
|
|
||||||
|
|
||||||
mtol : list
|
|
||||||
List of ints specifying m-tolerance parameters.
|
|
||||||
(optional, default: [2])
|
|
||||||
|
|
||||||
blocksize : int
|
|
||||||
Size of a pixel neighborhood that is used to calculate a
|
|
||||||
threshold value for the pixel: 3, 5, 7, and so on.
|
|
||||||
(optional, default: 15)
|
|
||||||
|
|
||||||
threshold_constant : float
|
|
||||||
Constant subtracted from the mean or weighted mean
|
|
||||||
(see the details below). Normally, it is positive but may be
|
|
||||||
zero or negative as well.
|
|
||||||
(optional, default: -2)
|
|
||||||
|
|
||||||
dpi : int
|
|
||||||
Dots per inch.
|
|
||||||
(optional, default: 300)
|
|
||||||
|
|
||||||
layout : int
|
|
||||||
Tesseract page segmentation mode.
|
|
||||||
(optional, default: 7)
|
|
||||||
|
|
||||||
lang : string
|
|
||||||
Language to be used for OCR.
|
|
||||||
(optional, default: 'eng')
|
|
||||||
|
|
||||||
scale : int
|
|
||||||
Used to divide the height/width of a pdf to get a structuring
|
|
||||||
element for image processing.
|
|
||||||
(optional, default: 15)
|
|
||||||
|
|
||||||
iterations : int
|
|
||||||
Number of iterations for dilation.
|
|
||||||
(optional, default: 0)
|
|
||||||
|
|
||||||
debug : string
|
|
||||||
{'contour', 'line', 'joint', 'table'}
|
|
||||||
Set to one of the above values to generate a matplotlib plot
|
|
||||||
of detected contours, lines, joints and the table generated.
|
|
||||||
(optional, default: None)
|
|
||||||
"""
|
|
||||||
def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
|
|
||||||
dpi=300, layout=7, lang="eng", scale=15, iterations=0, debug=None):
|
|
||||||
|
|
||||||
self.method = 'ocrl'
|
|
||||||
self.table_area = table_area
|
|
||||||
self.mtol = mtol
|
|
||||||
self.blocksize = blocksize
|
|
||||||
self.threshold_constant = threshold_constant
|
|
||||||
self.tool = pyocr.get_available_tools()[0] # fix this
|
|
||||||
self.dpi = dpi
|
|
||||||
self.layout = layout
|
|
||||||
self.lang = lang
|
|
||||||
self.scale = scale
|
|
||||||
self.iterations = iterations
|
|
||||||
self.debug = debug
|
|
||||||
|
|
||||||
def get_tables(self, pdfname):
|
|
||||||
if self.tool is None:
|
|
||||||
return None
|
|
||||||
|
|
||||||
bname, __ = os.path.splitext(pdfname)
|
|
||||||
imagename = ''.join([bname, '.png'])
|
|
||||||
logger.info('Processing {0}.'.format(os.path.basename(bname)))
|
|
||||||
|
|
||||||
gs_call = [
|
|
||||||
"-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
|
|
||||||
pdfname
|
|
||||||
]
|
|
||||||
if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
|
|
||||||
gs_call.insert(0, "gs")
|
|
||||||
else:
|
|
||||||
gs_call.insert(0, "gsc")
|
|
||||||
subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
|
|
||||||
stderr=subprocess.STDOUT)
|
|
||||||
|
|
||||||
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
|
|
||||||
c=self.threshold_constant)
|
|
||||||
vmask, v_segments = find_lines(threshold, direction='vertical',
|
|
||||||
scale=self.scale, iterations=self.iterations)
|
|
||||||
hmask, h_segments = find_lines(threshold, direction='horizontal',
|
|
||||||
scale=self.scale, iterations=self.iterations)
|
|
||||||
|
|
||||||
if self.table_area is not None:
|
|
||||||
areas = []
|
|
||||||
for area in self.table_area:
|
|
||||||
x1, y1, x2, y2 = area.split(",")
|
|
||||||
x1 = int(float(x1))
|
|
||||||
y1 = int(float(y1))
|
|
||||||
x2 = int(float(x2))
|
|
||||||
y2 = int(float(y2))
|
|
||||||
areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
|
|
||||||
table_bbox = find_table_joints(areas, vmask, hmask)
|
|
||||||
else:
|
|
||||||
contours = find_table_contours(vmask, hmask)
|
|
||||||
table_bbox = find_table_joints(contours, vmask, hmask)
|
|
||||||
|
|
||||||
if self.debug:
|
|
||||||
self.debug_images = (img, table_bbox)
|
|
||||||
self.debug_segments = (v_segments, h_segments)
|
|
||||||
self.debug_tables = []
|
|
||||||
|
|
||||||
if len(self.mtol) == 1 and self.mtol[0] == 2:
|
|
||||||
mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
|
|
||||||
else:
|
|
||||||
mtolerance = copy.deepcopy(self.mtol)
|
|
||||||
|
|
||||||
page = {}
|
|
||||||
tables = {}
|
|
||||||
table_no = 0
|
|
||||||
for k in sorted(table_bbox.keys(), key=lambda x: x[1]):
|
|
||||||
table_data = {}
|
|
||||||
cols, rows = zip(*table_bbox[k])
|
|
||||||
cols, rows = list(cols), list(rows)
|
|
||||||
cols.extend([k[0], k[2]])
|
|
||||||
rows.extend([k[1], k[3]])
|
|
||||||
cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no])
|
|
||||||
rows = merge_close_values(sorted(rows, reverse=True), mtol=mtolerance[table_no])
|
|
||||||
cols = [(cols[i], cols[i + 1])
|
|
||||||
for i in range(0, len(cols) - 1)]
|
|
||||||
rows = [(rows[i], rows[i + 1])
|
|
||||||
for i in range(0, len(rows) - 1)]
|
|
||||||
table = Table(cols, rows)
|
|
||||||
if self.debug:
|
|
||||||
self.debug_tables.append(table)
|
|
||||||
table.image = img[k[3]:k[1],k[0]:k[2]]
|
|
||||||
for i in range(len(table.cells)):
|
|
||||||
for j in range(len(table.cells[i])):
|
|
||||||
x1 = int(table.cells[i][j].x1)
|
|
||||||
y1 = int(table.cells[i][j].y1)
|
|
||||||
x2 = int(table.cells[i][j].x2)
|
|
||||||
y2 = int(table.cells[i][j].y2)
|
|
||||||
table.cells[i][j].image = img[y1:y2,x1:x2]
|
|
||||||
text = self.tool.image_to_string(
|
|
||||||
Image.fromarray(table.cells[i][j].image),
|
|
||||||
lang=self.lang,
|
|
||||||
builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
|
|
||||||
)
|
|
||||||
table.cells[i][j].add_text(text)
|
|
||||||
ar = table.get_list()
|
|
||||||
ar.reverse()
|
|
||||||
ar = encode_list(ar)
|
|
||||||
table_data['data'] = ar
|
|
||||||
tables['table-{0}'.format(table_no + 1)] = table_data
|
|
||||||
table_no += 1
|
|
||||||
page[os.path.basename(bname)] = tables
|
|
||||||
|
|
||||||
if self.debug:
|
|
||||||
return None
|
|
||||||
|
|
||||||
return page
|
|
||||||
|
|
||||||
|
|
||||||
class OCRStream:
|
|
||||||
"""Stream, but for images.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
table_area : list
|
|
||||||
List of strings of the form x1,y1,x2,y2 where
|
|
||||||
(x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
|
|
||||||
coordinate space, denoting table areas to analyze.
|
|
||||||
(optional, default: None)
|
|
||||||
|
|
||||||
columns : list
|
|
||||||
List of strings where each string is comma-separated values of
|
|
||||||
x-coordinates in OpenCV's coordinate space.
|
|
||||||
(optional, default: None)
|
|
||||||
|
|
||||||
blocksize : int
|
|
||||||
Size of a pixel neighborhood that is used to calculate a
|
|
||||||
threshold value for the pixel: 3, 5, 7, and so on.
|
|
||||||
(optional, default: 15)
|
|
||||||
|
|
||||||
threshold_constant : float
|
|
||||||
Constant subtracted from the mean or weighted mean
|
|
||||||
(see the details below). Normally, it is positive but may be
|
|
||||||
zero or negative as well.
|
|
||||||
(optional, default: -2)
|
|
||||||
|
|
||||||
dpi : int
|
|
||||||
Dots per inch.
|
|
||||||
(optional, default: 300)
|
|
||||||
|
|
||||||
layout : int
|
|
||||||
Tesseract page segmentation mode.
|
|
||||||
(optional, default: 7)
|
|
||||||
|
|
||||||
lang : string
|
|
||||||
Language to be used for OCR.
|
|
||||||
(optional, default: 'eng')
|
|
||||||
|
|
||||||
line_scale : int
|
|
||||||
Line scaling factor.
|
|
||||||
(optional, default: 15)
|
|
||||||
|
|
||||||
char_scale : int
|
|
||||||
Char scaling factor.
|
|
||||||
(optional, default: 200)
|
|
||||||
"""
|
|
||||||
def __init__(self, table_area=None, columns=None, blocksize=15,
|
|
||||||
threshold_constant=-2, dpi=300, layout=7, lang="eng",
|
|
||||||
line_scale=15, char_scale=200, debug=False):
|
|
||||||
|
|
||||||
self.method = 'ocrs'
|
|
||||||
self.table_area = table_area
|
|
||||||
self.columns = columns
|
|
||||||
self.blocksize = blocksize
|
|
||||||
self.threshold_constant = threshold_constant
|
|
||||||
self.tool = pyocr.get_available_tools()[0] # fix this
|
|
||||||
self.dpi = dpi
|
|
||||||
self.layout = layout
|
|
||||||
self.lang = lang
|
|
||||||
self.line_scale = line_scale
|
|
||||||
self.char_scale = char_scale
|
|
||||||
self.debug = debug
|
|
||||||
|
|
||||||
def get_tables(self, pdfname):
|
|
||||||
if self.tool is None:
|
|
||||||
return None
|
|
||||||
|
|
||||||
bname, __ = os.path.splitext(pdfname)
|
|
||||||
imagename = ''.join([bname, '.png'])
|
|
||||||
logger.info('Processing {0}.'.format(os.path.basename(bname)))
|
|
||||||
|
|
||||||
gs_call = [
|
|
||||||
"-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
|
|
||||||
pdfname
|
|
||||||
]
|
|
||||||
if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
|
|
||||||
gs_call.insert(0, "gs")
|
|
||||||
else:
|
|
||||||
gs_call.insert(0, "gsc")
|
|
||||||
subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
|
|
||||||
stderr=subprocess.STDOUT)
|
|
||||||
|
|
||||||
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
|
|
||||||
c=self.threshold_constant)
|
|
||||||
threshold = remove_lines(threshold, line_scale=self.line_scale)
|
|
||||||
height, width = threshold.shape
|
|
||||||
if self.debug:
|
|
||||||
self.debug_images = img
|
|
||||||
return None
|
|
||||||
|
|
||||||
if self.table_area is not None:
|
|
||||||
if self.columns is not None:
|
|
||||||
if len(self.table_area) != len(self.columns):
|
|
||||||
raise ValueError("{0}: Length of table area and columns"
|
|
||||||
" should be equal.".format(os.path.basename(bname)))
|
|
||||||
|
|
||||||
table_bbox = {}
|
|
||||||
for area in self.table_area:
|
|
||||||
x1, y1, x2, y2 = area.split(",")
|
|
||||||
x1 = int(float(x1))
|
|
||||||
y1 = int(float(y1))
|
|
||||||
x2 = int(float(x2))
|
|
||||||
y2 = int(float(y2))
|
|
||||||
table_bbox[(x1, y1, x2, y2)] = None
|
|
||||||
else:
|
|
||||||
table_bbox = {(0, 0, width, height): None}
|
|
||||||
|
|
||||||
page = {}
|
|
||||||
tables = {}
|
|
||||||
table_no = 0
|
|
||||||
for k in sorted(table_bbox.keys(), key=lambda x: x[1]):
|
|
||||||
if self.columns is None:
|
|
||||||
raise NotImplementedError
|
|
||||||
else:
|
|
||||||
table_data = {}
|
|
||||||
table_image = threshold[k[1]:k[3],k[0]:k[2]]
|
|
||||||
cols = self.columns[table_no].split(',')
|
|
||||||
cols = [float(c) for c in cols]
|
|
||||||
cols.insert(0, k[0])
|
|
||||||
cols.append(k[2])
|
|
||||||
cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)]
|
|
||||||
y_cuts = find_cuts(table_image, char_scale=self.char_scale)
|
|
||||||
rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)]
|
|
||||||
table = Table(cols, rows)
|
|
||||||
for i in range(len(table.cells)):
|
|
||||||
for j in range(len(table.cells[i])):
|
|
||||||
x1 = int(table.cells[i][j].x1)
|
|
||||||
y1 = int(table.cells[i][j].y1)
|
|
||||||
x2 = int(table.cells[i][j].x2)
|
|
||||||
y2 = int(table.cells[i][j].y2)
|
|
||||||
table.cells[i][j].image = table_image[y1:y2,x1:x2]
|
|
||||||
cell_image = Image.fromarray(table.cells[i][j].image)
|
|
||||||
text = self.tool.image_to_string(
|
|
||||||
cell_image,
|
|
||||||
lang=self.lang,
|
|
||||||
builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
|
|
||||||
)
|
|
||||||
table.cells[i][j].add_text(text)
|
|
||||||
ar = table.get_list()
|
|
||||||
ar.reverse()
|
|
||||||
ar = encode_list(ar)
|
|
||||||
table_data['data'] = ar
|
|
||||||
tables['table-{0}'.format(table_no + 1)] = table_data
|
|
||||||
table_no += 1
|
|
||||||
page[os.path.basename(bname)] = tables
|
|
||||||
|
|
||||||
return page
|
|
||||||
186
tools/camelot
186
tools/camelot
|
|
@ -18,7 +18,6 @@ from PyPDF2 import PdfFileReader
|
||||||
from camelot.pdf import Pdf
|
from camelot.pdf import Pdf
|
||||||
from camelot.lattice import Lattice
|
from camelot.lattice import Lattice
|
||||||
from camelot.stream import Stream
|
from camelot.stream import Stream
|
||||||
from camelot.ocr import OCRLattice, OCRStream
|
|
||||||
from camelot import utils
|
from camelot import utils
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -54,8 +53,6 @@ options:
|
||||||
camelot methods:
|
camelot methods:
|
||||||
lattice Looks for lines between data.
|
lattice Looks for lines between data.
|
||||||
stream Looks for spaces between data.
|
stream Looks for spaces between data.
|
||||||
ocrl Lattice, but for images.
|
|
||||||
ocrs Stream, but for images.
|
|
||||||
|
|
||||||
See 'camelot <method> -h' for more information on a specific method.
|
See 'camelot <method> -h' for more information on a specific method.
|
||||||
"""
|
"""
|
||||||
|
|
@ -107,51 +104,6 @@ options:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
ocrl_doc = """
|
|
||||||
Lattice, but for images.
|
|
||||||
|
|
||||||
usage:
|
|
||||||
camelot ocrl [-t <tarea>...] [-m <mtol>...] [options] [--] <file>
|
|
||||||
|
|
||||||
options:
|
|
||||||
-t, --tarea <tarea> Specific table areas to analyze.
|
|
||||||
-m, --mtol <mtol> Tolerance to account for when merging lines
|
|
||||||
which are very close. [default: 2]
|
|
||||||
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
|
|
||||||
-C, --constant <constant> See adaptive threshold doc. [default: -2]
|
|
||||||
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
|
||||||
[default: 300]
|
|
||||||
-g, --layout <layout> Tesseract page segmentation mode. [default: 7]
|
|
||||||
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
|
||||||
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
|
||||||
smaller lines being detected. [default: 15]
|
|
||||||
-I, --iterations <iterations> Number of iterations for dilation. [default: 0]
|
|
||||||
-d, --debug <debug> Debug by visualizing pdf geometry.
|
|
||||||
(contour,line,joint,table) Example: -d table
|
|
||||||
"""
|
|
||||||
|
|
||||||
ocrs_doc = """
|
|
||||||
Stream, but for images.
|
|
||||||
|
|
||||||
usage:
|
|
||||||
camelot ocrs [-t <tarea>...] [-c <columns>...] [options] [--] <file>
|
|
||||||
|
|
||||||
options:
|
|
||||||
-t, --tarea <tarea> Specific table areas to analyze.
|
|
||||||
-c, --columns <columns> Comma-separated list of column x-coordinates.
|
|
||||||
Example: -c 10.1,20.2,30.3
|
|
||||||
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
|
|
||||||
-C, --constant <constant> See adaptive threshold doc. [default: -2]
|
|
||||||
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
|
||||||
[default: 300]
|
|
||||||
-g, --layout <layout> Tesseract page segmentation mode. [default: 7]
|
|
||||||
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
|
||||||
-G, --line-scale <line_scale> Line scaling factor. [default: 15]
|
|
||||||
-S, --char-scale <char_scale> Char scaling factor. [default: 200]
|
|
||||||
-d, --debug Debug by visualizing image.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def plot_table_barchart(r, c, p, pno, tno):
|
def plot_table_barchart(r, c, p, pno, tno):
|
||||||
row_idx = [i + 1 for i, row in enumerate(r)]
|
row_idx = [i + 1 for i, row in enumerate(r)]
|
||||||
col_idx = [i + 1 for i, col in enumerate(c)]
|
col_idx = [i + 1 for i, col in enumerate(c)]
|
||||||
|
|
@ -376,10 +328,6 @@ if __name__ == '__main__':
|
||||||
args.update(docopt(lattice_doc, argv=argv))
|
args.update(docopt(lattice_doc, argv=argv))
|
||||||
elif args['<method>'] == 'stream':
|
elif args['<method>'] == 'stream':
|
||||||
args.update(docopt(stream_doc, argv=argv))
|
args.update(docopt(stream_doc, argv=argv))
|
||||||
elif args['<method>'] == 'ocrl':
|
|
||||||
args.update(docopt(ocrl_doc, argv=argv))
|
|
||||||
elif args['<method>'] == 'ocrs':
|
|
||||||
args.update(docopt(ocrs_doc, argv=argv))
|
|
||||||
|
|
||||||
filename = args['<file>']
|
filename = args['<file>']
|
||||||
filedir = os.path.dirname(args['<file>'])
|
filedir = os.path.dirname(args['<file>'])
|
||||||
|
|
@ -551,140 +499,6 @@ if __name__ == '__main__':
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception(e.message, exc_info=True)
|
logger.exception(e.message, exc_info=True)
|
||||||
sys.exit()
|
sys.exit()
|
||||||
elif args['<method>'] == 'ocrl':
|
|
||||||
try:
|
|
||||||
kwargs = {
|
|
||||||
'table_area': args['--tarea'] if args['--tarea'] else None,
|
|
||||||
'mtol': [int(m) for m in args['--mtol']],
|
|
||||||
'blocksize': int(args['--blocksize']),
|
|
||||||
'threshold_constant': float(args['--constant']),
|
|
||||||
'dpi': int(args['--dpi']),
|
|
||||||
'layout': int(args['--layout']),
|
|
||||||
'lang': args['--lang'],
|
|
||||||
'scale': int(args['--scale']),
|
|
||||||
'iterations': int(args['--iterations']),
|
|
||||||
'debug': args['--debug']
|
|
||||||
}
|
|
||||||
manager = Pdf(OCRLattice(**kwargs), filename, pagenos=p, clean=True,
|
|
||||||
parallel=args['--parallel'])
|
|
||||||
data = manager.extract()
|
|
||||||
|
|
||||||
processing_time = time.time() - start_time
|
|
||||||
logger.info("Finished processing in " + str(processing_time) + " seconds")
|
|
||||||
|
|
||||||
if args['--plot']:
|
|
||||||
if args['--output']:
|
|
||||||
pngname = os.path.join(args['--output'], os.path.basename(pngname))
|
|
||||||
plot_type = args['--plot'].split(',')
|
|
||||||
if 'page' in plot_type:
|
|
||||||
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
|
||||||
page = data[page_number]
|
|
||||||
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
|
|
||||||
table = page[table_number]
|
|
||||||
plot_table_barchart(table['r_nempty_cells'],
|
|
||||||
table['c_nempty_cells'],
|
|
||||||
table['empty_p'],
|
|
||||||
page_number,
|
|
||||||
table_number)
|
|
||||||
|
|
||||||
if 'all' in plot_type:
|
|
||||||
plot_all_barchart(data, pngname)
|
|
||||||
|
|
||||||
if 'rc' in plot_type:
|
|
||||||
plot_rc_piechart(data, pngname)
|
|
||||||
|
|
||||||
if args['--print-stats']:
|
|
||||||
print_stats(data, processing_time)
|
|
||||||
|
|
||||||
if args['--save-stats']:
|
|
||||||
if args['--output']:
|
|
||||||
scorename = os.path.join(args['--output'], os.path.basename(scorename))
|
|
||||||
with open(scorename, 'w') as score_file:
|
|
||||||
score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
|
|
||||||
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
|
||||||
page = data[page_number]
|
|
||||||
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
|
|
||||||
table = page[table_number]
|
|
||||||
score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
|
|
||||||
''.join([page_number, '_', table_number]),
|
|
||||||
table['nrows'],
|
|
||||||
table['ncols'],
|
|
||||||
table['empty_p'],
|
|
||||||
table['line_p'],
|
|
||||||
table['text_p'],
|
|
||||||
table['score']))
|
|
||||||
if args['--debug']:
|
|
||||||
manager.debug_plot()
|
|
||||||
except Exception as e:
|
|
||||||
logger.exception(e.message, exc_info=True)
|
|
||||||
sys.exit()
|
|
||||||
elif args['<method>'] == 'ocrs':
|
|
||||||
try:
|
|
||||||
kwargs = {
|
|
||||||
'table_area': args['--tarea'] if args['--tarea'] else None,
|
|
||||||
'columns': args['--columns'] if args['--columns'] else None,
|
|
||||||
'blocksize': int(args['--blocksize']),
|
|
||||||
'threshold_constant': float(args['--constant']),
|
|
||||||
'dpi': int(args['--dpi']),
|
|
||||||
'layout': int(args['--layout']),
|
|
||||||
'lang': args['--lang'],
|
|
||||||
'line_scale': int(args['--line-scale']),
|
|
||||||
'char_scale': int(args['--char-scale']),
|
|
||||||
'debug': args['--debug']
|
|
||||||
}
|
|
||||||
manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,
|
|
||||||
parallel=args['--parallel'])
|
|
||||||
data = manager.extract()
|
|
||||||
|
|
||||||
processing_time = time.time() - start_time
|
|
||||||
logger.info("Finished processing in " + str(processing_time) + " seconds")
|
|
||||||
|
|
||||||
if args['--plot']:
|
|
||||||
if args['--output']:
|
|
||||||
pngname = os.path.join(args['--output'], os.path.basename(pngname))
|
|
||||||
plot_type = args['--plot'].split(',')
|
|
||||||
if 'page' in plot_type:
|
|
||||||
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
|
||||||
page = data[page_number]
|
|
||||||
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
|
|
||||||
table = page[table_number]
|
|
||||||
plot_table_barchart(table['r_nempty_cells'],
|
|
||||||
table['c_nempty_cells'],
|
|
||||||
table['empty_p'],
|
|
||||||
page_number,
|
|
||||||
table_number)
|
|
||||||
|
|
||||||
if 'all' in plot_type:
|
|
||||||
plot_all_barchart(data, pngname)
|
|
||||||
|
|
||||||
if 'rc' in plot_type:
|
|
||||||
plot_rc_piechart(data, pngname)
|
|
||||||
|
|
||||||
if args['--print-stats']:
|
|
||||||
print_stats(data, processing_time)
|
|
||||||
|
|
||||||
if args['--save-stats']:
|
|
||||||
if args['--output']:
|
|
||||||
scorename = os.path.join(args['--output'], os.path.basename(scorename))
|
|
||||||
with open(scorename, 'w') as score_file:
|
|
||||||
score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
|
|
||||||
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
|
||||||
page = data[page_number]
|
|
||||||
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
|
|
||||||
table = page[table_number]
|
|
||||||
score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
|
|
||||||
''.join([page_number, '_', table_number]),
|
|
||||||
table['nrows'],
|
|
||||||
table['ncols'],
|
|
||||||
table['empty_p'],
|
|
||||||
table['line_p'],
|
|
||||||
table['text_p'],
|
|
||||||
table['score']))
|
|
||||||
if args['--debug']:
|
|
||||||
manager.debug_plot()
|
|
||||||
except Exception as e:
|
|
||||||
logger.exception(e.message, exc_info=True)
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
if args.get('--debug') is not None and args['--debug']:
|
if args.get('--debug') is not None and args['--debug']:
|
||||||
print("See 'camelot <method> -h' for various parameters you can tweak.")
|
print("See 'camelot <method> -h' for various parameters you can tweak.")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue