commit
9c71d87c68
|
|
@ -6,3 +6,7 @@ build/
|
|||
dist/
|
||||
*.egg-info/
|
||||
.coverage
|
||||
|
||||
.pytest_cache/
|
||||
_build/
|
||||
_static/
|
||||
|
|
|
|||
31
Makefile
31
Makefile
|
|
@ -1,31 +0,0 @@
|
|||
PYTHON ?= python
|
||||
NOSETESTS ?= nosetests
|
||||
|
||||
help:
|
||||
@echo "Please use \`make <target>' where <target> is one of"
|
||||
@echo " clean"
|
||||
@echo " dev to install in develop mode"
|
||||
@echo " undev to uninstall develop mode"
|
||||
@echo " install to install for all users"
|
||||
@echo " test to run tests"
|
||||
@echo " test-coverage to run tests with coverage report"
|
||||
|
||||
clean:
|
||||
$(PYTHON) setup.py clean
|
||||
rm -rf dist
|
||||
|
||||
dev:
|
||||
$(PYTHON) setup.py develop
|
||||
|
||||
undev:
|
||||
$(PYTHON) setup.py develop --uninstall
|
||||
|
||||
install:
|
||||
$(PYTHON) setup.py install
|
||||
|
||||
test:
|
||||
$(NOSETESTS) -s -v
|
||||
|
||||
test-coverage:
|
||||
rm -rf coverage .coverage
|
||||
$(NOSETESTS) -s -v --with-coverage
|
||||
98
README.md
98
README.md
|
|
@ -1,67 +1,31 @@
|
|||
# camelot
|
||||
# Camelot: PDF Table Parsing for Humans
|
||||
|
||||
Camelot is a Python 2.7 library and command-line tool for getting tables out of PDF files.
|
||||
Camelot is a Python 2.7 library and command-line tool for extracting tabular data from PDF files.
|
||||
|
||||
## Usage
|
||||
|
||||
<pre>
|
||||
from camelot.pdf import Pdf
|
||||
from camelot.lattice import Lattice
|
||||
|
||||
manager = Pdf(Lattice(), "/path/to/pdf")
|
||||
tables = manager.extract()
|
||||
</pre>
|
||||
|
||||
Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
|
||||
|
||||
<pre>
|
||||
Camelot: PDF parsing made simpler!
|
||||
|
||||
usage:
|
||||
camelot [options] <method> [<args>...]
|
||||
|
||||
options:
|
||||
-h, --help Show this screen.
|
||||
-v, --version Show version.
|
||||
-V, --verbose Verbose.
|
||||
-p, --pages <pageno> Comma-separated list of page numbers.
|
||||
Example: -p 1,3-6,10 [default: 1]
|
||||
-P, --parallel Parallelize the parsing process.
|
||||
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
|
||||
-l, --log Log to file.
|
||||
-o, --output <directory> Output directory.
|
||||
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
|
||||
grouped together to form a word. [default: 2.0]
|
||||
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
|
||||
grouped together to form a textbox. [default: 0.5]
|
||||
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
||||
if distance between words is greater than word
|
||||
margin. [default: 0.1]
|
||||
-J, --split_text Split text lines if they span across multiple cells.
|
||||
-K, --flag_size Flag substring if its size differs from the whole string.
|
||||
Useful for super and subscripts.
|
||||
-X, --print-stats List stats on the parsing process.
|
||||
-Y, --save-stats Save stats to a file.
|
||||
-Z, --plot <dist> Plot distributions. (page,all,rc)
|
||||
|
||||
camelot methods:
|
||||
lattice Looks for lines between data.
|
||||
stream Looks for spaces between data.
|
||||
ocrl Lattice, but for images.
|
||||
ocrs Stream, but for images.
|
||||
|
||||
See 'camelot <method> -h' for more information on a specific method.
|
||||
>>> import camelot
|
||||
>>> tables = camelot.read_pdf("foo.pdf")
|
||||
>>> tables
|
||||
<TableList n=2>
|
||||
>>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html
|
||||
>>> tables[0]
|
||||
<Table shape=(3,4)>
|
||||
>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
|
||||
>>> tables[0].parsing_report
|
||||
{
|
||||
"accuracy": 96,
|
||||
"whitespace": 80,
|
||||
"order": 1,
|
||||
"page": 1
|
||||
}
|
||||
>>> df = tables[0].df
|
||||
</pre>
|
||||
|
||||
## Dependencies
|
||||
|
||||
Currently, camelot works under Python 2.7.
|
||||
|
||||
The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ImageMagick](http://www.imagemagick.org/script/index.php).
|
||||
|
||||
### Optional
|
||||
|
||||
You'll need to install [Tesseract](https://github.com/tesseract-ocr/tesseract) if you want to extract tables from image based pdfs. Also, you'll need a tesseract language pack if your pdf isn't in english.
|
||||
The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/).
|
||||
|
||||
## Installation
|
||||
|
||||
|
|
@ -73,32 +37,32 @@ pip install -U pip setuptools
|
|||
|
||||
### Installing dependencies
|
||||
|
||||
numpy can be installed using `pip`. OpenCV and imagemagick can be installed using your system's default package manager.
|
||||
tk and ghostscript can be installed using your system's default package manager.
|
||||
|
||||
#### Linux
|
||||
|
||||
* Arch Linux
|
||||
|
||||
<pre>
|
||||
sudo pacman -S opencv imagemagick
|
||||
</pre>
|
||||
|
||||
* Ubuntu
|
||||
|
||||
<pre>
|
||||
sudo apt-get install libopencv-dev python-opencv imagemagick
|
||||
sudo apt-get install python-opencv python-tk ghostscript
|
||||
</pre>
|
||||
|
||||
* Arch Linux
|
||||
|
||||
<pre>
|
||||
sudo pacman -S opencv tk ghostscript
|
||||
</pre>
|
||||
|
||||
#### OS X
|
||||
|
||||
<pre>
|
||||
brew install homebrew/science/opencv imagemagick
|
||||
brew install homebrew/science/opencv ghostscript
|
||||
</pre>
|
||||
|
||||
Finally, `cd` into the project directory and install by
|
||||
|
||||
<pre>
|
||||
make install
|
||||
python setup.py install
|
||||
</pre>
|
||||
|
||||
## Development
|
||||
|
|
@ -113,12 +77,12 @@ git clone https://github.com/socialcopsdev/camelot.git
|
|||
|
||||
### Contributing
|
||||
|
||||
See [Contributing doc]().
|
||||
See [Contributing guidelines]().
|
||||
|
||||
### Testing
|
||||
|
||||
<pre>
|
||||
make test
|
||||
python setup.py test
|
||||
</pre>
|
||||
|
||||
## License
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
__version__ = '1.2.0'
|
||||
from .__version__ import __version__
|
||||
|
||||
__all__ = ['pdf', 'lattice', 'stream', 'ocr']
|
||||
from .io import read_pdf
|
||||
from .plotting import plot_geometry
|
||||
|
|
@ -0,0 +1 @@
|
|||
__version__ = '0.1.0'
|
||||
128
camelot/cell.py
128
camelot/cell.py
|
|
@ -1,128 +0,0 @@
|
|||
class Cell(object):
    """Cell.

    Defines a cell object with coordinates relative to a left-bottom
    origin, which is also PDFMiner's coordinate space.

    Parameters
    ----------
    x1 : float
        x-coordinate of left-bottom point.

    y1 : float
        y-coordinate of left-bottom point.

    x2 : float
        x-coordinate of right-top point.

    y2 : float
        y-coordinate of right-top point.

    Attributes
    ----------
    lb : tuple
        Tuple representing left-bottom coordinates.

    lt : tuple
        Tuple representing left-top coordinates.

    rb : tuple
        Tuple representing right-bottom coordinates.

    rt : tuple
        Tuple representing right-top coordinates.

    bbox : tuple
        Tuple representing the cell's bounding box using the
        left-bottom and right-top coordinates.

    left : bool
        Whether or not cell is bounded on the left.

    right : bool
        Whether or not cell is bounded on the right.

    top : bool
        Whether or not cell is bounded on the top.

    bottom : bool
        Whether or not cell is bounded on the bottom.

    text_objects : list
        List of text objects assigned to cell.

    text : string
        Text assigned to cell.

    spanning_h : bool
        Whether or not cell spans/extends horizontally.

    spanning_v : bool
        Whether or not cell spans/extends vertically.

    image : object
        Initialised to None; nothing in this class assigns it.
    """

    def __init__(self, x1, y1, x2, y2):
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
        # Corner points and bounding box derived from the two input points.
        self.lb = (x1, y1)
        self.lt = (x1, y2)
        self.rb = (x2, y1)
        self.rt = (x2, y2)
        self.bbox = (x1, y1, x2, y2)
        # A fresh cell has no detected edges, no text and spans nothing.
        self.left = False
        self.right = False
        self.top = False
        self.bottom = False
        self.text_objects = []
        self.text = ''
        self.spanning_h = False
        self.spanning_v = False
        self.image = None

    def add_text(self, text):
        """Adds text to cell.

        The new text is appended to whatever the cell already holds.

        Parameters
        ----------
        text : string
        """
        self.text = ''.join([self.text, text])

    def get_text(self):
        """Returns text assigned to cell.

        Returns
        -------
        text : string
        """
        return self.text

    def add_object(self, t_object):
        """Adds PDFMiner text object to cell.

        Parameters
        ----------
        t_object : object
        """
        self.text_objects.append(t_object)

    def get_objects(self):
        """Returns list of text objects assigned to cell.

        Returns
        -------
        text_objects : list
        """
        return self.text_objects

    def get_bounded_edges(self):
        """Returns the number of edges by which a cell is bounded.

        The count is also cached on the instance as ``bounded_edges``.

        Returns
        -------
        bounded_edges : int
        """
        self.bounded_edges = self.top + self.bottom + self.left + self.right
        return self.bounded_edges
|
||||
|
|
@ -0,0 +1 @@
|
|||
import click
|
||||
|
|
@ -0,0 +1,491 @@
|
|||
import os
|
||||
import json
|
||||
import zipfile
|
||||
import tempfile
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class Cell(object):
    """Defines a cell in a table with coordinates relative to a
    left-bottom origin. (pdf coordinate space)

    Parameters
    ----------
    x1 : float
        x-coordinate of left-bottom point.
    y1 : float
        y-coordinate of left-bottom point.
    x2 : float
        x-coordinate of right-top point.
    y2 : float
        y-coordinate of right-top point.

    Attributes
    ----------
    lb : tuple
        Tuple representing left-bottom coordinates.
    lt : tuple
        Tuple representing left-top coordinates.
    rb : tuple
        Tuple representing right-bottom coordinates.
    rt : tuple
        Tuple representing right-top coordinates.
    left : bool
        Whether or not cell is bounded on the left.
    right : bool
        Whether or not cell is bounded on the right.
    top : bool
        Whether or not cell is bounded on the top.
    bottom : bool
        Whether or not cell is bounded on the bottom.
    hspan : bool
        Whether or not cell spans horizontally.
    vspan : bool
        Whether or not cell spans vertically.
    text : string
        Text assigned to cell. Assigning to this attribute appends to
        the existing text instead of replacing it.
    bound : int
        The number of sides on which the cell is bounded.

    """

    def __init__(self, x1, y1, x2, y2):
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
        # Corner points derived from the two input points.
        self.lb = (x1, y1)
        self.lt = (x1, y2)
        self.rb = (x2, y1)
        self.rt = (x2, y2)
        # No edges detected and no spanning until a Table says otherwise.
        self.left = False
        self.right = False
        self.top = False
        self.bottom = False
        self.hspan = False
        self.vspan = False
        self._text = ''

    def __repr__(self):
        return '<Cell x1={} y1={} x2={} y2={}>'.format(
            self.x1, self.y1, self.x2, self.y2)

    @property
    def text(self):
        """Text accumulated in the cell so far."""
        return self._text

    @text.setter
    def text(self, t):
        # NOTE: deliberately accumulates; ``cell.text = s`` appends ``s``.
        self._text = ''.join([self._text, t])

    @property
    def bound(self):
        """The number of sides on which the cell is bounded.
        """
        return self.top + self.bottom + self.left + self.right
|
||||
|
||||
|
||||
class Table(object):
    """Defines a table with coordinates relative to a left-bottom
    origin. (pdf coordinate space)

    Parameters
    ----------
    cols : list
        List of tuples representing column x-coordinates in increasing
        order.
    rows : list
        List of tuples representing row y-coordinates in decreasing
        order.

    Attributes
    ----------
    df : object
        pandas.DataFrame
    shape : tuple
        Shape of the table.
    accuracy : float
        Accuracy with which text was assigned to the cell.
    whitespace : float
        Percentage of whitespace in the table.
    order : int
        Table number on pdf page.
    page : int
        Pdf page number.
    data : list
        Two-dimensional list of the stripped cell strings.
    parsing_report : dict
        Accuracy, whitespace, order and page of the table.

    """
    def __init__(self, cols, rows):
        self.cols = cols
        self.rows = rows
        # One Cell per (row, col); each row tuple is (top, bottom) and each
        # column tuple is (left, right), so the cell corners are
        # (left, bottom) and (right, top).
        self.cells = [[Cell(c[0], r[1], c[1], r[0])
                       for c in cols] for r in rows]
        self.df = None
        self.shape = (0, 0)
        self.accuracy = 0
        self.whitespace = 0
        self.order = None
        self.page = None

    def __repr__(self):
        return '<{} shape={}>'.format(self.__class__.__name__, self.shape)

    @property
    def data(self):
        """Returns two-dimensional list of strings in table.
        """
        return [[cell.text.strip() for cell in row] for row in self.cells]

    @property
    def parsing_report(self):
        """Returns a parsing report with accuracy, %whitespace,
        table number on page and page number.
        """
        return {
            'accuracy': self.accuracy,
            'whitespace': self.whitespace,
            'order': self.order,
            'page': self.page
        }

    def set_all_edges(self):
        """Sets all table edges to True.
        """
        for row in self.cells:
            for cell in row:
                cell.left = cell.right = cell.top = cell.bottom = True
        return self

    @staticmethod
    def _close_indices(value, spans, tol):
        """Returns indices of spans whose first coordinate is within
        tol of value.
        """
        return [idx for idx, span in enumerate(spans)
                if np.isclose(value, span[0], atol=tol)]

    def set_edges(self, vertical, horizontal, joint_close_tol=2):
        """Sets a cell's edges to True depending on whether the cell's
        coordinates overlap with the line's coordinates within a
        tolerance.

        Parameters
        ----------
        vertical : list
            List of detected vertical lines.
        horizontal : list
            List of detected horizontal lines.
        joint_close_tol : int, optional (default: 2)
            Absolute tolerance used when matching a line coordinate to
            a row/column coordinate.

        Returns
        -------
        self : Table

        """
        n_rows = len(self.rows)
        n_cols = len(self.cols)
        last_row = n_rows - 1
        last_col = n_cols - 1

        for v in vertical:
            # Match the line's x to a column start, and its two y values
            # to row starts to get the range of rows it covers.
            # NOTE(review): lines are presumably (x1, y1, x2, y2) — confirm.
            i = self._close_indices(v[0], self.cols, joint_close_tol)
            j = self._close_indices(v[3], self.rows, joint_close_tol)
            k = self._close_indices(v[1], self.rows, joint_close_tol)
            if not j:
                continue
            # Without a matching end row, the line runs to the last row.
            covered = range(j[0], k[0] if k else n_rows)
            if i == [0]:  # only left edge
                for r in covered:
                    self.cells[r][0].left = True
            elif i == []:  # only right edge
                for r in covered:
                    self.cells[r][last_col].right = True
            else:  # both left and right edges
                col = i[0]
                for r in covered:
                    self.cells[r][col].left = True
                    self.cells[r][col - 1].right = True

        for h in horizontal:
            # Match the line's y to a row start, and its two x values to
            # column starts to get the range of columns it covers.
            i = self._close_indices(h[1], self.rows, joint_close_tol)
            j = self._close_indices(h[0], self.cols, joint_close_tol)
            k = self._close_indices(h[2], self.cols, joint_close_tol)
            if not j:
                continue
            # Without a matching end column, the line runs to the last one.
            covered = range(j[0], k[0] if k else n_cols)
            if i == [0]:  # only top edge
                for c in covered:
                    self.cells[0][c].top = True
            elif i == []:  # only bottom edge
                # The line matches no row start, so it is the bottom of the
                # last row. (Previously indexed with a stale/unbound name.)
                for c in covered:
                    self.cells[last_row][c].bottom = True
            else:  # both top and bottom edges
                row = i[0]
                for c in covered:
                    self.cells[row][c].top = True
                    self.cells[row - 1][c].bottom = True

        return self

    def set_border(self):
        """Sets table border edges to True.
        """
        last_row = len(self.rows) - 1
        last_col = len(self.cols) - 1
        for r in range(len(self.rows)):
            self.cells[r][0].left = True
            self.cells[r][last_col].right = True
        for c in range(len(self.cols)):
            self.cells[0][c].top = True
            self.cells[last_row][c].bottom = True
        return self

    def set_span(self):
        """Sets a cell's hspan or vspan attribute to True depending
        on whether the cell spans horizontally or vertically.
        """
        for row in self.cells:
            for cell in row:
                left = cell.left
                right = cell.right
                top = cell.top
                bottom = cell.bottom
                bound = cell.bound
                if bound == 3:
                    # Exactly one side is open: an open left/right side
                    # means a horizontal span, open top/bottom a vertical.
                    if not left and (right and top and bottom):
                        cell.hspan = True
                    elif not right and (left and top and bottom):
                        cell.hspan = True
                    elif not top and (left and right and bottom):
                        cell.vspan = True
                    elif not bottom and (left and right and top):
                        cell.vspan = True
                elif bound == 2:
                    # Only opposite pairs count; corner-shaped cells are
                    # left untouched.
                    if left and right and (not top and not bottom):
                        cell.vspan = True
                    elif top and bottom and (not left and not right):
                        cell.hspan = True
        return self

    def to_csv(self, path, **kwargs):
        """Write Table to a comma-separated values (csv) file.

        Parameters
        ----------
        path : str
            Output filepath.
        kwargs : dict
            Passed to DataFrame.to_csv; override the defaults below.

        """
        kw = {
            'encoding': 'utf-8',
            'index': False,
            'quoting': 1
        }
        kw.update(kwargs)
        self.df.to_csv(path, **kw)

    def to_json(self, path, **kwargs):
        """Write Table to a JSON file.

        Parameters
        ----------
        path : str
            Output filepath.
        kwargs : dict
            Passed to DataFrame.to_json; override the defaults below.

        """
        kw = {
            'orient': 'records'
        }
        kw.update(kwargs)
        json_string = self.df.to_json(**kw)
        with open(path, 'w') as f:
            f.write(json_string)

    def to_excel(self, path, **kwargs):
        """Write Table to an Excel file.

        Parameters
        ----------
        path : str
            Output filepath.
        kwargs : dict
            Passed to DataFrame.to_excel; override the defaults below.

        """
        kw = {
            'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
            'encoding': 'utf-8'
        }
        kw.update(kwargs)
        writer = pd.ExcelWriter(path)
        self.df.to_excel(writer, **kw)
        writer.save()

    def to_html(self, path, **kwargs):
        """Write Table to an HTML file.

        Parameters
        ----------
        path : str
            Output filepath.
        kwargs : dict
            Passed to DataFrame.to_html.

        """
        html_string = self.df.to_html(**kwargs)
        with open(path, 'w') as f:
            f.write(html_string)
|
||||
|
||||
|
||||
class TableList(object):
    """Defines a list of camelot.core.Table objects. Each table can
    be accessed using its index.

    Attributes
    ----------
    n : int
        Number of tables in the list.

    """
    def __init__(self, tables):
        self._tables = tables

    def __repr__(self):
        return '<{} tables={}>'.format(
            self.__class__.__name__, len(self._tables))

    def __len__(self):
        return len(self._tables)

    def __getitem__(self, idx):
        return self._tables[idx]

    @staticmethod
    def _format_func(table, f):
        """Returns the table's bound ``to_<f>`` writer method."""
        return getattr(table, 'to_{}'.format(f))

    @staticmethod
    def _table_filename(table, root, ext):
        """Builds the per-table output filename."""
        return '{}-page-{}-table-{}{}'.format(
            root, table.page, table.order, ext)

    @property
    def n(self):
        return len(self._tables)

    def _write_file(self, f=None, **kwargs):
        """Writes every table to its own file inside ``dirname``.

        Reads ``dirname``, ``root`` and ``ext`` from kwargs.
        """
        dirname = kwargs.get('dirname')
        root = kwargs.get('root')
        ext = kwargs.get('ext')
        for table in self._tables:
            filepath = os.path.join(
                dirname, self._table_filename(table, root, ext))
            to_format = self._format_func(table, f)
            to_format(filepath)

    def _compress_dir(self, **kwargs):
        """Adds the per-table files found in ``dirname`` to
        ``<root>.zip``, created next to ``path``.

        Reads ``path``, ``dirname``, ``root`` and ``ext`` from kwargs.
        """
        path = kwargs.get('path')
        dirname = kwargs.get('dirname')
        root = kwargs.get('root')
        ext = kwargs.get('ext')
        zipname = os.path.join(os.path.dirname(path), root) + '.zip'
        with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
            for table in self._tables:
                filepath = os.path.join(
                    dirname, self._table_filename(table, root, ext))
                # Store files flat in the archive, without the temp path.
                z.write(filepath, os.path.basename(filepath))

    def export(self, path, f='csv', compress=False):
        """Exports the list of tables to specified file format.

        Parameters
        ----------
        path : str
            Filepath
        f : str
            File format. Can be csv, json, excel and html.
        compress : bool
            Whether or not to add files to a ZIP archive.

        """
        # Local import: only needed here, for temp-directory cleanup.
        import shutil

        dirname = os.path.dirname(path)
        basename = os.path.basename(path)
        root, ext = os.path.splitext(basename)
        tempdir = None
        if compress:
            # Files are staged in a scratch directory and only the ZIP
            # ends up next to ``path``.
            tempdir = dirname = tempfile.mkdtemp()

        kwargs = {
            'path': path,
            'dirname': dirname,
            'root': root,
            'ext': ext
        }

        try:
            if f in ['csv', 'json', 'html']:
                self._write_file(f=f, **kwargs)
                if compress:
                    self._compress_dir(**kwargs)
            elif f == 'excel':
                # All tables go into one workbook, one sheet per table.
                filepath = os.path.join(dirname, basename)
                writer = pd.ExcelWriter(filepath)
                for table in self._tables:
                    sheet_name = 'page-{}-table-{}'.format(
                        table.page, table.order)
                    table.df.to_excel(
                        writer, sheet_name=sheet_name, encoding='utf-8')
                writer.save()
                if compress:
                    zipname = os.path.join(
                        os.path.dirname(path), root) + '.zip'
                    with zipfile.ZipFile(
                            zipname, 'w', allowZip64=True) as z:
                        z.write(filepath, os.path.basename(filepath))
        finally:
            # mkdtemp leaves removal to the caller; previously the scratch
            # directory was leaked on every compressed export.
            if tempdir is not None:
                shutil.rmtree(tempdir, ignore_errors=True)
|
||||
|
||||
|
||||
class Geometry(object):
    """Container for the geometry collected for one page.

    Attributes
    ----------
    text : list
        Starts as an empty list.
    images : tuple
        Starts as an empty tuple.
    segments : tuple
        Starts as an empty tuple.
    tables : list
        Starts as an empty list.

    """
    def __init__(self):
        self.text = []
        self.images = ()
        self.segments = ()
        self.tables = []

    def __repr__(self):
        # Reports the size of each collection, not its contents.
        return '<{} text={} images={} segments={} tables={}>'.format(
            self.__class__.__name__,
            len(self.text),
            len(self.images),
            len(self.segments),
            len(self.tables))
|
||||
|
||||
|
||||
class GeometryList(object):
    """Collects the attributes of several geometry objects into
    parallel lists, one entry per geometry object.

    Parameters
    ----------
    geometry : list
        Objects exposing ``text``, ``images``, ``segments`` and
        ``tables`` attributes.

    Attributes
    ----------
    text : list
        ``text`` of every geometry object, in input order.
    images : list
        ``images`` of every geometry object, in input order.
    segments : list
        ``segments`` of every geometry object, in input order.
    tables : list
        ``tables`` of every geometry object, in input order.

    """
    def __init__(self, geometry):
        self.text = [g.text for g in geometry]
        self.images = [g.images for g in geometry]
        self.segments = [g.segments for g in geometry]
        self.tables = [g.tables for g in geometry]

    def __repr__(self):
        # Each count equals the number of geometry objects collected.
        return '<{} text={} images={} segments={} tables={}>'.format(
            self.__class__.__name__,
            len(self.text),
            len(self.images),
            len(self.segments),
            len(self.tables))
|
||||
|
|
@ -0,0 +1,144 @@
|
|||
import os
|
||||
import tempfile
|
||||
|
||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||
|
||||
from .core import TableList, GeometryList
|
||||
from .parsers import Stream, Lattice
|
||||
from .utils import get_page_layout, get_text_objects, get_rotation
|
||||
|
||||
|
||||
class PDFHandler(object):
    """Handles operations like temp directory creation, splitting
    file into single page pdfs and parsing each pdf.

    NOTE(review): nothing in this class removes the temp directory it
    creates; presumably the caller cleans up ``tempdir`` — confirm.

    Parameters
    ----------
    filename : str
        Path to pdf file.
    pages : str
        Comma-separated page numbers to parse.
        Example: 1,3,4 or 1,4-end

    Raises
    ------
    TypeError
        If filename does not end with '.pdf' (case-insensitive).

    """
    def __init__(self, filename, pages='1'):
        self.filename = filename
        # Accept upper/mixed-case extensions such as 'REPORT.PDF' too.
        if not self.filename.lower().endswith('.pdf'):
            raise TypeError("File format not supported.")
        self.pages = self._get_pages(self.filename, pages)
        self.tempdir = tempfile.mkdtemp()

    def _get_pages(self, filename, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filename : str
            Path to pdf file.
        pages : str
            Comma-separated page numbers to parse.
            Example: 1,3,4 or 1,4-end

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        """
        page_numbers = []
        if pages == '1':
            # Default case: no need to open the file at all.
            page_numbers.append({'start': 1, 'end': 1})
        else:
            # Keep the file open only while the page count may be needed.
            with open(filename, 'rb') as fileobj:
                infile = PdfFileReader(fileobj, strict=False)
                if pages == 'all':
                    page_numbers.append(
                        {'start': 1, 'end': infile.getNumPages()})
                else:
                    for r in pages.split(','):
                        if '-' in r:
                            a, b = r.split('-')
                            if b == 'end':
                                b = infile.getNumPages()
                            page_numbers.append(
                                {'start': int(a), 'end': int(b)})
                        else:
                            page_numbers.append(
                                {'start': int(r), 'end': int(r)})
        P = []
        for p in page_numbers:
            P.extend(range(p['start'], p['end'] + 1))
        return sorted(set(P))

    def _save_page(self, filename, page, temp):
        """Saves specified page from pdf into a temporary directory.

        If the page's text is detected as rotated, the saved page is
        rewritten with the rotation undone.

        Parameters
        ----------
        filename : str
            Path to pdf file.
        page : int
            Page number
        temp : str
            Tmp directory

        """
        fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
        froot, fext = os.path.splitext(fpath)
        with open(filename, 'rb') as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            p = infile.getPage(page - 1)
            outfile = PdfFileWriter()
            outfile.addPage(p)
            with open(fpath, 'wb') as f:
                outfile.write(f)
        layout, dim = get_page_layout(fpath)
        # fix rotated pdf
        lttextlh = get_text_objects(layout, ltype="lh")
        lttextlv = get_text_objects(layout, ltype="lv")
        ltchar = get_text_objects(layout, ltype="char")
        rotation = get_rotation(lttextlh, lttextlv, ltchar)
        if rotation != '':
            # Move the page aside, then write the counter-rotated copy
            # back under the original name.
            fpath_new = ''.join(
                [froot.replace('page', 'p'), '_rotated', fext])
            os.rename(fpath, fpath_new)
            # The source must stay open until the writer has finished.
            with open(fpath_new, 'rb') as rotated_src:
                infile = PdfFileReader(rotated_src, strict=False)
                outfile = PdfFileWriter()
                p = infile.getPage(0)
                if rotation == 'anticlockwise':
                    p.rotateClockwise(90)
                elif rotation == 'clockwise':
                    p.rotateCounterClockwise(90)
                outfile.addPage(p)
                with open(fpath, 'wb') as f:
                    outfile.write(f)

    def parse(self, mesh=False, **kwargs):
        """Extracts tables by calling parser.extract_tables on all
        single page pdfs.

        Parameters
        ----------
        mesh : bool (default: False)
            Whether or not to use Lattice method of parsing. Stream
            is used by default.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in pdf.
        geometry : camelot.core.GeometryList
            List of geometry objects (contours, lines, joints)
            found in pdf.

        """
        for p in self.pages:
            self._save_page(self.filename, p, self.tempdir)
        pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p))
                 for p in self.pages]
        tables = []
        geometry = []
        parser = Stream(**kwargs) if not mesh else Lattice(**kwargs)
        for p in pages:
            t, g = parser.extract_tables(p)
            tables.extend(t)
            geometry.append(g)
        return TableList(tables), GeometryList(geometry)
|
||||
|
|
@ -1,3 +1,4 @@
|
|||
from __future__ import division
|
||||
from itertools import groupby
|
||||
from operator import itemgetter
|
||||
|
||||
|
|
@ -7,40 +8,38 @@ import numpy as np
|
|||
from .utils import merge_tuples
|
||||
|
||||
|
||||
def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
|
||||
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
||||
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
imagename : string
|
||||
Path to image file.
|
||||
|
||||
invert : bool
|
||||
Whether or not to invert the image. Useful when pdfs have
|
||||
tables with lines in background.
|
||||
(optional, default: False)
|
||||
|
||||
blocksize: int
|
||||
process_background : bool, optional (default: False)
|
||||
Whether or not to process lines that are in background.
|
||||
blocksize : int, optional (default: 15)
|
||||
Size of a pixel neighborhood that is used to calculate a
|
||||
threshold value for the pixel: 3, 5, 7, and so on.
|
||||
|
||||
c: float
|
||||
Constant subtracted from the mean or weighted mean
|
||||
(see the details below). Normally, it is positive but may be
|
||||
zero or negative as well.
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
c : int, optional (default: -2)
|
||||
Constant subtracted from the mean or weighted mean.
|
||||
Normally, it is positive but may be zero or negative as well.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
|
||||
Returns
|
||||
-------
|
||||
img : object
|
||||
numpy.ndarray representing the original image.
|
||||
|
||||
threshold : object
|
||||
numpy.ndarray representing the thresholded image.
|
||||
|
||||
"""
|
||||
img = cv2.imread(imagename)
|
||||
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
||||
|
||||
if invert:
|
||||
if process_background:
|
||||
threshold = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
||||
cv2.THRESH_BINARY, blocksize, c)
|
||||
else:
|
||||
|
|
@ -49,7 +48,7 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
|
|||
return img, threshold
|
||||
|
||||
|
||||
def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
|
||||
def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
|
||||
"""Finds horizontal and vertical lines by applying morphological
|
||||
transformations on an image.
|
||||
|
||||
|
|
@ -57,38 +56,37 @@ def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
|
|||
----------
|
||||
threshold : object
|
||||
numpy.ndarray representing the thresholded image.
|
||||
|
||||
direction : string
|
||||
direction : string, optional (default: 'horizontal')
|
||||
Specifies whether to find vertical or horizontal lines.
|
||||
(default: 'horizontal')
|
||||
line_size_scaling : int, optional (default: 15)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
scale : int
|
||||
Used to divide the height/width to get a structuring element
|
||||
for morph transform.
|
||||
(optional, default: 15)
|
||||
The larger this value, smaller the detected lines. Making it
|
||||
too large will lead to text being detected as lines.
|
||||
iterations : int, optional (default: 0)
|
||||
Number of times erosion/dilation is applied.
|
||||
|
||||
iterations : int
|
||||
Number of iterations for dilation.
|
||||
(optional, default: 2)
|
||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dmask : object
|
||||
numpy.ndarray representing pixels where vertical/horizontal
|
||||
lines lie.
|
||||
|
||||
lines : list
|
||||
List of tuples representing vertical/horizontal lines with
|
||||
coordinates relative to a left-top origin in
|
||||
OpenCV's coordinate space.
|
||||
image coordinate space.
|
||||
|
||||
"""
|
||||
lines = []
|
||||
|
||||
if direction == 'vertical':
|
||||
size = threshold.shape[0] // scale
|
||||
size = threshold.shape[0] // line_size_scaling
|
||||
el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
||||
elif direction == 'horizontal':
|
||||
size = threshold.shape[1] // scale
|
||||
size = threshold.shape[1] // line_size_scaling
|
||||
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
||||
elif direction is None:
|
||||
raise ValueError("Specify direction as either 'vertical' or"
|
||||
|
|
@ -110,9 +108,9 @@ def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
|
|||
x1, x2 = x, x + w
|
||||
y1, y2 = y, y + h
|
||||
if direction == 'vertical':
|
||||
lines.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
|
||||
lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1))
|
||||
elif direction == 'horizontal':
|
||||
lines.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
|
||||
lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2))
|
||||
|
||||
return dmask, lines
|
||||
|
||||
|
|
@ -124,7 +122,6 @@ def find_table_contours(vertical, horizontal):
|
|||
----------
|
||||
vertical : object
|
||||
numpy.ndarray representing pixels where vertical lines lie.
|
||||
|
||||
horizontal : object
|
||||
numpy.ndarray representing pixels where horizontal lines lie.
|
||||
|
||||
|
|
@ -133,7 +130,8 @@ def find_table_contours(vertical, horizontal):
|
|||
cont : list
|
||||
List of tuples representing table boundaries. Each tuple is of
|
||||
the form (x, y, w, h) where (x, y) -> left-top, w -> width and
|
||||
h -> height in OpenCV's coordinate space.
|
||||
h -> height in image coordinate space.
|
||||
|
||||
"""
|
||||
mask = vertical + horizontal
|
||||
|
||||
|
|
@ -161,11 +159,9 @@ def find_table_joints(contours, vertical, horizontal):
|
|||
contours : list
|
||||
List of tuples representing table boundaries. Each tuple is of
|
||||
the form (x, y, w, h) where (x, y) -> left-top, w -> width and
|
||||
h -> height in OpenCV's coordinate space.
|
||||
|
||||
h -> height in image coordinate space.
|
||||
vertical : object
|
||||
numpy.ndarray representing pixels where vertical lines lie.
|
||||
|
||||
horizontal : object
|
||||
numpy.ndarray representing pixels where horizontal lines lie.
|
||||
|
||||
|
|
@ -174,9 +170,9 @@ def find_table_joints(contours, vertical, horizontal):
|
|||
tables : dict
|
||||
Dict with table boundaries as keys and list of intersections
|
||||
in that boundary as their value.
|
||||
|
||||
Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
|
||||
and (x2, y2) -> rt in OpenCV's coordinate space.
|
||||
and (x2, y2) -> rt in image coordinate space.
|
||||
|
||||
"""
|
||||
joints = np.bitwise_and(vertical, horizontal)
|
||||
tables = {}
|
||||
|
|
@ -194,32 +190,35 @@ def find_table_joints(contours, vertical, horizontal):
|
|||
joint_coords = []
|
||||
for j in jc:
|
||||
jx, jy, jw, jh = cv2.boundingRect(j)
|
||||
c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
|
||||
c1, c2 = x + (2 * jx + jw) // 2, y + (2 * jy + jh) // 2
|
||||
joint_coords.append((c1, c2))
|
||||
tables[(x, y + h, x + w, y)] = joint_coords
|
||||
|
||||
return tables
|
||||
|
||||
|
||||
def remove_lines(threshold, line_scale=15):
|
||||
def remove_lines(threshold, line_size_scaling=15):
|
||||
"""Removes lines from a thresholded image.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
threshold : object
|
||||
numpy.ndarray representing the thresholded image.
|
||||
line_size_scaling : int, optional (default: 15)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
line_scale : int
|
||||
Line scaling factor.
|
||||
(optional, default: 15)
|
||||
The larger this value, smaller the detected lines. Making it
|
||||
too large will lead to text being detected as lines.
|
||||
|
||||
Returns
|
||||
-------
|
||||
threshold : object
|
||||
numpy.ndarray representing the thresholded image
|
||||
with horizontal and vertical lines removed.
|
||||
|
||||
"""
|
||||
size = threshold.shape[0] // line_scale
|
||||
size = threshold.shape[0] // line_size_scaling
|
||||
vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
||||
horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
||||
dilate_el = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))
|
||||
|
|
@ -235,24 +234,26 @@ def remove_lines(threshold, line_scale=15):
|
|||
return threshold
|
||||
|
||||
|
||||
def find_cuts(threshold, char_scale=200):
|
||||
def find_cuts(threshold, char_size_scaling=200):
|
||||
"""Finds cuts made by text projections on y-axis.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
threshold : object
|
||||
numpy.ndarray representing the thresholded image.
|
||||
char_size_scaling : int, optional (default: 200)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
char_scale : int
|
||||
Char scaling factor.
|
||||
(optional, default: 200)
|
||||
The larger this value, smaller the detected lines. Making it
|
||||
too large will lead to text being detected as lines.
|
||||
|
||||
Returns
|
||||
-------
|
||||
y_cuts : list
|
||||
List of cuts on y-axis.
|
||||
"""
|
||||
size = threshold.shape[0] // char_scale
|
||||
size = threshold.shape[0] // char_size_scaling
|
||||
char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
||||
|
||||
threshold = cv2.erode(threshold, char_el)
|
||||
|
|
@ -268,5 +269,5 @@ def find_cuts(threshold, char_scale=200):
|
|||
contours = [cv2.boundingRect(c) for c in contours]
|
||||
y_cuts = [(c[1], c[1] + c[3]) for c in contours]
|
||||
y_cuts = list(merge_tuples(sorted(y_cuts)))
|
||||
y_cuts = [(y_cuts[i][0] + y_cuts[i - 1][1]) / 2 for i in range(1, len(y_cuts))]
|
||||
y_cuts = [(y_cuts[i][0] + y_cuts[i - 1][1]) // 2 for i in range(1, len(y_cuts))]
|
||||
return sorted(y_cuts, reverse=True)
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
from .handlers import PDFHandler
|
||||
|
||||
|
||||
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
||||
"""Read PDF and return parsed data tables.
|
||||
|
||||
Note: kwargs annotated with ^ can only be used with mesh=False
|
||||
and kwargs annotated with * can only be used with mesh=True.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath : str
|
||||
Path to pdf file.
|
||||
pages : str
|
||||
Comma-separated page numbers to parse.
|
||||
Example: 1,3,4 or 1,4-end
|
||||
mesh : bool (default: False)
|
||||
Whether or not to use Lattice method of parsing. Stream
|
||||
is used by default.
|
||||
table_area : list, optional (default: None)
|
||||
List of table areas to analyze as strings of the form
|
||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||
columns^ : list, optional (default: None)
|
||||
List of column x-coordinates as strings where the coordinates
|
||||
are comma-separated.
|
||||
split_text : bool, optional (default: False)
|
||||
Whether or not to split a text line if it spans across
|
||||
multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
row_close_tol^ : int, optional (default: 2)
|
||||
Rows will be formed by combining text vertically
|
||||
within this tolerance.
|
||||
col_close_tol^ : int, optional (default: 0)
|
||||
Columns will be formed by combining text horizontally
|
||||
within this tolerance.
|
||||
process_background* : bool, optional (default: False)
|
||||
Whether or not to process lines that are in background.
|
||||
line_size_scaling* : int, optional (default: 15)
|
||||
Factor by which the page dimensions will be divided to get
|
||||
smallest length of lines that should be detected.
|
||||
|
||||
The larger this value, smaller the detected lines. Making it
|
||||
too large will lead to text being detected as lines.
|
||||
copy_text* : list, optional (default: None)
|
||||
{'h', 'v'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify the direction in which text should be copied over
|
||||
when a cell spans multiple rows or columns.
|
||||
shift_text* : list, optional (default: ['l', 't'])
|
||||
{'l', 'r', 't', 'b'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify where the text in a spanning cell should flow.
|
||||
line_close_tol* : int, optional (default: 2)
|
||||
Tolerance parameter used to merge vertical and horizontal
|
||||
detected lines which lie close to each other.
|
||||
joint_close_tol* : int, optional (default: 2)
|
||||
Tolerance parameter used to decide whether the detected lines
|
||||
and points lie close to each other.
|
||||
threshold_blocksize : int, optional (default: 15)
|
||||
Size of a pixel neighborhood that is used to calculate a
|
||||
threshold value for the pixel: 3, 5, 7, and so on.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
threshold_constant : int, optional (default: -2)
|
||||
Constant subtracted from the mean or weighted mean.
|
||||
Normally, it is positive but may be zero or negative as well.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
iterations : int, optional (default: 0)
|
||||
Number of times erosion/dilation is applied.
|
||||
|
||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||
margins : tuple
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
|
||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
debug : bool, optional (default: False)
|
||||
Whether or not to return all text objects on the page
|
||||
which can be used to generate a matplotlib plot, to get
|
||||
values for table_area(s) and debugging.
|
||||
|
||||
Returns
|
||||
-------
|
||||
tables : camelot.core.TableList
|
||||
|
||||
"""
|
||||
# validate kwargs?
|
||||
p = PDFHandler(filepath, pages)
|
||||
tables, __ = p.parse(mesh=mesh, **kwargs)
|
||||
return tables
|
||||
|
|
@ -1,382 +0,0 @@
|
|||
from __future__ import division
|
||||
import os
|
||||
import sys
|
||||
import copy
|
||||
import types
|
||||
import logging
|
||||
import copy_reg
|
||||
import warnings
|
||||
import subprocess
|
||||
|
||||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||
find_table_joints)
|
||||
from .table import Table
|
||||
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
|
||||
merge_close_values, get_table_index, get_score, count_empty,
|
||||
encode_list, get_text_objects, get_page_layout)
|
||||
|
||||
|
||||
__all__ = ['Lattice']
|
||||
logger = logging.getLogger('app_logger')
|
||||
|
||||
|
||||
def _reduce_method(m):
|
||||
if m.im_self is None:
|
||||
return getattr, (m.im_class, m.im_func.func_name)
|
||||
else:
|
||||
return getattr, (m.im_self, m.im_func.func_name)
|
||||
copy_reg.pickle(types.MethodType, _reduce_method)
|
||||
|
||||
|
||||
def _reduce_index(t, idx, shift_text):
|
||||
"""Reduces index of a text object if it lies within a spanning
|
||||
cell.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : object
|
||||
camelot.table.Table
|
||||
|
||||
idx : list
|
||||
List of tuples of the form (r_idx, c_idx, text).
|
||||
|
||||
shift_text : list
|
||||
{'l', 'r', 't', 'b'}
|
||||
Select one or more from above and pass them as a list to
|
||||
specify where the text in a spanning cell should flow.
|
||||
|
||||
Returns
|
||||
-------
|
||||
indices : list
|
||||
List of tuples of the form (idx, text) where idx is the reduced
|
||||
index of row/column and text is the an lttextline substring.
|
||||
"""
|
||||
indices = []
|
||||
for r_idx, c_idx, text in idx:
|
||||
for d in shift_text:
|
||||
if d == 'l':
|
||||
if t.cells[r_idx][c_idx].spanning_h:
|
||||
while not t.cells[r_idx][c_idx].left:
|
||||
c_idx -= 1
|
||||
if d == 'r':
|
||||
if t.cells[r_idx][c_idx].spanning_h:
|
||||
while not t.cells[r_idx][c_idx].right:
|
||||
c_idx += 1
|
||||
if d == 't':
|
||||
if t.cells[r_idx][c_idx].spanning_v:
|
||||
while not t.cells[r_idx][c_idx].top:
|
||||
r_idx -= 1
|
||||
if d == 'b':
|
||||
if t.cells[r_idx][c_idx].spanning_v:
|
||||
while not t.cells[r_idx][c_idx].bottom:
|
||||
r_idx += 1
|
||||
indices.append((r_idx, c_idx, text))
|
||||
return indices
|
||||
|
||||
|
||||
def _fill_spanning(t, fill=None):
|
||||
"""Fills spanning cells.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t : object
|
||||
camelot.table.Table
|
||||
|
||||
fill : list
|
||||
{'h', 'v'}
|
||||
Specify to fill spanning cells in horizontal or vertical
|
||||
direction.
|
||||
(optional, default: None)
|
||||
|
||||
Returns
|
||||
-------
|
||||
t : object
|
||||
camelot.table.Table
|
||||
"""
|
||||
for f in fill:
|
||||
if f == "h":
|
||||
for i in range(len(t.cells)):
|
||||
for j in range(len(t.cells[i])):
|
||||
if t.cells[i][j].get_text().strip() == '':
|
||||
if t.cells[i][j].spanning_h and not t.cells[i][j].left:
|
||||
t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
|
||||
elif f == "v":
|
||||
for i in range(len(t.cells)):
|
||||
for j in range(len(t.cells[i])):
|
||||
if t.cells[i][j].get_text().strip() == '':
|
||||
if t.cells[i][j].spanning_v and not t.cells[i][j].top:
|
||||
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
|
||||
return t
|
||||
|
||||
|
||||
class Lattice:
|
||||
"""Lattice looks for lines in the pdf to form a table.
|
||||
|
||||
If you want to give fill and mtol for each table when specifying
|
||||
multiple table areas, make sure that the length of fill and mtol
|
||||
is equal to the length of table_area. Mapping between them is based
|
||||
on index.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table_area : list
|
||||
List of strings of the form x1,y1,x2,y2 where
|
||||
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
|
||||
coordinate space, denoting table areas to analyze.
|
||||
(optional, default: None)
|
||||
|
||||
fill : list
|
||||
List of strings specifying directions to fill spanning cells.
|
||||
{'h', 'v'} to fill spanning cells in horizontal or vertical
|
||||
direction.
|
||||
(optional, default: None)
|
||||
|
||||
mtol : list
|
||||
List of ints specifying m-tolerance parameters.
|
||||
(optional, default: [2])
|
||||
|
||||
jtol : list
|
||||
List of ints specifying j-tolerance parameters.
|
||||
(optional, default: [2])
|
||||
|
||||
blocksize : int
|
||||
Size of a pixel neighborhood that is used to calculate a
|
||||
threshold value for the pixel: 3, 5, 7, and so on.
|
||||
(optional, default: 15)
|
||||
|
||||
threshold_constant : float
|
||||
Constant subtracted from the mean or weighted mean
|
||||
(see the details below). Normally, it is positive but may be
|
||||
zero or negative as well.
|
||||
(optional, default: -2)
|
||||
|
||||
scale : int
|
||||
Used to divide the height/width of a pdf to get a structuring
|
||||
element for image processing.
|
||||
(optional, default: 15)
|
||||
|
||||
iterations : int
|
||||
Number of iterations for dilation.
|
||||
(optional, default: 0)
|
||||
|
||||
invert : bool
|
||||
Whether or not to invert the image. Useful when pdfs have
|
||||
tables with lines in background.
|
||||
(optional, default: False)
|
||||
|
||||
margins : tuple
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
(optional, default: (1.0, 0.5, 0.1))
|
||||
|
||||
split_text : bool
|
||||
Whether or not to split a text line if it spans across
|
||||
different cells.
|
||||
(optional, default: False)
|
||||
|
||||
flag_size : bool
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
(optional, default: True)
|
||||
|
||||
shift_text : list
|
||||
{'l', 'r', 't', 'b'}
|
||||
Select one or more from above and pass them as a list to
|
||||
specify where the text in a spanning cell should flow.
|
||||
(optional, default: ['l', 't'])
|
||||
|
||||
debug : string
|
||||
{'contour', 'line', 'joint', 'table'}
|
||||
Set to one of the above values to generate a matplotlib plot
|
||||
of detected contours, lines, joints and the table generated.
|
||||
(optional, default: None)
|
||||
"""
|
||||
def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2],
|
||||
blocksize=15, threshold_constant=-2, scale=15, iterations=0,
|
||||
invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
|
||||
flag_size=True, shift_text=['l', 't'], debug=None):
|
||||
|
||||
self.method = 'lattice'
|
||||
self.table_area = table_area
|
||||
self.fill = fill
|
||||
self.mtol = mtol
|
||||
self.jtol = jtol
|
||||
self.blocksize = blocksize
|
||||
self.threshold_constant = threshold_constant
|
||||
self.scale = scale
|
||||
self.iterations = iterations
|
||||
self.invert = invert
|
||||
self.char_margin, self.line_margin, self.word_margin = margins
|
||||
self.split_text = split_text
|
||||
self.flag_size = flag_size
|
||||
self.shift_text = shift_text
|
||||
self.debug = debug
|
||||
|
||||
def get_tables(self, pdfname):
|
||||
"""Expects a single page pdf as input with rotation corrected.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
pdfname : string
|
||||
Path to single page pdf file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
page : dict
|
||||
"""
|
||||
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
|
||||
line_margin=self.line_margin, word_margin=self.word_margin)
|
||||
lttextlh = get_text_objects(layout, ltype="lh")
|
||||
lttextlv = get_text_objects(layout, ltype="lv")
|
||||
ltchar = get_text_objects(layout, ltype="char")
|
||||
width, height = dim
|
||||
bname, __ = os.path.splitext(pdfname)
|
||||
logger.info('Processing {0}.'.format(os.path.basename(bname)))
|
||||
if not ltchar:
|
||||
warnings.warn("{0}: Page contains no text.".format(
|
||||
os.path.basename(bname)))
|
||||
return {os.path.basename(bname): None}
|
||||
|
||||
imagename = ''.join([bname, '.png'])
|
||||
gs_call = [
|
||||
"-q", "-sDEVICE=png16m", "-o", imagename, "-r600", pdfname
|
||||
]
|
||||
if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
|
||||
gs_call.insert(0, "gs")
|
||||
else:
|
||||
gs_call.insert(0, "gsc")
|
||||
subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
|
||||
stderr=subprocess.STDOUT)
|
||||
|
||||
img, threshold = adaptive_threshold(imagename, invert=self.invert,
|
||||
blocksize=self.blocksize, c=self.threshold_constant)
|
||||
pdf_x = width
|
||||
pdf_y = height
|
||||
img_x = img.shape[1]
|
||||
img_y = img.shape[0]
|
||||
sc_x_image = img_x / float(pdf_x)
|
||||
sc_y_image = img_y / float(pdf_y)
|
||||
sc_x_pdf = pdf_x / float(img_x)
|
||||
sc_y_pdf = pdf_y / float(img_y)
|
||||
factors_image = (sc_x_image, sc_y_image, pdf_y)
|
||||
factors_pdf = (sc_x_pdf, sc_y_pdf, img_y)
|
||||
|
||||
vmask, v_segments = find_lines(threshold, direction='vertical',
|
||||
scale=self.scale, iterations=self.iterations)
|
||||
hmask, h_segments = find_lines(threshold, direction='horizontal',
|
||||
scale=self.scale, iterations=self.iterations)
|
||||
|
||||
if self.table_area is not None:
|
||||
areas = []
|
||||
for area in self.table_area:
|
||||
x1, y1, x2, y2 = area.split(",")
|
||||
x1 = float(x1)
|
||||
y1 = float(y1)
|
||||
x2 = float(x2)
|
||||
y2 = float(y2)
|
||||
x1, y1, x2, y2 = scale_to_image((x1, y1, x2, y2), factors_image)
|
||||
areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
|
||||
table_bbox = find_table_joints(areas, vmask, hmask)
|
||||
else:
|
||||
contours = find_table_contours(vmask, hmask)
|
||||
table_bbox = find_table_joints(contours, vmask, hmask)
|
||||
|
||||
if len(self.mtol) == 1 and self.mtol[0] == 2:
|
||||
mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
|
||||
else:
|
||||
mtolerance = copy.deepcopy(self.mtol)
|
||||
|
||||
if len(self.jtol) == 1 and self.jtol[0] == 2:
|
||||
jtolerance = copy.deepcopy(self.jtol) * len(table_bbox)
|
||||
else:
|
||||
jtolerance = copy.deepcopy(self.jtol)
|
||||
|
||||
if self.debug:
|
||||
self.debug_images = (img, table_bbox)
|
||||
|
||||
table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments,
|
||||
h_segments, factors_pdf)
|
||||
|
||||
if self.debug:
|
||||
self.debug_segments = (v_segments, h_segments)
|
||||
self.debug_tables = []
|
||||
|
||||
page = {}
|
||||
tables = {}
|
||||
# sort tables based on y-coord
|
||||
for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)):
|
||||
# select elements which lie within table_bbox
|
||||
table_data = {}
|
||||
t_bbox = {}
|
||||
v_s, h_s = segments_bbox(k, v_segments, h_segments)
|
||||
t_bbox['horizontal'] = text_in_bbox(k, lttextlh)
|
||||
t_bbox['vertical'] = text_in_bbox(k, lttextlv)
|
||||
char_bbox = text_in_bbox(k, ltchar)
|
||||
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
|
||||
for direction in t_bbox:
|
||||
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
||||
cols, rows = zip(*table_bbox[k])
|
||||
cols, rows = list(cols), list(rows)
|
||||
cols.extend([k[0], k[2]])
|
||||
rows.extend([k[1], k[3]])
|
||||
# sort horizontal and vertical segments
|
||||
cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no])
|
||||
rows = merge_close_values(
|
||||
sorted(rows, reverse=True), mtol=mtolerance[table_no])
|
||||
# make grid using x and y coord of shortlisted rows and cols
|
||||
cols = [(cols[i], cols[i + 1])
|
||||
for i in range(0, len(cols) - 1)]
|
||||
rows = [(rows[i], rows[i + 1])
|
||||
for i in range(0, len(rows) - 1)]
|
||||
|
||||
table = Table(cols, rows)
|
||||
# set table edges to True using ver+hor lines
|
||||
table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no])
|
||||
nouse = table.nocont_ / (len(v_s) + len(h_s))
|
||||
table_data['line_p'] = 100 * (1 - nouse)
|
||||
# set spanning cells to True
|
||||
table = table.set_spanning()
|
||||
# set table border edges to True
|
||||
table = table.set_border_edges()
|
||||
|
||||
if self.debug:
|
||||
self.debug_tables.append(table)
|
||||
|
||||
assignment_errors = []
|
||||
table_data['split_text'] = []
|
||||
table_data['superscript'] = []
|
||||
for direction in ['vertical', 'horizontal']:
|
||||
for t in t_bbox[direction]:
|
||||
indices, error = get_table_index(
|
||||
table, t, direction, split_text=self.split_text,
|
||||
flag_size=self.flag_size)
|
||||
if indices[:2] != (-1, -1):
|
||||
assignment_errors.append(error)
|
||||
indices = _reduce_index(table, indices, shift_text=self.shift_text)
|
||||
if len(indices) > 1:
|
||||
table_data['split_text'].append(indices)
|
||||
for r_idx, c_idx, text in indices:
|
||||
if all(s in text for s in ['<s>', '</s>']):
|
||||
table_data['superscript'].append((r_idx, c_idx, text))
|
||||
table.cells[r_idx][c_idx].add_text(text)
|
||||
score = get_score([[100, assignment_errors]])
|
||||
table_data['score'] = score
|
||||
|
||||
if self.fill is not None:
|
||||
table = _fill_spanning(table, fill=self.fill)
|
||||
ar = table.get_list()
|
||||
ar = encode_list(ar)
|
||||
table_data['data'] = ar
|
||||
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
||||
table_data['empty_p'] = empty_p
|
||||
table_data['r_nempty_cells'] = r_nempty_cells
|
||||
table_data['c_nempty_cells'] = c_nempty_cells
|
||||
table_data['nrows'] = len(ar)
|
||||
table_data['ncols'] = len(ar[0])
|
||||
tables['table-{0}'.format(table_no + 1)] = table_data
|
||||
page[os.path.basename(bname)] = tables
|
||||
|
||||
if self.debug:
|
||||
return None
|
||||
|
||||
return page
|
||||
331
camelot/ocr.py
331
camelot/ocr.py
|
|
@ -1,331 +0,0 @@
|
|||
import os
|
||||
import copy
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
import pyocr
|
||||
from PIL import Image
|
||||
|
||||
from .table import Table
|
||||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||
find_table_joints, remove_lines, find_cuts)
|
||||
from .utils import merge_close_values, encode_list
|
||||
|
||||
|
||||
__all__ = ['OCRLattice', 'OCRStream']
|
||||
logger = logging.getLogger('app_logger')
|
||||
|
||||
|
||||
class OCRLattice:
|
||||
"""Lattice, but for images.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table_area : list
|
||||
List of strings of the form x1,y1,x2,y2 where
|
||||
(x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
|
||||
coordinate space, denoting table areas to analyze.
|
||||
(optional, default: None)
|
||||
|
||||
mtol : list
|
||||
List of ints specifying m-tolerance parameters.
|
||||
(optional, default: [2])
|
||||
|
||||
blocksize : int
|
||||
Size of a pixel neighborhood that is used to calculate a
|
||||
threshold value for the pixel: 3, 5, 7, and so on.
|
||||
(optional, default: 15)
|
||||
|
||||
threshold_constant : float
|
||||
Constant subtracted from the mean or weighted mean
|
||||
(see the details below). Normally, it is positive but may be
|
||||
zero or negative as well.
|
||||
(optional, default: -2)
|
||||
|
||||
dpi : int
|
||||
Dots per inch.
|
||||
(optional, default: 300)
|
||||
|
||||
layout : int
|
||||
Tesseract page segmentation mode.
|
||||
(optional, default: 7)
|
||||
|
||||
lang : string
|
||||
Language to be used for OCR.
|
||||
(optional, default: 'eng')
|
||||
|
||||
scale : int
|
||||
Used to divide the height/width of a pdf to get a structuring
|
||||
element for image processing.
|
||||
(optional, default: 15)
|
||||
|
||||
iterations : int
|
||||
Number of iterations for dilation.
|
||||
(optional, default: 0)
|
||||
|
||||
debug : string
|
||||
{'contour', 'line', 'joint', 'table'}
|
||||
Set to one of the above values to generate a matplotlib plot
|
||||
of detected contours, lines, joints and the table generated.
|
||||
(optional, default: None)
|
||||
"""
|
||||
def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
|
||||
dpi=300, layout=7, lang="eng", scale=15, iterations=0, debug=None):
|
||||
|
||||
self.method = 'ocrl'
|
||||
self.table_area = table_area
|
||||
self.mtol = mtol
|
||||
self.blocksize = blocksize
|
||||
self.threshold_constant = threshold_constant
|
||||
self.tool = pyocr.get_available_tools()[0] # fix this
|
||||
self.dpi = dpi
|
||||
self.layout = layout
|
||||
self.lang = lang
|
||||
self.scale = scale
|
||||
self.iterations = iterations
|
||||
self.debug = debug
|
||||
|
||||
def get_tables(self, pdfname):
|
||||
if self.tool is None:
|
||||
return None
|
||||
|
||||
bname, __ = os.path.splitext(pdfname)
|
||||
imagename = ''.join([bname, '.png'])
|
||||
logger.info('Processing {0}.'.format(os.path.basename(bname)))
|
||||
|
||||
gs_call = [
|
||||
"-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
|
||||
pdfname
|
||||
]
|
||||
if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
|
||||
gs_call.insert(0, "gs")
|
||||
else:
|
||||
gs_call.insert(0, "gsc")
|
||||
subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
|
||||
stderr=subprocess.STDOUT)
|
||||
|
||||
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
|
||||
c=self.threshold_constant)
|
||||
vmask, v_segments = find_lines(threshold, direction='vertical',
|
||||
scale=self.scale, iterations=self.iterations)
|
||||
hmask, h_segments = find_lines(threshold, direction='horizontal',
|
||||
scale=self.scale, iterations=self.iterations)
|
||||
|
||||
if self.table_area is not None:
|
||||
areas = []
|
||||
for area in self.table_area:
|
||||
x1, y1, x2, y2 = area.split(",")
|
||||
x1 = int(float(x1))
|
||||
y1 = int(float(y1))
|
||||
x2 = int(float(x2))
|
||||
y2 = int(float(y2))
|
||||
areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
|
||||
table_bbox = find_table_joints(areas, vmask, hmask)
|
||||
else:
|
||||
contours = find_table_contours(vmask, hmask)
|
||||
table_bbox = find_table_joints(contours, vmask, hmask)
|
||||
|
||||
if self.debug:
|
||||
self.debug_images = (img, table_bbox)
|
||||
self.debug_segments = (v_segments, h_segments)
|
||||
self.debug_tables = []
|
||||
|
||||
if len(self.mtol) == 1 and self.mtol[0] == 2:
|
||||
mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
|
||||
else:
|
||||
mtolerance = copy.deepcopy(self.mtol)
|
||||
|
||||
page = {}
|
||||
tables = {}
|
||||
table_no = 0
|
||||
for k in sorted(table_bbox.keys(), key=lambda x: x[1]):
|
||||
table_data = {}
|
||||
cols, rows = zip(*table_bbox[k])
|
||||
cols, rows = list(cols), list(rows)
|
||||
cols.extend([k[0], k[2]])
|
||||
rows.extend([k[1], k[3]])
|
||||
cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no])
|
||||
rows = merge_close_values(sorted(rows, reverse=True), mtol=mtolerance[table_no])
|
||||
cols = [(cols[i], cols[i + 1])
|
||||
for i in range(0, len(cols) - 1)]
|
||||
rows = [(rows[i], rows[i + 1])
|
||||
for i in range(0, len(rows) - 1)]
|
||||
table = Table(cols, rows)
|
||||
if self.debug:
|
||||
self.debug_tables.append(table)
|
||||
table.image = img[k[3]:k[1],k[0]:k[2]]
|
||||
for i in range(len(table.cells)):
|
||||
for j in range(len(table.cells[i])):
|
||||
x1 = int(table.cells[i][j].x1)
|
||||
y1 = int(table.cells[i][j].y1)
|
||||
x2 = int(table.cells[i][j].x2)
|
||||
y2 = int(table.cells[i][j].y2)
|
||||
table.cells[i][j].image = img[y1:y2,x1:x2]
|
||||
text = self.tool.image_to_string(
|
||||
Image.fromarray(table.cells[i][j].image),
|
||||
lang=self.lang,
|
||||
builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
|
||||
)
|
||||
table.cells[i][j].add_text(text)
|
||||
ar = table.get_list()
|
||||
ar.reverse()
|
||||
ar = encode_list(ar)
|
||||
table_data['data'] = ar
|
||||
tables['table-{0}'.format(table_no + 1)] = table_data
|
||||
table_no += 1
|
||||
page[os.path.basename(bname)] = tables
|
||||
|
||||
if self.debug:
|
||||
return None
|
||||
|
||||
return page
|
||||
|
||||
|
||||
class OCRStream:
    """Stream, but for images.

    Renders the input file to a PNG with Ghostscript, thresholds the
    image, and runs OCR on every cell of a grid built from the
    user-supplied column separators and the row cuts found in the image.

    Parameters
    ----------
    table_area : list
        List of strings of the form x1,y1,x2,y2 where
        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
        coordinate space, denoting table areas to analyze.
        (optional, default: None)

    columns : list
        List of strings where each string is comma-separated values of
        x-coordinates in OpenCV's coordinate space.
        (optional, default: None)

    blocksize : int
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        (optional, default: 15)

    threshold_constant : float
        Constant subtracted from the mean or weighted mean
        (see the details below). Normally, it is positive but may be
        zero or negative as well.
        (optional, default: -2)

    dpi : int
        Dots per inch.
        (optional, default: 300)

    layout : int
        Tesseract page segmentation mode.
        (optional, default: 7)

    lang : string
        Language to be used for OCR.
        (optional, default: 'eng')

    line_scale : int
        Line scaling factor.
        (optional, default: 15)

    char_scale : int
        Char scaling factor.
        (optional, default: 200)

    debug : bool
        When True, get_tables only stores the rendered page image on
        ``debug_images`` and returns None.
        (optional, default: False)
    """
    def __init__(self, table_area=None, columns=None, blocksize=15,
                 threshold_constant=-2, dpi=300, layout=7, lang="eng",
                 line_scale=15, char_scale=200, debug=False):

        self.method = 'ocrs'
        self.table_area = table_area
        self.columns = columns
        self.blocksize = blocksize
        self.threshold_constant = threshold_constant
        # Pick the first OCR tool pyocr reports. When none is installed
        # keep None so that get_tables can bail out gracefully instead of
        # the constructor failing with an IndexError.
        available_tools = pyocr.get_available_tools()
        self.tool = available_tools[0] if available_tools else None
        self.dpi = dpi
        self.layout = layout
        self.lang = lang
        self.line_scale = line_scale
        self.char_scale = char_scale
        self.debug = debug

    def get_tables(self, pdfname):
        """OCRs the tables found on the rendered image of a file.

        Parameters
        ----------
        pdfname : string
            Path to the file to process. The rendered image is written
            next to it with a .png extension.

        Returns
        -------
        page : dict
            Dict with one key, the basename of pdfname without its
            extension, mapping to a dict of 'table-N' -> {'data': ...}.
            None is returned when no OCR tool is available or when
            debug is set.

        Raises
        ------
        ValueError
            If table_area and columns are both given but differ in
            length.
        NotImplementedError
            If columns is None (automatic column detection is not
            implemented).

        """
        if self.tool is None:
            return None

        bname, __ = os.path.splitext(pdfname)
        imagename = ''.join([bname, '.png'])
        logger.info('Processing {0}.'.format(os.path.basename(bname)))

        gs_call = [
            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
            pdfname
        ]
        # The executable is called "gs" when its version banner mentions
        # ghostscript, otherwise fall back to "gsc".
        if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
            gs_call.insert(0, "gs")
        else:
            gs_call.insert(0, "gsc")
        # Close the devnull handle once Ghostscript has finished.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(gs_call, stdout=devnull,
                            stderr=subprocess.STDOUT)

        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
                                            c=self.threshold_constant)
        threshold = remove_lines(threshold, line_scale=self.line_scale)
        height, width = threshold.shape
        if self.debug:
            self.debug_images = img
            return None

        if self.table_area is not None:
            if self.columns is not None:
                if len(self.table_area) != len(self.columns):
                    raise ValueError("{0}: Length of table area and columns"
                                     " should be equal.".format(os.path.basename(bname)))

            table_bbox = {}
            for area in self.table_area:
                x1, y1, x2, y2 = [int(float(v)) for v in area.split(",")]
                table_bbox[(x1, y1, x2, y2)] = None
        else:
            # no area given: treat the whole image as a single table
            table_bbox = {(0, 0, width, height): None}

        page = {}
        tables = {}
        table_no = 0
        # NOTE(review): tables are visited top-to-bottom (sorted on y1)
        # while columns is indexed by visit order, so columns presumably
        # has to be supplied in top-to-bottom order -- confirm with callers.
        for k in sorted(table_bbox.keys(), key=lambda x: x[1]):
            if self.columns is None:
                raise NotImplementedError

            table_data = {}
            table_image = threshold[k[1]:k[3], k[0]:k[2]]
            # column separators, bracketed by the table's own x-extent and
            # shifted so that they are relative to the cropped table image
            cols = [float(c) for c in self.columns[table_no].split(',')]
            cols.insert(0, k[0])
            cols.append(k[2])
            cols = [(cols[i] - k[0], cols[i + 1] - k[0])
                    for i in range(0, len(cols) - 1)]
            y_cuts = find_cuts(table_image, char_scale=self.char_scale)
            rows = [(y_cuts[i], y_cuts[i + 1])
                    for i in range(0, len(y_cuts) - 1)]
            table = Table(cols, rows)
            for i in range(len(table.cells)):
                for j in range(len(table.cells[i])):
                    cell = table.cells[i][j]
                    x1 = int(cell.x1)
                    y1 = int(cell.y1)
                    x2 = int(cell.x2)
                    y2 = int(cell.y2)
                    cell.image = table_image[y1:y2, x1:x2]
                    cell_image = Image.fromarray(cell.image)
                    text = self.tool.image_to_string(
                        cell_image,
                        lang=self.lang,
                        builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
                    )
                    cell.add_text(text)
            ar = table.get_list()
            ar.reverse()
            ar = encode_list(ar)
            table_data['data'] = ar
            tables['table-{0}'.format(table_no + 1)] = table_data
            table_no += 1
        page[os.path.basename(bname)] = tables

        return page
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
from .stream import Stream
|
||||
from .lattice import Lattice
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
import os
|
||||
|
||||
from ..core import Geometry
|
||||
from ..utils import get_page_layout, get_text_objects
|
||||
|
||||
|
||||
class BaseParser(object):
    """Defines a base parser.

    Provides the layout bookkeeping that concrete parsers run before
    extracting tables.
    """
    def _generate_layout(self, filename):
        """Loads the page layout of filename and caches it on self.

        Reads the char_margin, line_margin and word_margin attributes
        set by the concrete parser and stores the layout, the page
        dimensions, the horizontal and vertical text objects, the
        extension-less file path and a fresh Geometry on the instance.

        Parameters
        ----------
        filename : string
            Path to the file to lay out.

        """
        self.filename = filename
        layout_and_dimensions = get_page_layout(
            filename,
            char_margin=self.char_margin,
            line_margin=self.line_margin,
            word_margin=self.word_margin)
        self.layout, self.dimensions = layout_and_dimensions
        # "lh" / "lv" select the horizontal / vertical text objects
        self.horizontal_text = get_text_objects(self.layout, ltype="lh")
        self.vertical_text = get_text_objects(self.layout, ltype="lv")
        self.pdf_width, self.pdf_height = self.dimensions
        self.rootname = os.path.splitext(filename)[0]
        self.g = Geometry()
|
||||
|
|
@ -0,0 +1,336 @@
|
|||
from __future__ import division
|
||||
import os
|
||||
import copy
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from .base import BaseParser
|
||||
from ..core import Table
|
||||
from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
|
||||
merge_close_lines, get_table_index, compute_accuracy,
|
||||
compute_whitespace, setup_logging, encode_)
|
||||
from ..image_processing import (adaptive_threshold, find_lines,
|
||||
find_table_contours, find_table_joints)
|
||||
|
||||
|
||||
# Module-level logger, obtained from the setup_logging helper imported above.
logger = setup_logging(__name__)
|
||||
|
||||
|
||||
class Lattice(BaseParser):
    """Lattice method of parsing looks for lines between text
    to form a table.

    Parameters
    ----------
    table_area : list, optional (default: None)
        List of table areas to analyze as strings of the form
        x1,y1,x2,y2 where (x1, y1) -> left-top and
        (x2, y2) -> right-bottom in pdf coordinate space.
    process_background : bool, optional (default: False)
        Whether or not to process lines that are in background.
    line_size_scaling : int, optional (default: 15)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.

        The larger this value, smaller the detected lines. Making it
        too large will lead to text being detected as lines.
    copy_text : list, optional (default: None)
        {'h', 'v'}
        Select one or more strings from above and pass them as a list
        to specify the direction in which text should be copied over
        when a cell spans multiple rows or columns.
    shift_text : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Select one or more strings from above and pass them as a list
        to specify where the text in a spanning cell should flow.
        Passing None selects the default.
    split_text : bool, optional (default: False)
        Whether or not to split a text line if it spans across
        multiple cells.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string, useful for
        super and subscripts.
    line_close_tol : int, optional (default: 2)
        Tolerance parameter used to merge vertical and horizontal
        detected lines which lie close to each other.
    joint_close_tol : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.

        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    threshold_constant : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.

        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    iterations : int, optional (default: 0)
        Number of times for erosion/dilation is applied.

        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    margins : tuple, optional (default: (1.0, 0.5, 0.1))
        PDFMiner margins. (char_margin, line_margin, word_margin)

        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
    debug : bool, optional (default: False)
        Whether or not to return all text objects on the page
        which can be used to generate a matplotlib plot, to get
        values for table_area(s) and debugging.

    """
    def __init__(self, table_area=None, process_background=False,
                 line_size_scaling=15, copy_text=None, shift_text=None,
                 split_text=False, flag_size=False, line_close_tol=2,
                 joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
                 iterations=0, margins=(1.0, 0.5, 0.1), debug=False):
        self.table_area = table_area
        self.process_background = process_background
        self.line_size_scaling = line_size_scaling
        self.copy_text = copy_text
        # Build the default per instance; a list literal in the signature
        # would be shared (and mutable) across every Lattice object.
        self.shift_text = ['l', 't'] if shift_text is None else shift_text
        self.split_text = split_text
        self.flag_size = flag_size
        self.line_close_tol = line_close_tol
        self.joint_close_tol = joint_close_tol
        self.threshold_blocksize = threshold_blocksize
        self.threshold_constant = threshold_constant
        self.iterations = iterations
        self.char_margin, self.line_margin, self.word_margin = margins
        self.debug = debug

    @staticmethod
    def _reduce_index(t, idx, shift_text):
        """Reduces index of a text object if it lies within a spanning
        cell.

        Parameters
        ----------
        t : camelot.core.Table
        idx : list
            List of tuples of the form (r_idx, c_idx, text).
        shift_text : list
            {'l', 'r', 't', 'b'}
            Select one or more strings from above and pass them as a
            list to specify where the text in a spanning cell should
            flow.

        Returns
        -------
        indices : list
            List of tuples of the form (r_idx, c_idx, text) where
            r_idx and c_idx are new row and column indices for text.

        """
        indices = []
        for r_idx, c_idx, text in idx:
            # Directions are applied in the order given; each one walks
            # the index towards the requested edge of the spanning cell.
            for d in shift_text:
                if d == 'l':
                    if t.cells[r_idx][c_idx].hspan:
                        while not t.cells[r_idx][c_idx].left:
                            c_idx -= 1
                if d == 'r':
                    if t.cells[r_idx][c_idx].hspan:
                        while not t.cells[r_idx][c_idx].right:
                            c_idx += 1
                if d == 't':
                    if t.cells[r_idx][c_idx].vspan:
                        while not t.cells[r_idx][c_idx].top:
                            r_idx -= 1
                if d == 'b':
                    if t.cells[r_idx][c_idx].vspan:
                        while not t.cells[r_idx][c_idx].bottom:
                            r_idx += 1
            indices.append((r_idx, c_idx, text))
        return indices

    @staticmethod
    def _copy_spanning_text(t, copy_text=None):
        """Copies over text in empty spanning cells.

        Parameters
        ----------
        t : camelot.core.Table
        copy_text : list, optional (default: None)
            {'h', 'v'}
            Select one or more strings from above and pass them as a list
            to specify the direction in which text should be copied over
            when a cell spans multiple rows or columns.
            None leaves the table untouched.

        Returns
        -------
        t : camelot.core.Table

        """
        # None is the documented default, so it must not be iterated.
        for f in (copy_text or []):
            if f == "h":
                for i in range(len(t.cells)):
                    for j in range(len(t.cells[i])):
                        if t.cells[i][j].text.strip() == '':
                            if t.cells[i][j].hspan and not t.cells[i][j].left:
                                t.cells[i][j].text = t.cells[i][j - 1].text
            elif f == "v":
                for i in range(len(t.cells)):
                    for j in range(len(t.cells[i])):
                        if t.cells[i][j].text.strip() == '':
                            if t.cells[i][j].vspan and not t.cells[i][j].top:
                                t.cells[i][j].text = t.cells[i - 1][j].text
        return t

    def _generate_image(self):
        """Renders self.filename to <rootname>.png at 600 dpi using
        Ghostscript and records the path on self.imagename.

        """
        self.imagename = ''.join([self.rootname, '.png'])
        gs_call = [
            "-q", "-sDEVICE=png16m", "-o", self.imagename, "-r600", self.filename
        ]
        # The executable is called "gs" when its version banner mentions
        # ghostscript, otherwise fall back to "gsc".
        if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
            gs_call.insert(0, "gs")
        else:
            gs_call.insert(0, "gsc")
        # Close the devnull handle once Ghostscript has finished.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(gs_call, stdout=devnull,
                            stderr=subprocess.STDOUT)

    def _generate_table_bbox(self):
        """Detects line segments and table bounding boxes on the rendered
        image and stores them, scaled to pdf coordinate space, on
        self.table_bbox, self.vertical_segments and
        self.horizontal_segments. The unscaled bounding boxes are kept on
        self.table_bbox_unscaled.

        """
        self.image, self.threshold = adaptive_threshold(
            self.imagename, process_background=self.process_background,
            blocksize=self.threshold_blocksize, c=self.threshold_constant)
        image_width = self.image.shape[1]
        image_height = self.image.shape[0]
        image_width_scaler = image_width / float(self.pdf_width)
        image_height_scaler = image_height / float(self.pdf_height)
        pdf_width_scaler = self.pdf_width / float(image_width)
        pdf_height_scaler = self.pdf_height / float(image_height)
        image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
        pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)

        vertical_mask, vertical_segments = find_lines(
            self.threshold, direction='vertical',
            line_size_scaling=self.line_size_scaling, iterations=self.iterations)
        horizontal_mask, horizontal_segments = find_lines(
            self.threshold, direction='horizontal',
            line_size_scaling=self.line_size_scaling, iterations=self.iterations)

        if self.table_area is not None:
            # user-supplied areas: convert each to image space as
            # (x, y, width, height)
            areas = []
            for area in self.table_area:
                x1, y1, x2, y2 = area.split(",")
                x1 = float(x1)
                y1 = float(y1)
                x2 = float(x2)
                y2 = float(y2)
                x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
            table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask)
        else:
            contours = find_table_contours(vertical_mask, horizontal_mask)
            table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask)

        self.table_bbox_unscaled = copy.deepcopy(table_bbox)

        self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
            table_bbox, vertical_segments, horizontal_segments, pdf_scalers)

    def _generate_columns_and_rows(self, table_idx, tk):
        """Builds the column and row boundaries of one table.

        Also stores the text objects lying inside the table on
        self.t_bbox, sorted top-to-bottom and then left-to-right.

        Parameters
        ----------
        table_idx : int
            Position of the table on the page (not used here).
        tk : tuple
            Key of the table in self.table_bbox; its four values are
            used as the table's outer x and y boundaries.

        Returns
        -------
        cols : list
            List of column x-coordinate tuples.
        rows : list
            List of row y-coordinate tuples.
        v_s : list
            Vertical segments lying inside the table.
        h_s : list
            Horizontal segments lying inside the table.

        """
        # select elements which lie within table_bbox
        t_bbox = {}
        v_s, h_s = segments_in_bbox(
            tk, self.vertical_segments, self.horizontal_segments)
        t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text)
        t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text)
        self.t_bbox = t_bbox

        for direction in t_bbox:
            t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))

        cols, rows = zip(*self.table_bbox[tk])
        cols, rows = list(cols), list(rows)
        cols.extend([tk[0], tk[2]])
        rows.extend([tk[1], tk[3]])
        # sort horizontal and vertical segments
        cols = merge_close_lines(
            sorted(cols), line_close_tol=self.line_close_tol)
        rows = merge_close_lines(
            sorted(rows, reverse=True), line_close_tol=self.line_close_tol)
        # make grid using x and y coord of shortlisted rows and cols
        cols = [(cols[i], cols[i + 1])
                for i in range(0, len(cols) - 1)]
        rows = [(rows[i], rows[i + 1])
                for i in range(0, len(rows) - 1)]

        return cols, rows, v_s, h_s

    def _generate_table(self, table_idx, cols, rows, **kwargs):
        """Builds a Table from the grid and fills it with text.

        Parameters
        ----------
        table_idx : int
            Zero-based position of the table on the page.
        cols : list
            List of column x-coordinate tuples.
        rows : list
            List of row y-coordinate tuples.
        v_s, h_s : list
            Keyword arguments holding the vertical and horizontal
            segments of the table. Both are required.

        Returns
        -------
        table : camelot.core.Table

        Raises
        ------
        ValueError
            If v_s or h_s is missing.

        """
        v_s = kwargs.get('v_s')
        h_s = kwargs.get('h_s')
        if v_s is None or h_s is None:
            raise ValueError('No segments found on {}'.format(self.rootname))

        table = Table(cols, rows)
        # set table edges to True using ver+hor lines
        table = table.set_edges(v_s, h_s, joint_close_tol=self.joint_close_tol)
        # set table border edges to True
        table = table.set_border()
        # set spanning cells to True
        table = table.set_span()

        pos_errors = []
        for direction in self.t_bbox:
            for t in self.t_bbox[direction]:
                indices, error = get_table_index(
                    table, t, direction, split_text=self.split_text,
                    flag_size=self.flag_size)
                # NOTE(review): indices is consumed below as a list of
                # (r_idx, c_idx, text) tuples, so this list-vs-tuple
                # comparison is always True. Presumably it was meant to
                # skip text that could not be placed -- confirm against
                # get_table_index before changing.
                if indices[:2] != (-1, -1):
                    pos_errors.append(error)
                    indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text)
                    for r_idx, c_idx, text in indices:
                        table.cells[r_idx][c_idx].text = text
        accuracy = compute_accuracy([[100, pos_errors]])

        if self.copy_text is not None:
            table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)

        data = table.data
        data = encode_(data)
        table.df = pd.DataFrame(data)
        table.shape = table.df.shape

        whitespace = compute_whitespace(data)
        table.accuracy = accuracy
        table.whitespace = whitespace
        table.order = table_idx + 1
        # the page number is parsed from a file name of the form page-<n>
        table.page = int(os.path.basename(self.rootname).replace('page-', ''))

        return table

    def extract_tables(self, filename):
        """Extracts all tables from a single-page file.

        Parameters
        ----------
        filename : string
            Path to the file to process.

        Returns
        -------
        _tables : list
            List of camelot.core.Table objects, ordered top to bottom.
            Empty when the page has no horizontal text.
        g : camelot.core.Geometry
            Geometry object; populated only when debug is set.

        """
        logger.info('Processing {}'.format(os.path.basename(filename)))
        self._generate_layout(filename)

        if not self.horizontal_text:
            logger.info("No tables found on {}".format(
                os.path.basename(self.rootname)))
            return [], self.g

        self._generate_image()
        self._generate_table_bbox()

        _tables = []
        # sort tables based on y-coord
        for table_idx, tk in enumerate(sorted(self.table_bbox.keys(),
                                              key=lambda x: x[1], reverse=True)):
            cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
            table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
            _tables.append(table)

        if self.debug:
            text = []
            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
            self.g.text = text
            self.g.images = (self.image, self.table_bbox_unscaled)
            self.g.segments = (self.vertical_segments, self.horizontal_segments)
            self.g.tables = _tables

        return _tables, self.g
|
||||
|
|
@ -0,0 +1,370 @@
|
|||
from __future__ import division
|
||||
import os
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from .base import BaseParser
|
||||
from ..core import Table
|
||||
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
||||
compute_whitespace, setup_logging, encode_)
|
||||
|
||||
|
||||
# Module-level logger, obtained from the setup_logging helper imported above.
logger = setup_logging(__name__)
|
||||
|
||||
|
||||
class Stream(BaseParser):
|
||||
"""Stream method of parsing looks for spaces between text
|
||||
to form a table.
|
||||
|
||||
If you want to specify columns when specifying multiple table
|
||||
areas, make sure that the length of both lists are equal.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table_area : list, optional (default: None)
|
||||
List of table areas to analyze as strings of the form
|
||||
x1,y1,x2,y2 where (x1, y1) -> left-top and
|
||||
(x2, y2) -> right-bottom in pdf coordinate space.
|
||||
columns : list, optional (default: None)
|
||||
List of column x-coordinates as strings where the coordinates
|
||||
are comma-separated.
|
||||
split_text : bool, optional (default: False)
|
||||
Whether or not to split a text line if it spans across
|
||||
multiple cells.
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
row_close_tol : int, optional (default: 2)
|
||||
Rows will be formed by combining text vertically
|
||||
within this tolerance.
|
||||
col_close_tol : int, optional (default: 0)
|
||||
Columns will be formed by combining text horizontally
|
||||
within this tolerance.
|
||||
margins : tuple, optional (default: (1.0, 0.5, 0.1))
|
||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||
|
||||
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||
debug : bool, optional (default: False)
|
||||
Whether or not to return all text objects on the page
|
||||
which can be used to generate a matplotlib plot, to get
|
||||
values for table_area(s), columns and debugging.
|
||||
|
||||
"""
|
||||
def __init__(self, table_area=None, columns=None, split_text=False,
|
||||
flag_size=False, row_close_tol=2, col_close_tol=0,
|
||||
margins=(1.0, 0.5, 0.1), debug=False):
|
||||
self.table_area = table_area
|
||||
self.columns = columns
|
||||
self._validate_columns()
|
||||
self.split_text = split_text
|
||||
self.flag_size = flag_size
|
||||
self.row_close_tol = row_close_tol
|
||||
self.col_close_tol = col_close_tol
|
||||
self.char_margin, self.line_margin, self.word_margin = margins
|
||||
self.debug = debug
|
||||
|
||||
@staticmethod
|
||||
def _text_bbox(t_bbox):
|
||||
"""Returns bounding box for the text present on a page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t_bbox : dict
|
||||
Dict with two keys 'horizontal' and 'vertical' with lists of
|
||||
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
||||
|
||||
Returns
|
||||
-------
|
||||
text_bbox : tuple
|
||||
Tuple (x0, y0, x1, y1) in pdf coordinate space.
|
||||
|
||||
"""
|
||||
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
|
||||
ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
|
||||
xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
|
||||
ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
|
||||
text_bbox = (xmin, ymin, xmax, ymax)
|
||||
return text_bbox
|
||||
|
||||
@staticmethod
|
||||
def _group_rows(text, row_close_tol=2):
|
||||
"""Groups PDFMiner text objects into rows vertically
|
||||
within a tolerance.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text : list
|
||||
List of PDFMiner text objects.
|
||||
row_close_tol : int, optional (default: 2)
|
||||
|
||||
Returns
|
||||
-------
|
||||
rows : list
|
||||
Two-dimensional list of text objects grouped into rows.
|
||||
|
||||
"""
|
||||
row_y = 0
|
||||
rows = []
|
||||
temp = []
|
||||
for t in text:
|
||||
# is checking for upright necessary?
|
||||
# if t.get_text().strip() and all([obj.upright for obj in t._objs if
|
||||
# type(obj) is LTChar]):
|
||||
if t.get_text().strip():
|
||||
if not np.isclose(row_y, t.y0, atol=row_close_tol):
|
||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||
temp = []
|
||||
row_y = t.y0
|
||||
temp.append(t)
|
||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||
__ = rows.pop(0) # hacky
|
||||
return rows
|
||||
|
||||
@staticmethod
|
||||
def _merge_columns(l, col_close_tol=0):
|
||||
"""Merges column boundaries horizontally if they overlap
|
||||
or lie within a tolerance.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
l : list
|
||||
List of column x-coordinate tuples.
|
||||
col_close_tol : int, optional (default: 0)
|
||||
|
||||
Returns
|
||||
-------
|
||||
merged : list
|
||||
List of merged column x-coordinate tuples.
|
||||
|
||||
"""
|
||||
merged = []
|
||||
for higher in l:
|
||||
if not merged:
|
||||
merged.append(higher)
|
||||
else:
|
||||
lower = merged[-1]
|
||||
if col_close_tol >= 0:
|
||||
if (higher[0] <= lower[1] or
|
||||
np.isclose(higher[0], lower[1], atol=col_close_tol)):
|
||||
upper_bound = max(lower[1], higher[1])
|
||||
lower_bound = min(lower[0], higher[0])
|
||||
merged[-1] = (lower_bound, upper_bound)
|
||||
else:
|
||||
merged.append(higher)
|
||||
elif col_close_tol < 0:
|
||||
if higher[0] <= lower[1]:
|
||||
if np.isclose(higher[0], lower[1], atol=abs(col_close_tol)):
|
||||
merged.append(higher)
|
||||
else:
|
||||
upper_bound = max(lower[1], higher[1])
|
||||
lower_bound = min(lower[0], higher[0])
|
||||
merged[-1] = (lower_bound, upper_bound)
|
||||
else:
|
||||
merged.append(higher)
|
||||
return merged
|
||||
|
||||
@staticmethod
|
||||
def _join_rows(rows_grouped, text_y_max, text_y_min):
|
||||
"""Makes row coordinates continuous.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
rows_grouped : list
|
||||
Two-dimensional list of text objects grouped into rows.
|
||||
text_y_max : int
|
||||
text_y_min : int
|
||||
|
||||
Returns
|
||||
-------
|
||||
rows : list
|
||||
List of continuous row y-coordinate tuples.
|
||||
|
||||
"""
|
||||
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
|
||||
if len(r) > 0 else 0 for r in rows_grouped]
|
||||
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
||||
rows.insert(0, text_y_max)
|
||||
rows.append(text_y_min)
|
||||
rows = [(rows[i], rows[i + 1])
|
||||
for i in range(0, len(rows) - 1)]
|
||||
return rows
|
||||
|
||||
@staticmethod
|
||||
def _add_columns(cols, text, row_close_tol):
|
||||
"""Adds columns to existing list by taking into account
|
||||
the text that lies outside the current column x-coordinates.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cols : list
|
||||
List of column x-coordinate tuples.
|
||||
text : list
|
||||
List of PDFMiner text objects.
|
||||
ytol : int
|
||||
|
||||
Returns
|
||||
-------
|
||||
cols : list
|
||||
Updated list of column x-coordinate tuples.
|
||||
|
||||
"""
|
||||
if text:
|
||||
text = Stream._group_rows(text, row_close_tol=row_close_tol)
|
||||
elements = [len(r) for r in text]
|
||||
new_cols = [(t.x0, t.x1)
|
||||
for r in text if len(r) == max(elements) for t in r]
|
||||
cols.extend(Stream._merge_columns(sorted(new_cols)))
|
||||
return cols
|
||||
|
||||
@staticmethod
|
||||
def _join_columns(cols, text_x_min, text_x_max):
|
||||
"""Makes column coordinates continuous.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cols : list
|
||||
List of column x-coordinate tuples.
|
||||
text_x_min : int
|
||||
text_y_max : int
|
||||
|
||||
Returns
|
||||
-------
|
||||
cols : list
|
||||
Updated list of column x-coordinate tuples.
|
||||
|
||||
"""
|
||||
cols = sorted(cols)
|
||||
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
||||
cols.insert(0, text_x_min)
|
||||
cols.append(text_x_max)
|
||||
cols = [(cols[i], cols[i + 1])
|
||||
for i in range(0, len(cols) - 1)]
|
||||
return cols
|
||||
|
||||
def _validate_columns(self):
|
||||
if self.table_area is not None and self.columns is not None:
|
||||
if len(self.table_area) != len(self.columns):
|
||||
raise ValueError("Length of table_area and columns"
|
||||
" should be equal")
|
||||
|
||||
def _generate_table_bbox(self):
|
||||
if self.table_area is not None:
|
||||
table_bbox = {}
|
||||
for area in self.table_area:
|
||||
x1, y1, x2, y2 = area.split(",")
|
||||
x1 = float(x1)
|
||||
y1 = float(y1)
|
||||
x2 = float(x2)
|
||||
y2 = float(y2)
|
||||
table_bbox[(x1, y2, x2, y1)] = None
|
||||
else:
|
||||
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
|
||||
self.table_bbox = table_bbox
|
||||
|
||||
def _generate_columns_and_rows(self, table_idx, tk):
|
||||
# select elements which lie within table_bbox
|
||||
t_bbox = {}
|
||||
t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text)
|
||||
t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text)
|
||||
self.t_bbox = t_bbox
|
||||
|
||||
for direction in self.t_bbox:
|
||||
self.t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
||||
|
||||
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
|
||||
rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_close_tol=self.row_close_tol)
|
||||
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
||||
elements = [len(r) for r in rows_grouped]
|
||||
|
||||
if self.columns is not None and self.columns[table_idx] != "":
|
||||
# user has to input boundary columns too
|
||||
# take (0, pdf_width) by default
|
||||
# similar to else condition
|
||||
# len can't be 1
|
||||
cols = self.columns[table_idx].split(',')
|
||||
cols = [float(c) for c in cols]
|
||||
cols.insert(0, text_x_min)
|
||||
cols.append(text_x_max)
|
||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||
else:
|
||||
ncols = max(set(elements), key=elements.count)
|
||||
if ncols == 1:
|
||||
logger.info("No tables found on {}".format(
|
||||
os.path.basename(self.rootname)))
|
||||
cols = [(t.x0, t.x1)
|
||||
for r in rows_grouped if len(r) == ncols for t in r]
|
||||
cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol)
|
||||
inner_text = []
|
||||
for i in range(1, len(cols)):
|
||||
left = cols[i - 1][1]
|
||||
right = cols[i][0]
|
||||
inner_text.extend([t for direction in self.t_bbox
|
||||
for t in self.t_bbox[direction]
|
||||
if t.x0 > left and t.x1 < right])
|
||||
outer_text = [t for direction in self.t_bbox
|
||||
for t in self.t_bbox[direction]
|
||||
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
||||
inner_text.extend(outer_text)
|
||||
cols = self._add_columns(cols, inner_text, self.row_close_tol)
|
||||
cols = self._join_columns(cols, text_x_min, text_x_max)
|
||||
|
||||
return cols, rows
|
||||
|
||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||
table = Table(cols, rows)
|
||||
table = table.set_all_edges()
|
||||
pos_errors = []
|
||||
for direction in self.t_bbox:
|
||||
for t in self.t_bbox[direction]:
|
||||
indices, error = get_table_index(
|
||||
table, t, direction, split_text=self.split_text,
|
||||
flag_size=self.flag_size)
|
||||
if indices[:2] != (-1, -1):
|
||||
pos_errors.append(error)
|
||||
for r_idx, c_idx, text in indices:
|
||||
table.cells[r_idx][c_idx].text = text
|
||||
accuracy = compute_accuracy([[100, pos_errors]])
|
||||
|
||||
data = table.data
|
||||
data = encode_(data)
|
||||
table.df = pd.DataFrame(data)
|
||||
table.shape = table.df.shape
|
||||
|
||||
whitespace = compute_whitespace(data)
|
||||
table.accuracy = accuracy
|
||||
table.whitespace = whitespace
|
||||
table.order = table_idx + 1
|
||||
table.page = int(os.path.basename(self.rootname).replace('page-', ''))
|
||||
|
||||
return table
|
||||
|
||||
def extract_tables(self, filename):
    """Extract every table found on a single-page pdf.

    Parameters
    ----------
    filename : str
        Path of the page file to process.

    Returns
    -------
    tuple
        ``(tables, geometry)`` where ``tables`` is the list of generated
        tables (empty when the page has no horizontal text) and
        ``geometry`` is ``self.g``.
    """
    logger.info('Processing {}'.format(os.path.basename(filename)))
    self._generate_layout(filename)

    # Nothing to do when the layout produced no horizontal text.
    if not self.horizontal_text:
        page_name = os.path.basename(self.rootname)
        logger.info("No tables found on {}".format(page_name))
        return [], self.g

    self._generate_table_bbox()

    # sort tables based on y-coord (second element of each bbox key),
    # largest first
    ordered_keys = sorted(self.table_bbox.keys(),
                          key=lambda x: x[1], reverse=True)
    _tables = []
    for table_idx, tk in enumerate(ordered_keys):
        cols, rows = self._generate_columns_and_rows(table_idx, tk)
        _tables.append(self._generate_table(table_idx, cols, rows))

    if self.debug:
        # Expose text boxes and tables on the geometry object for plotting.
        boxes = [(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]
        boxes += [(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]
        self.g.text = boxes
        self.g.tables = _tables

    return _tables, self.g
|
||||
268
camelot/pdf.py
268
camelot/pdf.py
|
|
@ -1,268 +0,0 @@
|
|||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import itertools
|
||||
import multiprocessing as mp
|
||||
from functools import partial
|
||||
|
||||
import cv2
|
||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||
|
||||
from .utils import get_page_layout, get_text_objects, get_rotation
|
||||
|
||||
|
||||
__all__ = ['Pdf']
|
||||
|
||||
|
||||
def _parse_page_numbers(pagenos):
|
||||
"""Converts list of dicts to list of ints.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
pagenos : list
|
||||
List of dicts representing page ranges. A dict must have only
|
||||
two keys named 'start' and 'end' having int as their value.
|
||||
|
||||
Returns
|
||||
-------
|
||||
page_numbers : list
|
||||
List of int page numbers.
|
||||
"""
|
||||
page_numbers = []
|
||||
for p in pagenos:
|
||||
page_numbers.extend(range(p['start'], p['end'] + 1))
|
||||
page_numbers = sorted(set(page_numbers))
|
||||
return page_numbers
|
||||
|
||||
|
||||
def _save_page(temp, pdfname, pageno):
    """Write one page of a pdf to ``temp`` and undo its rotation if needed.

    The page is saved as ``page-<pageno>.pdf``. When ``get_rotation``
    reports a non-empty rotation, that file is renamed to
    ``p-<pageno>_rotated.pdf`` and a rotated copy is written back under
    the original ``page-<pageno>.pdf`` name.

    Parameters
    ----------
    temp : str
        Directory in which the single-page files are written.
    pdfname : str
        Path of the source pdf.
    pageno : int
        One-based page number to extract.
    """
    sp_path = os.path.join(temp, 'page-{0}.pdf'.format(pageno))
    __, sp_ext = os.path.splitext(sp_path)

    # The source must stay open while the page is written out.
    with open(pdfname, 'rb') as pdffile:
        infile = PdfFileReader(pdffile, strict=False)
        page = infile.getPage(pageno - 1)
        outfile = PdfFileWriter()
        outfile.addPage(page)
        with open(sp_path, 'wb') as f:
            outfile.write(f)

    layout, dim = get_page_layout(sp_path)
    lttextlh = get_text_objects(layout, ltype="lh")
    lttextlv = get_text_objects(layout, ltype="lv")
    ltchar = get_text_objects(layout, ltype="char")
    rotation = get_rotation(lttextlh, lttextlv, ltchar)
    if rotation != '':
        # Build the name from the page number rather than str.replace on
        # the whole path, which would also rewrite "page" inside ``temp``.
        sp_new_path = os.path.join(
            temp, 'p-{0}_rotated{1}'.format(pageno, sp_ext))
        os.rename(sp_path, sp_new_path)
        # Context manager so the handle on the rotated original is closed
        # (it was previously leaked).
        with open(sp_new_path, 'rb') as rotated_file:
            sp_in = PdfFileReader(rotated_file, strict=False)
            sp_out = PdfFileWriter()
            sp_page = sp_in.getPage(0)
            if rotation == 'left':
                sp_page.rotateClockwise(90)
            elif rotation == 'right':
                sp_page.rotateCounterClockwise(90)
            sp_out.addPage(sp_page)
            with open(sp_path, 'wb') as pdf_out:
                sp_out.write(pdf_out)
|
||||
|
||||
|
||||
class Pdf:
    """Pdf manager.

    Handles all operations like temp directory creation, splitting file
    into single page pdfs, running extraction using multiple processes
    and removing the temp directory.

    Parameters
    ----------
    extractor : object
        camelot.stream.Stream or camelot.lattice.Lattice extractor
        object.

    pdfname : string
        Path to pdf file.

    pagenos : list
        List of dicts representing page ranges. A dict must have only
        two keys named 'start' and 'end' having int as their value.
        (optional, default: [{'start': 1, 'end': 1}])

    parallel : bool
        Whether or not to run using multiple processes.
        (optional, default: False)

    clean : bool
        Whether or not to remove the temp directory.
        (optional, default: False)

    Raises
    ------
    TypeError
        If ``pdfname`` does not end with '.pdf'.
    """

    def __init__(self, extractor, pdfname, pagenos=None,
                 parallel=False, clean=False):

        self.extractor = extractor
        self.pdfname = pdfname
        if not self.pdfname.endswith('.pdf'):
            raise TypeError("File format not supported.")
        # None stands in for the documented default so that no mutable
        # object is shared between calls.
        if pagenos is None:
            pagenos = [{'start': 1, 'end': 1}]
        self.pagenos = _parse_page_numbers(pagenos)
        self.parallel = parallel
        if self.parallel:
            self.cpu_count = mp.cpu_count()
            self.pool = mp.Pool(processes=self.cpu_count)
        self.clean = clean
        self.temp = tempfile.mkdtemp()

    def split(self):
        """Splits file into single page pdfs.
        """
        if self.parallel:
            pfunc = partial(_save_page, self.temp, self.pdfname)
            self.pool.map(pfunc, self.pagenos)
        else:
            for p in self.pagenos:
                _save_page(self.temp, self.pdfname, p)

    def extract(self):
        """Runs table extraction by calling extractor.get_tables
        on all single page pdfs.

        Returns
        -------
        tables : dict
            Merged dict of the non-None results of extractor.get_tables.
        """
        self.split()
        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
                 for p in self.pagenos]
        if self.parallel:
            tables = self.pool.map(self.extractor.get_tables, pages)
            tables = {k: v for d in tables if d is not None for k, v in d.items()}
        else:
            tables = {}
            method = self.extractor.method
            # 'ocrl' is used for both set-up and collection; the collection
            # step previously tested 'ocr', so the lists stayed empty.
            line_methods = ('lattice', 'ocrl')
            if self.extractor.debug:
                if method == 'stream':
                    self.debug = self.extractor.debug
                    self.debug_text = []
                elif method in line_methods:
                    self.debug = self.extractor.debug
                    self.debug_images = []
                    self.debug_segments = []
                    self.debug_tables = []
                elif method == 'ocrs':
                    self.debug = self.extractor.debug
                    self.debug_images = []
            for p in pages:
                table = self.extractor.get_tables(p)
                if table is not None:
                    tables.update(table)
                if self.extractor.debug:
                    if method == 'stream':
                        self.debug_text.append(self.extractor.debug_text)
                    elif method in line_methods:
                        self.debug_images.append(self.extractor.debug_images)
                        self.debug_segments.append(self.extractor.debug_segments)
                        self.debug_tables.append(self.extractor.debug_tables)
                    elif method == 'ocrs':
                        self.debug_images.append(self.extractor.debug_images)
        if self.clean:
            self.remove_tempdir()
        return tables

    def remove_tempdir(self):
        """Removes temporary directory that was created to save single
        page pdfs and their images.
        """
        shutil.rmtree(self.temp)

    def debug_plot(self):
        """Generates a matplotlib plot based on the selected extractor
        debug option.

        Raises
        ------
        ValueError
            If a Lattice-only option is selected but the matching debug
            data is missing.
        UserWarning
            If no recognised debug option has been recorded by extract().
        """
        import matplotlib.pyplot as plt
        import matplotlib.patches as patches

        # self.debug only exists after a debug-enabled extract(); fall
        # through to the UserWarning below instead of an AttributeError.
        debug = getattr(self, 'debug', None)

        if debug is True:
            if hasattr(self, 'debug_text'):
                for text in self.debug_text:
                    fig = plt.figure()
                    ax = fig.add_subplot(111, aspect='equal')
                    xs, ys = [], []
                    for t in text:
                        # t is (x0, y0, x1, y1): x values are t[0] and
                        # t[2], y values are t[1] and t[3].
                        xs.extend([t[0], t[2]])
                        ys.extend([t[1], t[3]])
                        ax.add_patch(
                            patches.Rectangle(
                                (t[0], t[1]),
                                t[2] - t[0],
                                t[3] - t[1]
                            )
                        )
                    ax.set_xlim(min(xs) - 10, max(xs) + 10)
                    ax.set_ylim(min(ys) - 10, max(ys) + 10)
                    plt.show()
            elif hasattr(self, 'debug_images'):
                for img in self.debug_images:
                    plt.imshow(img)
                    plt.show()
        elif debug == 'contour':
            try:
                for img, table_bbox in self.debug_images:
                    for t in table_bbox.keys():
                        cv2.rectangle(img, (t[0], t[1]),
                                      (t[2], t[3]), (255, 0, 0), 3)
                    plt.imshow(img)
                    plt.show()
            except AttributeError:
                raise ValueError("This option can only be used with Lattice.")
        elif debug == 'joint':
            try:
                for img, table_bbox in self.debug_images:
                    x_coord = []
                    y_coord = []
                    for k in table_bbox.keys():
                        for coord in table_bbox[k]:
                            x_coord.append(coord[0])
                            y_coord.append(coord[1])
                    max_x, max_y = max(x_coord), max(y_coord)
                    plt.plot(x_coord, y_coord, 'ro')
                    # y axis runs from max_y + 100 down to 0.
                    plt.axis([0, max_x + 100, max_y + 100, 0])
                    plt.imshow(img)
                    plt.show()
            except AttributeError:
                raise ValueError("This option can only be used with Lattice.")
        elif debug == 'line':
            try:
                for v_s, h_s in self.debug_segments:
                    for v in v_s:
                        plt.plot([v[0], v[2]], [v[1], v[3]])
                    for h in h_s:
                        plt.plot([h[0], h[2]], [h[1], h[3]])
                    plt.show()
            except AttributeError:
                raise ValueError("This option can only be used with Lattice.")
        elif debug == 'table':
            try:
                for tables in self.debug_tables:
                    for table in tables:
                        for r in range(len(table.rows)):
                            for c in range(len(table.cols)):
                                cell = table.cells[r][c]
                                # Draw only the edges flagged on the cell.
                                if cell.left:
                                    plt.plot([cell.lb[0], cell.lt[0]],
                                             [cell.lb[1], cell.lt[1]])
                                if cell.right:
                                    plt.plot([cell.rb[0], cell.rt[0]],
                                             [cell.rb[1], cell.rt[1]])
                                if cell.top:
                                    plt.plot([cell.lt[0], cell.rt[0]],
                                             [cell.lt[1], cell.rt[1]])
                                if cell.bottom:
                                    plt.plot([cell.lb[0], cell.rb[0]],
                                             [cell.lb[1], cell.rb[1]])
                    plt.show()
            except AttributeError:
                raise ValueError("This option can only be used with Lattice.")
        else:
            raise UserWarning("This method can only be called after"
                              " debug has been specified.")
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
import cv2
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.patches as patches
|
||||
|
||||
from .handlers import PDFHandler
|
||||
|
||||
|
||||
def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
    """Plot geometry found on pdf page based on type specified,
    useful for debugging and playing with different parameters to get
    the best output.

    Note: kwargs annotated with ^ can only be used with mesh=False
    and kwargs annotated with * can only be used with mesh=True.

    Parameters
    ----------
    filepath : str
        Path to pdf file.
    pages : str
        Comma-separated page numbers to parse.
        Example: 1,3,4 or 1,4-end
    mesh : bool (default: False)
        Whether or not to use Lattice method of parsing. Stream
        is used by default.
    geometry_type : str, optional (default: 'text')
        'text' : Plot text objects found on page, useful to get
                 table_area and columns coordinates.
        'table' : Plot parsed table.
        'contour'* : Plot detected rectangles.
        'joint'* : Plot detected line intersections.
        'line'* : Plot detected lines.
    table_area : list, optional (default: None)
        List of table areas to analyze as strings of the form
        x1,y1,x2,y2 where (x1, y1) -> left-top and
        (x2, y2) -> right-bottom in pdf coordinate space.
    columns^ : list, optional (default: None)
        List of column x-coordinates as strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Whether or not to split a text line if it spans across
        multiple cells.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string, useful for
        super and subscripts.
    row_close_tol^ : int, optional (default: 2)
        Rows will be formed by combining text vertically
        within this tolerance.
    col_close_tol^ : int, optional (default: 0)
        Columns will be formed by combining text horizontally
        within this tolerance.
    process_background* : bool, optional (default: False)
        Whether or not to process lines that are in background.
    line_size_scaling* : int, optional (default: 15)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.

        The larger this value, smaller the detected lines. Making it
        too large will lead to text being detected as lines.
    copy_text* : list, optional (default: None)
        {'h', 'v'}
        Select one or more strings from above and pass them as a list
        to specify the direction in which text should be copied over
        when a cell spans multiple rows or columns.
    shift_text* : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Select one or more strings from above and pass them as a list
        to specify where the text in a spanning cell should flow.
    line_close_tol* : int, optional (default: 2)
        Tolerance parameter used to merge vertical and horizontal
        detected lines which lie close to each other.
    joint_close_tol* : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.

        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    threshold_constant : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.

        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    iterations : int, optional (default: 0)
        Number of times erosion/dilation is applied.

        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    margins : tuple
        PDFMiner margins. (char_margin, line_margin, word_margin)

        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
    debug : bool, optional (default: False)
        Whether or not to return all text objects on the page
        which can be used to generate a matplotlib plot, to get
        values for table_area(s) and debugging. Any value passed here
        is overwritten with ``bool(geometry_type)``.

    Raises
    ------
    ValueError
        If geometry_type is 'contour', 'joint' or 'line' and mesh is
        False.

    """
    # validate kwargs?
    p = PDFHandler(filepath, pages)
    # The parser must run in debug mode to populate the geometry object.
    debug = bool(geometry_type)
    kwargs.update({'debug': debug})
    __, geometry = p.parse(mesh=mesh, **kwargs)

    if geometry_type == 'text':
        for text in geometry.text:
            fig = plt.figure()
            ax = fig.add_subplot(111, aspect='equal')
            xs, ys = [], []
            for t in text:
                # The Rectangle below treats t as (x0, y0, x1, y1), so the
                # x values are t[0]/t[2] and the y values t[1]/t[3]; the
                # axis limits previously mixed the two.
                xs.extend([t[0], t[2]])
                ys.extend([t[1], t[3]])
                ax.add_patch(
                    patches.Rectangle(
                        (t[0], t[1]),
                        t[2] - t[0],
                        t[3] - t[1]
                    )
                )
            ax.set_xlim(min(xs) - 10, max(xs) + 10)
            ax.set_ylim(min(ys) - 10, max(ys) + 10)
            plt.show()
    elif geometry_type == 'table':
        for tables in geometry.tables:
            for table in tables:
                for row in table.cells:
                    for cell in row:
                        # Draw only the edges flagged on the cell.
                        if cell.left:
                            plt.plot([cell.lb[0], cell.lt[0]],
                                     [cell.lb[1], cell.lt[1]])
                        if cell.right:
                            plt.plot([cell.rb[0], cell.rt[0]],
                                     [cell.rb[1], cell.rt[1]])
                        if cell.top:
                            plt.plot([cell.lt[0], cell.rt[0]],
                                     [cell.lt[1], cell.rt[1]])
                        if cell.bottom:
                            plt.plot([cell.lb[0], cell.rb[0]],
                                     [cell.lb[1], cell.rb[1]])
            plt.show()
    elif geometry_type == 'contour':
        if not mesh:
            raise ValueError("Use mesh=True")
        for img, table_bbox in geometry.images:
            for t in table_bbox.keys():
                cv2.rectangle(img, (t[0], t[1]),
                              (t[2], t[3]), (255, 0, 0), 3)
            plt.imshow(img)
            plt.show()
    elif geometry_type == 'joint':
        if not mesh:
            raise ValueError("Use mesh=True")
        for img, table_bbox in geometry.images:
            x_coord = []
            y_coord = []
            for k in table_bbox.keys():
                for coord in table_bbox[k]:
                    x_coord.append(coord[0])
                    y_coord.append(coord[1])
            max_x, max_y = max(x_coord), max(y_coord)
            plt.plot(x_coord, y_coord, 'ro')
            # y axis runs from max_y + 100 down to 0.
            plt.axis([0, max_x + 100, max_y + 100, 0])
            plt.imshow(img)
            plt.show()
    elif geometry_type == 'line':
        if not mesh:
            raise ValueError("Use mesh=True")
        for v_s, h_s in geometry.segments:
            for v in v_s:
                plt.plot([v[0], v[2]], [v[1], v[3]])
            for h in h_s:
                plt.plot([h[0], h[2]], [h[1], h[3]])
            plt.show()
|
||||
|
|
@ -1,428 +0,0 @@
|
|||
from __future__ import division
|
||||
import os
|
||||
import copy
|
||||
import types
|
||||
import logging
|
||||
import copy_reg
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .table import Table
|
||||
from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
|
||||
encode_list, get_text_objects, get_page_layout)
|
||||
|
||||
|
||||
__all__ = ['Stream']
|
||||
logger = logging.getLogger('app_logger')
|
||||
|
||||
|
||||
def _reduce_method(m):
|
||||
if m.im_self is None:
|
||||
return getattr, (m.im_class, m.im_func.func_name)
|
||||
else:
|
||||
return getattr, (m.im_self, m.im_func.func_name)
|
||||
copy_reg.pickle(types.MethodType, _reduce_method)
|
||||
|
||||
|
||||
def _text_bbox(t_bbox):
|
||||
"""Returns bounding box for the text present on a page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t_bbox : dict
|
||||
Dict with two keys 'horizontal' and 'vertical' with lists of
|
||||
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
||||
|
||||
Returns
|
||||
-------
|
||||
text_bbox : tuple
|
||||
Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate
|
||||
space.
|
||||
"""
|
||||
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
|
||||
ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
|
||||
xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
|
||||
ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
|
||||
text_bbox = (xmin, ymin, xmax, ymax)
|
||||
return text_bbox
|
||||
|
||||
|
||||
def _group_rows(text, ytol=2):
|
||||
"""Groups PDFMiner text objects into rows using their
|
||||
y-coordinates taking into account some tolerance ytol.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text : list
|
||||
List of PDFMiner text objects.
|
||||
|
||||
ytol : int
|
||||
Tolerance parameter.
|
||||
(optional, default: 2)
|
||||
|
||||
Returns
|
||||
-------
|
||||
rows : list
|
||||
Two-dimensional list of text objects grouped into rows.
|
||||
"""
|
||||
row_y = 0
|
||||
rows = []
|
||||
temp = []
|
||||
for t in text:
|
||||
# is checking for upright necessary?
|
||||
# if t.get_text().strip() and all([obj.upright for obj in t._objs if
|
||||
# type(obj) is LTChar]):
|
||||
if t.get_text().strip():
|
||||
if not np.isclose(row_y, t.y0, atol=ytol):
|
||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||
temp = []
|
||||
row_y = t.y0
|
||||
temp.append(t)
|
||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||
__ = rows.pop(0) # hacky
|
||||
return rows
|
||||
|
||||
|
||||
def _merge_columns(l, mtol=0):
|
||||
"""Merges column boundaries if they overlap or lie within some
|
||||
tolerance mtol.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
l : list
|
||||
List of column coordinate tuples.
|
||||
|
||||
mtol : int
|
||||
TODO
|
||||
(optional, default: 0)
|
||||
|
||||
Returns
|
||||
-------
|
||||
merged : list
|
||||
List of merged column coordinate tuples.
|
||||
"""
|
||||
merged = []
|
||||
for higher in l:
|
||||
if not merged:
|
||||
merged.append(higher)
|
||||
else:
|
||||
lower = merged[-1]
|
||||
if mtol >= 0:
|
||||
if (higher[0] <= lower[1] or
|
||||
np.isclose(higher[0], lower[1], atol=mtol)):
|
||||
upper_bound = max(lower[1], higher[1])
|
||||
lower_bound = min(lower[0], higher[0])
|
||||
merged[-1] = (lower_bound, upper_bound)
|
||||
else:
|
||||
merged.append(higher)
|
||||
elif mtol < 0:
|
||||
if higher[0] <= lower[1]:
|
||||
if np.isclose(higher[0], lower[1], atol=abs(mtol)):
|
||||
merged.append(higher)
|
||||
else:
|
||||
upper_bound = max(lower[1], higher[1])
|
||||
lower_bound = min(lower[0], higher[0])
|
||||
merged[-1] = (lower_bound, upper_bound)
|
||||
else:
|
||||
merged.append(higher)
|
||||
return merged
|
||||
|
||||
|
||||
def _join_rows(rows_grouped, text_y_max, text_y_min):
|
||||
"""Makes row coordinates continuous.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
rows_grouped : list
|
||||
Two-dimensional list of text objects grouped into rows.
|
||||
|
||||
text_y_max : int
|
||||
|
||||
text_y_min : int
|
||||
|
||||
Returns
|
||||
-------
|
||||
rows : list
|
||||
List of continuous row coordinate tuples.
|
||||
"""
|
||||
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
|
||||
if len(r) > 0 else 0 for r in rows_grouped]
|
||||
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
||||
rows.insert(0, text_y_max)
|
||||
rows.append(text_y_min)
|
||||
rows = [(rows[i], rows[i + 1])
|
||||
for i in range(0, len(rows) - 1)]
|
||||
return rows
|
||||
|
||||
|
||||
def _join_columns(cols, text_x_min, text_x_max):
|
||||
"""Makes column coordinates continuous.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cols : list
|
||||
List of column coordinate tuples.
|
||||
|
||||
text_x_min : int
|
||||
|
||||
text_y_max : int
|
||||
|
||||
Returns
|
||||
-------
|
||||
cols : list
|
||||
Updated list of column coordinate tuples.
|
||||
"""
|
||||
cols = sorted(cols)
|
||||
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
||||
cols.insert(0, text_x_min)
|
||||
cols.append(text_x_max)
|
||||
cols = [(cols[i], cols[i + 1])
|
||||
for i in range(0, len(cols) - 1)]
|
||||
return cols
|
||||
|
||||
|
||||
def _add_columns(cols, text, ytol):
|
||||
"""Adds columns to existing list by taking into account
|
||||
the text that lies outside the current column coordinates.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cols : list
|
||||
List of column coordinate tuples.
|
||||
|
||||
text : list
|
||||
List of PDFMiner text objects.
|
||||
|
||||
ytol : int
|
||||
Tolerance parameter.
|
||||
|
||||
Returns
|
||||
-------
|
||||
cols : list
|
||||
Updated list of column coordinate tuples.
|
||||
"""
|
||||
if text:
|
||||
text = _group_rows(text, ytol=ytol)
|
||||
elements = [len(r) for r in text]
|
||||
new_cols = [(t.x0, t.x1)
|
||||
for r in text if len(r) == max(elements) for t in r]
|
||||
cols.extend(_merge_columns(sorted(new_cols)))
|
||||
return cols
|
||||
|
||||
|
||||
class Stream:
    """Stream looks for spaces between text elements to form a table.

    If you want to give columns, ytol or mtol for each table
    when specifying multiple table areas, make sure that their length
    is equal to the length of table_area. Mapping between them is based
    on index. A ytol or mtol list holding a single value is applied to
    every table.

    If you don't want to specify columns for some of the tables in a pdf
    page having multiple tables, pass them as empty strings.
    For example: ['', 'x1,x2,x3,x4', '']

    Parameters
    ----------
    table_area : list
        List of strings of the form x1,y1,x2,y2 where
        (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
        coordinate space, denoting table areas to analyze.
        (optional, default: None)

    columns : list
        List of strings where each string is comma-separated values of
        x-coordinates in PDFMiner's coordinate space.
        (optional, default: None)

    ytol : list
        List of ints specifying the y-tolerance parameters.
        (optional, default: [2])

    mtol : list
        List of ints specifying the m-tolerance parameters.
        (optional, default: [0])

    margins : tuple
        PDFMiner margins. (char_margin, line_margin, word_margin)
        (optional, default: (1.0, 0.5, 0.1))

    split_text : bool
        Whether or not to split a text line if it spans across
        different cells.
        (optional, default: False)

    flag_size : bool
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string, useful for
        super and subscripts.
        (optional, default: True)

    debug : bool
        Set to True to generate a matplotlib plot of
        LTTextLineHorizontals in order to select table_area, columns.
        (optional, default: False)
    """
    def __init__(self, table_area=None, columns=None, ytol=None, mtol=None,
                 margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
                 debug=False):

        self.method = 'stream'
        self.table_area = table_area
        self.columns = columns
        # None stands in for the documented defaults so that instances do
        # not share one mutable list.
        self.ytol = [2] if ytol is None else ytol
        self.mtol = [0] if mtol is None else mtol
        self.char_margin, self.line_margin, self.word_margin = margins
        self.split_text = split_text
        self.flag_size = flag_size
        self.debug = debug

    def get_tables(self, pdfname):
        """Expects a single page pdf as input with rotation corrected.

        Parameters
        ---------
        pdfname : string
            Path to single page pdf file.

        Returns
        -------
        page : dict
            Maps the page's base name to a dict of per-table data, or to
            None when the page has no horizontal text. In debug mode
            None is returned after ``self.debug_text`` is filled.

        Raises
        ------
        ValueError
            If table_area and columns are both given with different
            lengths.
        """
        layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
            line_margin=self.line_margin, word_margin=self.word_margin)
        lttextlh = get_text_objects(layout, ltype="lh")
        lttextlv = get_text_objects(layout, ltype="lv")
        ltchar = get_text_objects(layout, ltype="char")
        width, height = dim
        bname, __ = os.path.splitext(pdfname)
        logger.info('Processing {0}.'.format(os.path.basename(bname)))
        if not lttextlh:
            warnings.warn("{0}: Page contains no text.".format(
                os.path.basename(bname)))
            return {os.path.basename(bname): None}

        if self.debug:
            # Debug mode only records text boxes; no tables are built.
            self.debug_text = []
            self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
            self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
            return None

        if self.table_area is not None:
            if self.columns is not None:
                if len(self.table_area) != len(self.columns):
                    raise ValueError("{0}: Length of table area and columns"
                                     " should be equal.".format(os.path.basename(bname)))

            table_bbox = {}
            for area in self.table_area:
                x1, y1, x2, y2 = area.split(",")
                x1 = float(x1)
                y1 = float(y1)
                x2 = float(x2)
                y2 = float(y2)
                # Key is stored as (x1, y2, x2, y1).
                table_bbox[(x1, y2, x2, y1)] = None
        else:
            table_bbox = {(0, 0, width, height): None}

        # A single tolerance applies to every table. This used to happen
        # only for the default values, so e.g. ytol=[3] with several
        # table areas raised IndexError.
        if len(self.ytol) == 1:
            ytolerance = copy.deepcopy(self.ytol) * len(table_bbox)
        else:
            ytolerance = copy.deepcopy(self.ytol)

        if len(self.mtol) == 1:
            mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
        else:
            mtolerance = copy.deepcopy(self.mtol)

        page = {}
        tables = {}
        # sort tables based on y-coord
        for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)):
            # select elements which lie within table_bbox
            table_data = {}
            t_bbox = {}
            t_bbox['horizontal'] = text_in_bbox(k, lttextlh)
            t_bbox['vertical'] = text_in_bbox(k, lttextlv)
            char_bbox = text_in_bbox(k, ltchar)
            table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
            for direction in t_bbox:
                t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
            text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
            rows_grouped = _group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no])
            rows = _join_rows(rows_grouped, text_y_max, text_y_min)
            elements = [len(r) for r in rows_grouped]

            guess = False
            if self.columns is not None and self.columns[table_no] != "":
                # user has to input boundary columns too
                # take (0, width) by default
                # similar to else condition
                # len can't be 1
                cols = self.columns[table_no].split(',')
                cols = [float(c) for c in cols]
                cols.insert(0, text_x_min)
                cols.append(text_x_max)
                cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
            else:
                guess = True
                # Most frequent number of text objects per row.
                ncols = max(set(elements), key=elements.count)
                # Count rows whose length differs from the mode; a plain
                # count, since filter() has no len() on Python 3.
                len_non_mode = sum(1 for x in elements if x != ncols)
                if ncols == 1:
                    # no tables detected
                    warnings.warn("{0}: Page contains no tables.".format(
                        os.path.basename(bname)))
                cols = [(t.x0, t.x1)
                        for r in rows_grouped if len(r) == ncols for t in r]
                cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no])
                inner_text = []
                for i in range(1, len(cols)):
                    left = cols[i - 1][1]
                    right = cols[i][0]
                    inner_text.extend([t for direction in t_bbox
                                       for t in t_bbox[direction]
                                       if t.x0 > left and t.x1 < right])
                outer_text = [t for direction in t_bbox
                              for t in t_bbox[direction]
                              if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
                inner_text.extend(outer_text)
                cols = _add_columns(cols, inner_text, ytolerance[table_no])
                cols = _join_columns(cols, text_x_min, text_x_max)

            table = Table(cols, rows)
            table = table.set_all_edges()
            assignment_errors = []
            table_data['split_text'] = []
            table_data['superscript'] = []
            for direction in t_bbox:
                for t in t_bbox[direction]:
                    indices, error = get_table_index(
                        table, t, direction, split_text=self.split_text,
                        flag_size=self.flag_size)
                    assignment_errors.append(error)
                    if len(indices) > 1:
                        table_data['split_text'].append(indices)
                    for r_idx, c_idx, text in indices:
                        if all(s in text for s in ['<s>', '</s>']):
                            table_data['superscript'].append((r_idx, c_idx, text))
                        table.cells[r_idx][c_idx].add_text(text)
            if guess:
                score = get_score([[66, assignment_errors], [34, [len_non_mode / len(elements)]]])
            else:
                score = get_score([[100, assignment_errors]])

            table_data['score'] = score
            ar = table.get_list()
            ar = encode_list(ar)
            table_data['data'] = ar
            empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
            table_data['empty_p'] = empty_p
            table_data['r_nempty_cells'] = r_nempty_cells
            table_data['c_nempty_cells'] = c_nempty_cells
            table_data['nrows'] = len(ar)
            table_data['ncols'] = len(ar[0])
            tables['table-{0}'.format(table_no + 1)] = table_data
        page[os.path.basename(bname)] = tables

        return page
|
||||
236
camelot/table.py
236
camelot/table.py
|
|
@ -1,236 +0,0 @@
|
|||
import numpy as np
|
||||
|
||||
from .cell import Cell
|
||||
|
||||
|
||||
class Table:
    """Table.

    Defines a table object with coordinates relative to a left-bottom
    origin, which is also PDFMiner's coordinate space.

    Parameters
    ----------
    cols : list
        List of tuples representing column x-coordinates in increasing
        order.

    rows : list
        List of tuples representing row y-coordinates in decreasing
        order.

    Attributes
    ----------
    cells : list
        List of cell objects with row-major ordering, i.e.
        ``cells[r][c]`` is the cell in row ``r`` and column ``c``.

    nocont_ : int
        Number of lines that did not contribute to setting cell edges.

    image : object
        Initialised to None; no method of this class assigns it.
    """

    def __init__(self, cols, rows):
        self.cols = cols
        self.rows = rows
        # One Cell per (row, column) pair, built from the column tuple
        # (c[0], c[1]) and the row tuple (r[0], r[1]).
        # NOTE(review): argument order presumably is (x1, y1, x2, y2);
        # confirm against the Cell constructor.
        self.cells = [[Cell(c[0], r[1], c[1], r[0])
                       for c in cols] for r in rows]
        self.nocont_ = 0
        self.image = None

    @staticmethod
    def _close_indices(bounds, value, tol):
        """Returns indices of tuples in bounds whose first element is
        within tol of value (as decided by numpy.isclose).
        """
        return [idx for idx, b in enumerate(bounds)
                if np.isclose(value, b[0], atol=tol)]

    def set_all_edges(self):
        """Sets all table edges to True.

        Returns
        -------
        self : Table
        """
        for r in range(len(self.rows)):
            for c in range(len(self.cols)):
                cell = self.cells[r][c]
                cell.left = True
                cell.right = True
                cell.top = True
                cell.bottom = True
        return self

    def set_border_edges(self):
        """Sets table border edges to True.

        A table without rows or without columns has no border cells
        and is returned unchanged.

        Returns
        -------
        self : Table
        """
        n_rows = len(self.rows)
        n_cols = len(self.cols)
        if not n_rows or not n_cols:
            return self
        for r in range(n_rows):
            self.cells[r][0].left = True
            self.cells[r][n_cols - 1].right = True
        for c in range(n_cols):
            self.cells[0][c].top = True
            self.cells[n_rows - 1][c].bottom = True
        return self

    def set_edges(self, vertical, horizontal, jtol=2):
        """Sets a cell's edges to True depending on whether they
        overlap with lines found by imgproc.

        Parameters
        ----------
        vertical : list
            List of vertical lines detected by imgproc. Coordinates
            scaled and translated to the PDFMiner's coordinate space.
            Element 0 of a line is matched against column starts,
            element 3 against the row where the line begins and
            element 1 against the row where it ends.

        horizontal : list
            List of horizontal lines detected by imgproc. Coordinates
            scaled and translated to the PDFMiner's coordinate space.
            Element 1 of a line is matched against row starts,
            element 0 against the column where the line begins and
            element 2 against the column where it ends.

        jtol : int, optional (default: 2)
            Absolute tolerance used when matching a line coordinate
            with a row/column coordinate.

        Returns
        -------
        self : Table
        """
        n_rows = len(self.rows)
        n_cols = len(self.cols)

        for v in vertical:
            col_hits = self._close_indices(self.cols, v[0], jtol)
            start_hits = self._close_indices(self.rows, v[3], jtol)
            end_hits = self._close_indices(self.rows, v[1], jtol)
            if not start_hits:
                # The line does not begin at any known row.
                self.nocont_ += 1
                continue
            first = start_hits[0]
            # Without a matching end row the line runs to the last row.
            stop = end_hits[0] if end_hits else n_rows
            if not col_hits:
                # No column starts at this x: treated as the right
                # edge of the last column.
                for r in range(first, stop):
                    self.cells[r][n_cols - 1].right = True
                continue
            c = col_hits[0]
            for r in range(first, stop):
                self.cells[r][c].left = True
                # The first column has no left neighbour; indexing
                # c - 1 there would wrap around to the last column.
                if c > 0:
                    self.cells[r][c - 1].right = True

        for h in horizontal:
            row_hits = self._close_indices(self.rows, h[1], jtol)
            start_hits = self._close_indices(self.cols, h[0], jtol)
            end_hits = self._close_indices(self.cols, h[2], jtol)
            if not start_hits:
                # The line does not begin at any known column.
                self.nocont_ += 1
                continue
            first = start_hits[0]
            # Without a matching end column the line runs to the last
            # column.
            stop = end_hits[0] if end_hits else n_cols
            if not row_hits:
                # No row starts at this y: treated as the bottom edge
                # of the last row.
                for c in range(first, stop):
                    self.cells[n_rows - 1][c].bottom = True
                continue
            r = row_hits[0]
            for c in range(first, stop):
                self.cells[r][c].top = True
                # The first row has no neighbour above; indexing r - 1
                # there would wrap around to the last row.
                if r > 0:
                    self.cells[r - 1][c].bottom = True

        return self

    def set_spanning(self):
        """Sets a cell's spanning_h or spanning_v attribute to True
        depending on whether the cell spans/extends horizontally or
        vertically.

        The decision uses the value returned by the cell's
        get_bounded_edges() together with its left/right/top/bottom
        flags; only the values 3 and 2 can mark a cell as spanning.

        Returns
        -------
        self : Table
        """
        for r in range(len(self.rows)):
            for c in range(len(self.cols)):
                cell = self.cells[r][c]
                bound = cell.get_bounded_edges()
                if bound not in (2, 3):
                    continue
                left, right = bool(cell.left), bool(cell.right)
                top, bottom = bool(cell.top), bool(cell.bottom)
                if bound == 3:
                    # Exactly one side edge missing -> horizontal span;
                    # exactly one of top/bottom missing -> vertical.
                    if top and bottom and left != right:
                        cell.spanning_h = True
                    elif left and right and top != bottom:
                        cell.spanning_v = True
                elif left and right:
                    if not top and not bottom:
                        cell.spanning_v = True
                elif top and bottom:
                    if not left and not right:
                        cell.spanning_h = True

        return self

    def get_list(self):
        """Returns a two-dimensional list of text assigned to each
        cell.

        Returns
        -------
        ar : list
            One inner list per row holding the stripped result of
            get_text() for every cell of that row.
        """
        return [[self.cells[r][c].get_text().strip()
                 for c in range(len(self.cols))]
                for r in range(len(self.rows))]
|
||||
398
camelot/utils.py
398
camelot/utils.py
|
|
@ -18,18 +18,47 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
|||
LTTextLineVertical)
|
||||
|
||||
|
||||
def setup_logging(name):
    """Sets up a logger with StreamHandler.

    The handler is attached only once per logger: loggers returned by
    logging.getLogger are shared per name, so attaching a new handler
    on every call would print each record once per call.

    Parameters
    ----------
    name : str
        Name passed to logging.getLogger.

    Returns
    -------
    logger : logging.Logger
        Logger carrying a single StreamHandler (level INFO) installed
        by this function. The logger's own level is left untouched.

    """
    logger = logging.getLogger(name)

    # A handler installed by an earlier call is tagged; reuse it.
    for existing in logger.handlers:
        if getattr(existing, '_from_setup_logging', False):
            return logger

    format_string = '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s'
    formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')

    handler = logging.StreamHandler()
    handler.setLevel(logging.INFO)
    handler.setFormatter(formatter)
    handler._from_setup_logging = True

    logger.addHandler(handler)

    return logger
|
||||
|
||||
|
||||
logger = setup_logging(__name__)
|
||||
|
||||
|
||||
def translate(x1, x2):
    """Shifts x2 by the offset x1.

    Parameters
    ----------
    x1 : float
        Offset to add.
    x2 : float
        Value to be shifted.

    Returns
    -------
    x2 : float
        Sum of both arguments.

    """
    # Augmented assignment is kept on purpose: for objects that
    # implement in-place addition the operand itself is updated.
    x2 += x1
    return x2
|
||||
|
|
@ -41,12 +70,12 @@ def scale(x, s):
|
|||
Parameters
|
||||
----------
|
||||
x : float
|
||||
|
||||
s : float
|
||||
|
||||
Returns
|
||||
-------
|
||||
x : float
|
||||
|
||||
"""
|
||||
x *= s
|
||||
return x
|
||||
|
|
@ -58,21 +87,17 @@ def rotate(x1, y1, x2, y2, angle):
|
|||
Parameters
|
||||
----------
|
||||
x1 : float
|
||||
|
||||
y1 : float
|
||||
|
||||
x2 : float
|
||||
|
||||
y2 : float
|
||||
|
||||
angle : float
|
||||
Angle in radians.
|
||||
|
||||
Returns
|
||||
-------
|
||||
xnew : float
|
||||
|
||||
ynew : float
|
||||
|
||||
"""
|
||||
s = np.sin(angle)
|
||||
c = np.cos(angle)
|
||||
|
|
@ -85,17 +110,16 @@ def rotate(x1, y1, x2, y2, angle):
|
|||
return xnew, ynew
|
||||
|
||||
|
||||
def scale_to_image(k, factors):
|
||||
"""Translates and scales PDFMiner coordinates to OpenCV's coordinate
|
||||
space.
|
||||
def scale_pdf(k, factors):
|
||||
"""Translates and scales pdf coordinate space to image
|
||||
coordinate space.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
k : tuple
|
||||
Tuple (x1, y1, x2, y2) representing table bounding box where
|
||||
(x1, y1) -> lt and (x2, y2) -> rb in PDFMiner's coordinate
|
||||
(x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate
|
||||
space.
|
||||
|
||||
factors : tuple
|
||||
Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
|
||||
first two elements are scaling factors and pdf_y is height of
|
||||
|
|
@ -105,8 +129,9 @@ def scale_to_image(k, factors):
|
|||
-------
|
||||
knew : tuple
|
||||
Tuple (x1, y1, x2, y2) representing table bounding box where
|
||||
(x1, y1) -> lt and (x2, y2) -> rb in OpenCV's coordinate
|
||||
(x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate
|
||||
space.
|
||||
|
||||
"""
|
||||
x1, y1, x2, y2 = k
|
||||
scaling_factor_x, scaling_factor_y, pdf_y = factors
|
||||
|
|
@ -118,22 +143,19 @@ def scale_to_image(k, factors):
|
|||
return knew
|
||||
|
||||
|
||||
def scale_to_pdf(tables, v_segments, h_segments, factors):
|
||||
"""Translates and scales OpenCV coordinates to PDFMiner's coordinate
|
||||
space.
|
||||
def scale_image(tables, v_segments, h_segments, factors):
|
||||
"""Translates and scales image coordinate space to pdf
|
||||
coordinate space.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
tables : dict
|
||||
Dict with table boundaries as keys and list of intersections
|
||||
in that boundary as their value.
|
||||
|
||||
in that boundary as value.
|
||||
v_segments : list
|
||||
List of vertical line segments.
|
||||
|
||||
h_segments : list
|
||||
List of horizontal line segments.
|
||||
|
||||
factors : tuple
|
||||
Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
|
||||
first two elements are scaling factors and img_y is height of
|
||||
|
|
@ -142,10 +164,9 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
|
|||
Returns
|
||||
-------
|
||||
tables_new : dict
|
||||
|
||||
v_segments_new : dict
|
||||
|
||||
h_segments_new : dict
|
||||
|
||||
"""
|
||||
scaling_factor_x, scaling_factor_y, img_y = factors
|
||||
tables_new = {}
|
||||
|
|
@ -178,54 +199,26 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
|
|||
return tables_new, v_segments_new, h_segments_new
|
||||
|
||||
|
||||
def setup_logging(log_filepath):
    """Configures the "app_logger" logger with a file and a console
    handler.

    Parameters
    ----------
    log_filepath : string
        Path to log file, opened in append mode with utf-8 encoding.

    Returns
    -------
    logger : logging.Logger
        The "app_logger" logger with its level set to DEBUG.

    """
    # Both handlers render records identically, so one formatter is
    # shared between them.
    record_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
        datefmt='%Y-%m-%dT%H:%M:%S')

    app_logger = logging.getLogger("app_logger")
    app_logger.setLevel(logging.DEBUG)

    # File handler: records of level DEBUG and above.
    to_file = logging.FileHandler(log_filepath, mode='a', encoding='utf-8')
    to_file.setLevel(logging.DEBUG)
    to_file.setFormatter(record_format)
    app_logger.addHandler(to_file)

    # Console handler: records of level INFO and above.
    to_console = logging.StreamHandler()
    to_console.setLevel(logging.INFO)
    to_console.setFormatter(record_format)
    app_logger.addHandler(to_console)

    return app_logger
|
||||
|
||||
|
||||
def get_rotation(lttextlh, lttextlv, ltchar):
|
||||
"""Detects if text in table is vertical or not using the current
|
||||
"""Detects if text in table is rotated or not using the current
|
||||
transformation matrix (CTM) and returns its orientation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
lttextlh : list
|
||||
List of PDFMiner LTTextLineHorizontal objects.
|
||||
|
||||
lttextlv : list
|
||||
List of PDFMiner LTTextLineVertical objects.
|
||||
|
||||
ltchar : list
|
||||
List of PDFMiner LTChar objects.
|
||||
|
||||
Returns
|
||||
-------
|
||||
rotation : string
|
||||
{'', 'left', 'right'}
|
||||
'' if text in table is upright, 'left' if rotated 90 degree
|
||||
anti-clockwise and 'right' if rotated 90 degree clockwise.
|
||||
'' if text in table is upright, 'anticlockwise' if
|
||||
rotated 90 degree anticlockwise and 'clockwise' if
|
||||
rotated 90 degree clockwise.
|
||||
|
||||
"""
|
||||
rotation = ''
|
||||
hlen = len([t for t in lttextlh if t.get_text().strip()])
|
||||
|
|
@ -233,23 +226,21 @@ def get_rotation(lttextlh, lttextlv, ltchar):
|
|||
if hlen < vlen:
|
||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
|
||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
|
||||
rotation = 'left' if clockwise < anticlockwise else 'right'
|
||||
rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
|
||||
return rotation
|
||||
|
||||
|
||||
def segments_bbox(bbox, v_segments, h_segments):
|
||||
"""Returns all line segments present inside a
|
||||
table's bounding box.
|
||||
def segments_in_bbox(bbox, v_segments, h_segments):
|
||||
"""Returns all line segments present inside a bounding box.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
bbox : tuple
|
||||
Tuple (x1, y1, x2, y2) representing table bounding box where
|
||||
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space.
|
||||
|
||||
Tuple (x1, y1, x2, y2) representing a bounding box where
|
||||
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
|
||||
space.
|
||||
v_segments : list
|
||||
List of vertical line segments.
|
||||
|
||||
h_segments : list
|
||||
List of vertical horizontal segments.
|
||||
|
||||
|
|
@ -257,9 +248,9 @@ def segments_bbox(bbox, v_segments, h_segments):
|
|||
-------
|
||||
v_s : list
|
||||
List of vertical line segments that lie inside table.
|
||||
|
||||
h_s : list
|
||||
List of horizontal line segments that lie inside table.
|
||||
|
||||
"""
|
||||
lb = (bbox[0], bbox[1])
|
||||
rt = (bbox[2], bbox[3])
|
||||
|
|
@ -271,45 +262,43 @@ def segments_bbox(bbox, v_segments, h_segments):
|
|||
|
||||
|
||||
def text_in_bbox(bbox, text, tol=2):
    """Returns all text objects present inside a bounding box.

    A text object counts as inside when the midpoint of its own
    bounding box lies within bbox enlarged by tol on every side.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
        space.
    text : list
        List of PDFMiner text objects; each must expose x0, x1, y0
        and y1.
    tol : float, optional (default: 2)
        Margin added around bbox before testing midpoints.

    Returns
    -------
    t_bbox : list
        List of PDFMiner text objects that lie inside table, in the
        order they appear in text.

    """
    lb = (bbox[0], bbox[1])
    rt = (bbox[2], bbox[3])
    t_bbox = [t for t in text
              if lb[0] - tol <= (t.x0 + t.x1) / 2.0 <= rt[0] + tol
              and lb[1] - tol <= (t.y0 + t.y1) / 2.0 <= rt[1] + tol]
    return t_bbox
|
||||
|
||||
|
||||
def remove_close_values(ar, mtol=2):
|
||||
"""Removes values which are within a tolerance of mtol of another value
|
||||
present in list.
|
||||
def remove_close_lines(ar, line_close_tol=2):
|
||||
"""Removes lines which are within a tolerance, based on their x or
|
||||
y axis projections.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ar : list
|
||||
|
||||
mtol : int
|
||||
(optional, default: 2)
|
||||
line_close_tol : int, optional (default: 2)
|
||||
|
||||
Returns
|
||||
-------
|
||||
ret : list
|
||||
|
||||
"""
|
||||
ret = []
|
||||
for a in ar:
|
||||
|
|
@ -317,27 +306,26 @@ def remove_close_values(ar, mtol=2):
|
|||
ret.append(a)
|
||||
else:
|
||||
temp = ret[-1]
|
||||
if np.isclose(temp, a, atol=mtol):
|
||||
if np.isclose(temp, a, atol=line_close_tol):
|
||||
pass
|
||||
else:
|
||||
ret.append(a)
|
||||
return ret
|
||||
|
||||
|
||||
def merge_close_values(ar, mtol=2):
|
||||
"""Merges values which are within a tolerance of mtol by calculating
|
||||
a moving mean.
|
||||
def merge_close_lines(ar, line_close_tol=2):
|
||||
"""Merges lines which are within a tolerance by calculating a
|
||||
moving mean, based on their x or y axis projections.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ar : list
|
||||
|
||||
mtol : int
|
||||
(optional, default: 2)
|
||||
line_close_tol : int, optional (default: 2)
|
||||
|
||||
Returns
|
||||
-------
|
||||
ret : list
|
||||
|
||||
"""
|
||||
ret = []
|
||||
for a in ar:
|
||||
|
|
@ -345,7 +333,7 @@ def merge_close_values(ar, mtol=2):
|
|||
ret.append(a)
|
||||
else:
|
||||
temp = ret[-1]
|
||||
if np.isclose(temp, a, atol=mtol):
|
||||
if np.isclose(temp, a, atol=line_close_tol):
|
||||
temp = (temp + a) / 2.0
|
||||
ret[-1] = temp
|
||||
else:
|
||||
|
|
@ -353,22 +341,21 @@ def merge_close_values(ar, mtol=2):
|
|||
return ret
|
||||
|
||||
|
||||
def flag_on_size(textline, direction):
|
||||
"""Flags a super/subscript by enclosing it with <s></s>. May give
|
||||
false positives.
|
||||
def flag_font_size(textline, direction):
|
||||
"""Flags super/subscripts in text by enclosing them with <s></s>.
|
||||
May give false positives.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
textline : list
|
||||
List of PDFMiner LTChar objects.
|
||||
|
||||
direction : string
|
||||
{'horizontal', 'vertical'}
|
||||
Direction of the PDFMiner LTTextLine object.
|
||||
|
||||
Returns
|
||||
-------
|
||||
fstring : string
|
||||
|
||||
"""
|
||||
if direction == 'horizontal':
|
||||
d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
|
||||
|
|
@ -395,33 +382,28 @@ def flag_on_size(textline, direction):
|
|||
return fstring
|
||||
|
||||
|
||||
def split_textline(table, textline, direction, flag_size=True):
|
||||
def split_textline(table, textline, direction, flag_size=False):
|
||||
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
||||
multiple rows/columns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : object
|
||||
camelot.pdf.Pdf
|
||||
|
||||
table : camelot.core.Table
|
||||
textline : object
|
||||
PDFMiner LTTextLine object.
|
||||
|
||||
direction : string
|
||||
{'horizontal', 'vertical'}
|
||||
Direction of the PDFMiner LTTextLine object.
|
||||
|
||||
flag_size : bool
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
(optional, default: True)
|
||||
|
||||
Returns
|
||||
-------
|
||||
grouped_chars : list
|
||||
List of tuples of the form (idx, text) where idx is the index
|
||||
of row/column and text is the an lttextline substring.
|
||||
|
||||
"""
|
||||
idx = 0
|
||||
cut_text = []
|
||||
|
|
@ -466,46 +448,37 @@ def split_textline(table, textline, direction, flag_size=True):
|
|||
grouped_chars = []
|
||||
for key, chars in groupby(cut_text, itemgetter(0, 1)):
|
||||
if flag_size:
|
||||
grouped_chars.append((key[0], key[1], flag_on_size([t[2] for t in chars], direction)))
|
||||
grouped_chars.append((key[0], key[1], flag_font_size([t[2] for t in chars], direction)))
|
||||
else:
|
||||
gchars = [t[2].get_text() for t in chars]
|
||||
grouped_chars.append((key[0], key[1], ''.join(gchars).strip('\n')))
|
||||
return grouped_chars
|
||||
|
||||
|
||||
def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
||||
"""Gets indices of the cell where given text object lies by
|
||||
def get_table_index(table, t, direction, split_text=False, flag_size=False):
|
||||
"""Gets indices of the table cell where given text object lies by
|
||||
comparing their y and x-coordinates.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : object
|
||||
camelot.table.Table
|
||||
|
||||
table : camelot.core.Table
|
||||
t : object
|
||||
PDFMiner LTTextLine object.
|
||||
|
||||
direction : string
|
||||
{'horizontal', 'vertical'}
|
||||
Direction of the PDFMiner LTTextLine object.
|
||||
|
||||
split_text : bool
|
||||
split_text : bool, optional (default: False)
|
||||
Whether or not to split a text line if it spans across
|
||||
multiple cells.
|
||||
(optional, default: False)
|
||||
|
||||
flag_size : bool
|
||||
flag_size : bool, optional (default: False)
|
||||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string, useful for
|
||||
super and subscripts.
|
||||
(optional, default: True)
|
||||
|
||||
Returns
|
||||
-------
|
||||
indices : list
|
||||
List of tuples of the form (idx, text) where idx is the index
|
||||
of row/column and text is the an lttextline substring.
|
||||
|
||||
List of tuples of the form (r_idx, c_idx, text) where r_idx
|
||||
and c_idx are row and column indices.
|
||||
error : float
|
||||
Assignment error, percentage of text area that lies outside
|
||||
a cell.
|
||||
|
|
@ -514,6 +487,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
|||
| [Text bounding box]
|
||||
| |
|
||||
+-------+
|
||||
|
||||
"""
|
||||
r_idx, c_idx = [-1] * 2
|
||||
for r in range(len(table.rows)):
|
||||
|
|
@ -528,7 +502,11 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
|||
else:
|
||||
lt_col_overlap.append(-1)
|
||||
if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
|
||||
logging.warning("Text did not fit any column.")
|
||||
text = t.get_text().strip('\n')
|
||||
text_range = (t.x0, t.x1)
|
||||
col_range = (table.cols[0][0], table.cols[-1][1])
|
||||
logger.info("{} {} does not lie in column range {}".format(
|
||||
text, text_range, col_range))
|
||||
r_idx = r
|
||||
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
||||
break
|
||||
|
|
@ -552,14 +530,14 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
|||
return split_textline(table, t, direction, flag_size=flag_size), error
|
||||
else:
|
||||
if flag_size:
|
||||
return [(r_idx, c_idx, flag_on_size(t._objs, direction))], error
|
||||
return [(r_idx, c_idx, flag_font_size(t._objs, direction))], error
|
||||
else:
|
||||
return [(r_idx, c_idx, t.get_text().strip('\n'))], error
|
||||
|
||||
|
||||
def get_score(error_weights):
|
||||
"""Calculates score based on weights assigned to various parameters,
|
||||
and their error percentages.
|
||||
def compute_accuracy(error_weights):
|
||||
"""Calculates a score based on weights assigned to various
|
||||
parameters and their error percentages.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
@ -571,6 +549,7 @@ def get_score(error_weights):
|
|||
Returns
|
||||
-------
|
||||
score : float
|
||||
|
||||
"""
|
||||
SCORE_VAL = 100
|
||||
try:
|
||||
|
|
@ -586,6 +565,30 @@ def get_score(error_weights):
|
|||
return score
|
||||
|
||||
|
||||
def compute_whitespace(d):
    """Calculates the percentage of empty strings in a
    two-dimensional list.

    Parameters
    ----------
    d : list
        List of rows, each row a list of strings. The cell total is
        taken as number of rows times the length of the first row.

    Returns
    -------
    whitespace : float
        Percentage of cells that are empty after stripping
        whitespace; 0.0 when d holds no cells at all.

    """
    total = len(d) * len(d[0]) if d else 0
    if not total:
        # Nothing to measure; avoids dividing by zero.
        return 0.0
    blank = sum(1 for row in d for cell in row if cell.strip() == '')
    whitespace = 100 * (blank / float(total))
    return whitespace
|
||||
|
||||
|
||||
def remove_empty(d):
|
||||
"""Removes empty rows and columns from a two-dimensional list.
|
||||
|
||||
|
|
@ -596,6 +599,7 @@ def remove_empty(d):
|
|||
Returns
|
||||
-------
|
||||
d : list
|
||||
|
||||
"""
|
||||
for i, row in enumerate(d):
|
||||
if row == [''] * len(row):
|
||||
|
|
@ -606,50 +610,8 @@ def remove_empty(d):
|
|||
return d
|
||||
|
||||
|
||||
def count_empty(d):
    """Measures how much of a two-dimensional list is filled.

    Parameters
    ----------
    d : list
        List of rows, each row a list of strings.

    Returns
    -------
    empty_p : float
        Percentage of cells that are empty after stripping
        whitespace, relative to number of rows times the length of
        the first row; 0.0 when d holds no cells at all.

    r_nempty_cells : list
        Number of non-empty cells in each row.

    c_nempty_cells : list
        Number of non-empty cells in each column. Columns are formed
        with zip, so they stop at the shortest row.

    """
    total = len(d) * len(d[0]) if d else 0
    blank = sum(1 for row in d for cell in row if cell.strip() == '')
    # Guard against dividing by zero for [] and for rows without cells.
    empty_p = 100 * (blank / float(total)) if total else 0.0

    r_nempty_cells = [sum(1 for cell in row if cell.strip() != '')
                      for row in d]
    c_nempty_cells = [sum(1 for cell in col if cell.strip() != '')
                      for col in zip(*d)]
    return empty_p, r_nempty_cells, c_nempty_cells
|
||||
|
||||
|
||||
def encode_list(ar):
|
||||
"""Encodes list of text.
|
||||
def encode_(ar):
|
||||
"""Encodes two-dimensional list into unicode.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
@ -658,52 +620,13 @@ def encode_list(ar):
|
|||
Returns
|
||||
-------
|
||||
ar : list
|
||||
|
||||
"""
|
||||
ar = [[r.encode('utf-8') for r in row] for row in ar]
|
||||
return ar
|
||||
|
||||
|
||||
def get_text_objects(layout, ltype="char", t=None):
    """Recursively parses pdf layout to get a list of
    text objects.

    Parameters
    ----------
    layout : object
        PDFMiner LTPage object.

    ltype : string
        {'char', 'lh', 'lv'}
        Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
        and LTTextLineVertical objects respectively.

    t : list
        List the matches are appended to; a new list is created when
        omitted.

    Returns
    -------
    t : list
        List of PDFMiner text objects.

    Raises
    ------
    ValueError
        If ltype is not one of 'char', 'lh' or 'lv'.
    """
    wanted = {
        "char": LTChar,
        "lh": LTTextLineHorizontal,
        "lv": LTTextLineVertical,
    }
    if ltype not in wanted:
        raise ValueError(
            "ltype must be one of 'char', 'lh', 'lv', got {0!r}".format(ltype))
    LTObject = wanted[ltype]

    if t is None:
        t = []

    # Objects without children have no _objs attribute; they end the
    # recursion. Only that lookup is guarded so that other attribute
    # errors are not hidden.
    children = getattr(layout, '_objs', None)
    if children is None:
        return t

    for obj in children:
        if isinstance(obj, LTObject):
            t.append(obj)
        else:
            t.extend(get_text_objects(obj, ltype=ltype))
    return t
|
||||
|
||||
|
||||
def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||
def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||
detect_vertical=True, all_texts=True):
|
||||
"""Returns a PDFMiner LTPage object and page dimension of a single
|
||||
page pdf. See https://euske.github.io/pdfminer/ to get definitions
|
||||
|
|
@ -711,28 +634,23 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
|||
|
||||
Parameters
|
||||
----------
|
||||
pname : string
|
||||
filename : string
|
||||
Path to pdf file.
|
||||
|
||||
char_margin : float
|
||||
|
||||
line_margin : float
|
||||
|
||||
word_margin : float
|
||||
|
||||
detect_vertical : bool
|
||||
|
||||
all_texts : bool
|
||||
|
||||
Returns
|
||||
-------
|
||||
layout : object
|
||||
PDFMiner LTPage object.
|
||||
|
||||
dim : tuple
|
||||
pdf page dimension of the form (width, height).
|
||||
Dimension of pdf page in the form (width, height).
|
||||
|
||||
"""
|
||||
with open(pname, 'r') as f:
|
||||
with open(filename, 'r') as f:
|
||||
parser = PDFParser(f)
|
||||
document = PDFDocument(parser)
|
||||
if not document.is_extractable:
|
||||
|
|
@ -754,16 +672,56 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
|||
return layout, dim
|
||||
|
||||
|
||||
def get_text_objects(layout, ltype="char", t=None):
    """Recursively parses pdf layout to get a list of
    PDFMiner text objects.

    Parameters
    ----------
    layout : object
        PDFMiner LTPage object.
    ltype : string
        Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
        and LTTextLineVertical objects respectively.
    t : list
        List the matches are appended to; a new list is created when
        omitted.

    Returns
    -------
    t : list
        List of PDFMiner text objects.

    Raises
    ------
    ValueError
        If ltype is not one of 'char', 'lh' or 'lv'.

    """
    wanted = {
        "char": LTChar,
        "lh": LTTextLineHorizontal,
        "lv": LTTextLineVertical,
    }
    if ltype not in wanted:
        raise ValueError(
            "ltype must be one of 'char', 'lh', 'lv', got {0!r}".format(ltype))
    LTObject = wanted[ltype]

    if t is None:
        t = []

    # Objects without children have no _objs attribute; they end the
    # recursion. Only that lookup is guarded so that other attribute
    # errors are not hidden.
    children = getattr(layout, '_objs', None)
    if children is None:
        return t

    for obj in children:
        if isinstance(obj, LTObject):
            t.append(obj)
        else:
            t.extend(get_text_objects(obj, ltype=ltype))
    return t
|
||||
|
||||
|
||||
def merge_tuples(tuples):
|
||||
"""Merges a list of overlapping tuples.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
tuples : list
|
||||
List of tuples where a tuple is a single axis coordinate pair.
|
||||
|
||||
Yields
|
||||
------
|
||||
tuple
|
||||
|
||||
Returns
|
||||
-------
|
||||
merged : list
|
||||
"""
|
||||
merged = list(tuples[0])
|
||||
for s, e in tuples:
|
||||
|
|
|
|||
|
|
@ -1,53 +0,0 @@
|
|||
"""
|
||||
usage: python hough_opencv.py file.png
|
||||
|
||||
finds lines present in an image using opencv's hough transform.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def timeit(func):
    """Decorator that prints how long each call of func takes.

    The wrapper forwards all arguments, returns the wrapped
    function's result unchanged and keeps its name and docstring.
    """
    import functools

    @functools.wraps(func)
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # Single parenthesised argument: prints the same text under
        # Python 2 and Python 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
@timeit
def main():
    """Detects lines in the image given as first command-line
    argument with OpenCV's Hough transform and shows the result.

    Only lines whose angle is 0 or close to pi / 2 are drawn onto the
    image before it is displayed with matplotlib.

    Raises
    ------
    IOError
        If the image cannot be read.
    """
    image = cv2.imread(sys.argv[1])
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        raise IOError("could not read image: {0}".format(sys.argv[1]))
    print("image dimensions -> {0}".format(image.shape))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)

    lines = cv2.HoughLines(edges, 1, np.pi / 180, 200)
    if lines is None:
        # No line reached the accumulator threshold.
        lines = []
    print("found {0} lines".format(len(lines)))
    for line in lines:
        r, theta = line[0]
        # filter horizontal and vertical lines
        if theta == 0 or np.isclose(theta, np.pi / 2):
            x0 = r * np.cos(theta)
            y0 = r * np.sin(theta)
            # Extend the line far in both directions from (x0, y0).
            x1 = int(x0 + 10000 * (-np.sin(theta)))
            y1 = int(y0 + 10000 * (np.cos(theta)))
            x2 = int(x0 - 10000 * (-np.sin(theta)))
            y2 = int(y0 - 10000 * (np.cos(theta)))
            cv2.line(image, (x1, y1), (x2, y2), (0, 0, 255), 5)
    plt.imshow(image)
    plt.show()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Without an image path there is nothing to process: show the
    # usage text from the module docstring instead.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||
|
|
@ -1,75 +0,0 @@
|
|||
"""
|
||||
usage: python hough_skimage.py file.png
|
||||
|
||||
finds lines present in an image using scikit-image's hough transform.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from scipy.misc import imread
|
||||
import matplotlib.pyplot as plt
|
||||
from skimage.transform import hough_line, hough_line_peaks
|
||||
|
||||
|
||||
def timeit(func):
    """Decorator that prints how long each call of func takes.

    The wrapper forwards all arguments, returns the wrapped
    function's result unchanged and keeps its name and docstring.
    """
    import functools

    @functools.wraps(func)
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # Single parenthesised argument: prints the same text under
        # Python 2 and Python 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
@timeit
def main():
    """Detects lines in the image given as first command-line
    argument with scikit-image's Hough transform and shows the image
    with the pixels on those lines masked out.

    Raises
    ------
    IOError
        If the image cannot be read.
    """
    image = cv2.imread(sys.argv[1])
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        raise IOError("could not read image: {0}".format(sys.argv[1]))
    print("image dimensions -> {0}".format(image.shape))
    _, binary = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
    # Collapse the channel axis, then invert: 255 becomes 0 and every
    # other value becomes 255.
    binary = np.min(binary, axis=2)
    binary = np.where(binary == 255, 0, 255)
    rows, cols = binary.shape
    pixel = np.zeros(binary.shape)

    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    ax.imshow(image, cmap=plt.cm.gray)

    # Pixel index ranges do not change between peaks.
    col_idx = np.arange(cols)
    row_idx = np.arange(rows)

    theta_in = np.linspace(0, np.pi / 2, 10)
    h, theta, d = hough_line(binary, theta_in)
    for _, angle, dist in zip(*hough_line_peaks(h, theta, d)):
        a = np.cos(angle)
        b = np.sin(angle)
        x0 = dist * a
        y0 = dist * b
        # Extend the line in both directions from (x0, y0).
        x1 = int(x0 + 1000 * (-b))
        y1 = int(y0 + 1000 * a)
        x2 = int(x0 - 1000 * (-b))
        y2 = int(y0 - 1000 * a)
        ax.plot((x1, x2), (y1, y2), '-r')
        # Rounded value of x * cos(angle) + y * sin(angle) for every
        # pixel; pixels whose value equals the rounded peak distance
        # are counted in pixel.
        R = np.round(np.add((b * row_idx).reshape((rows, 1)),
                            (a * col_idx).reshape((1, cols))))
        pixel += np.isclose(R, np.round(dist))

    # Turn the hit counts into a mask: 0 where pixel is at least 1
    # after clipping, 1 elsewhere.
    pixel = np.clip(pixel, 0, 1)
    pixel = np.where(pixel == 1, 0, 1)
    binary = np.where(binary == 0, 255, 0)
    binary *= pixel.astype(np.int64)
    ax.imshow(binary, cmap=plt.cm.gray)
    ax.axis((0, cols, rows, 0))
    ax.set_title('Detected lines')
    ax.set_axis_off()
    ax.set_adjustable('box-forced')
    plt.show()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) == 1:
|
||||
print __doc__
|
||||
else:
|
||||
main()
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
"""
|
||||
usage: python hough_prob.py file.png
|
||||
|
||||
finds lines present in an image using scikit-image's hough transform.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
from scipy.misc import imread
|
||||
import matplotlib.pyplot as plt
|
||||
from skimage.feature import canny
|
||||
from skimage.transform import probabilistic_hough_line
|
||||
|
||||
|
||||
def timeit(func):
    """Decorate ``func`` so every call prints its wall-clock duration.

    Args:
        func: the callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the function name with the elapsed seconds, and
        returns ``func``'s result unchanged.
    """
    # Imported locally so the decorator does not depend on the script's
    # top-level import block.
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
@timeit
def main():
    """Plot the probabilistic Hough segments found in ``sys.argv[1]``.

    The image is read in mode ``'L'``, passed through ``canny`` and
    ``probabilistic_hough_line``, and every returned segment is plotted on
    top of an all-zero canvas that has the shape of the edge map.
    """
    image = imread(sys.argv[1], mode='L')
    edges = canny(image, 2, 1, 25)
    lines = probabilistic_hough_line(edges, threshold=1000)

    _, ax = plt.subplots(1, 1, figsize=(8, 4), sharex=True, sharey=True)
    # edges * 0 gives a blank background with the same extent as the image.
    ax.imshow(edges * 0)

    for p0, p1 in lines:
        ax.plot((p0[0], p1[0]), (p0[1], p1[1]))

    ax.set_title('Probabilistic Hough')
    ax.set_axis_off()
    ax.set_adjustable('box-forced')
    plt.show()


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print(__doc__)
    else:
        main()
|
||||
|
|
@ -1,114 +0,0 @@
|
|||
"""
|
||||
usage: python morph_transform.py file.png scale={int} invert={bool}
|
||||
|
||||
finds lines present in an image using opencv's morph transform.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def timeit(func):
    """Decorate ``func`` so every call prints its wall-clock duration.

    Args:
        func: the callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the function name with the elapsed seconds, and
        returns ``func``'s result unchanged.
    """
    # Imported locally so the decorator does not depend on the script's
    # top-level import block.
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
def mt(imagename, scale=40, invert=False):
    """Find table regions and their line joints with morphological ops.

    The image is adaptively thresholded, then eroded and dilated with a
    tall and a wide rectangular kernel to isolate vertical and horizontal
    lines. The ten largest external contours of the combined line mask are
    treated as table candidates; candidates with four or fewer joints are
    discarded. Detected lines and joints are plotted over the image.

    Args:
        imagename: path of the image to read with OpenCV.
        scale: divisor applied to the image height / width to size the
            vertical / horizontal structuring elements.
        invert: when true the grayscale image is thresholded as-is,
            otherwise its bitwise inverse is thresholded.

    Returns:
        dict mapping each table bounding box ``(x1, y2, x2, y1)`` to the
        list of ``(x, y)`` joint centres found inside it.

    Raises:
        IOError: if OpenCV cannot read the image file.
    """
    img = cv2.imread(imagename)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: {0}".format(imagename))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if invert:
        threshold = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2)
    else:
        threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2)
    vertical = threshold
    horizontal = threshold

    # Floor division keeps the kernel sizes integral on Python 3 as well.
    verticalsize = vertical.shape[0] // scale
    horizontalsize = horizontal.shape[1] // scale

    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))

    vertical = cv2.erode(vertical, ver, (-1, -1))
    vertical = cv2.dilate(vertical, ver, (-1, -1))

    horizontal = cv2.erode(horizontal, hor, (-1, -1))
    horizontal = cv2.dilate(horizontal, hor, (-1, -1))

    mask = vertical + horizontal
    joints = np.bitwise_and(vertical, horizontal)
    # findContours returns (contours, hierarchy) on OpenCV 2.4 / 4.x but
    # (image, contours, hierarchy) on 3.x; [-2] is the contour list on all.
    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    tables = {}
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        # find number of non-zero values in joints using what boundingRect returns
        roi = joints[y:y+h, x:x+w]
        jc = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2]
        if len(jc) <= 4:  # skip contours with four or fewer joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            # Centre of the joint blob, shifted back to full-image coordinates.
            c1, c2 = x + (2*jx + jw) // 2, y + (2*jy + jh) // 2
            joint_coords.append((c1, c2))
        tables[(x1, y2, x2, y1)] = joint_coords

    vcontours = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for vc in vcontours:
        x, y, w, h = cv2.boundingRect(vc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        plt.plot([(x1 + x2) // 2, (x1 + x2) // 2], [y2, y1])

    hcontours = cv2.findContours(horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for hc in hcontours:
        x, y, w, h = cv2.boundingRect(hc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        plt.plot([x1, x2], [(y1 + y2) // 2, (y1 + y2) // 2])

    x_coord = []
    y_coord = []
    for joint_coords in tables.values():
        for coord in joint_coords:
            x_coord.append(coord[0])
            y_coord.append(coord[1])
    plt.plot(x_coord, y_coord, 'ro')

    plt.imshow(img)
    plt.show()
    return tables
|
||||
|
||||
|
||||
@timeit
def main():
    """Run ``mt`` on ``sys.argv[1]`` and print how many tables it found.

    Optional ``scale=<int>`` and ``invert=<bool>`` arguments are read from
    ``sys.argv[2]`` and ``sys.argv[3]``; missing ones fall back to 40 and
    False.
    """
    try:
        scale = int(sys.argv[2].split('=')[1])
    except IndexError:
        scale = 40
    try:
        invert_text = sys.argv[3].split('=')[1]
    except IndexError:
        invert = False
    else:
        # bool() of any non-empty string is True, so "invert=False" used to
        # switch inversion on; compare the text instead.
        invert = invert_text.strip().lower() in ('true', '1', 'yes')
    t = mt(sys.argv[1], scale=scale, invert=invert)
    # Two spaces reproduce the output of the old two-argument print statement.
    print('tables found:  %d' % len(t))


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print(__doc__)
    else:
        main()
|
||||
|
|
@ -1,167 +0,0 @@
|
|||
"""
|
||||
usage: python plot_geo.py file.pdf
|
||||
python plot_geo.py file.pdf file.png
|
||||
|
||||
prints lines and rectangles present in a pdf file.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.patches as patches
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
from pdfminer.pdfinterp import PDFResourceManager
|
||||
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||
from pdfminer.layout import LAParams, LTLine, LTRect
|
||||
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
||||
|
||||
|
||||
# Segments whose length does not exceed this value are not drawn by plot_lines1.
MIN_LENGTH = 1
# Page size (set by parse_layout) and image size (set by hough_transform).
pdf_x = pdf_y = image_x = image_y = 0
|
||||
|
||||
|
||||
def timeit(func):
    """Decorate ``func`` so every call prints its wall-clock duration.

    Args:
        func: the callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the function name with the elapsed seconds, and
        returns ``func``'s result unchanged.
    """
    # Imported locally so the decorator does not depend on the script's
    # top-level import block.
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
def remove_coords(coords):
    """Drop coordinates that nearly repeat the previously kept one.

    Args:
        coords: iterable of numbers, walked in the given order.

    Returns:
        list containing the first value and every later value that is not
        ``np.isclose`` (``atol=2``, default ``rtol``) to the last kept value.
    """
    merged = []
    for coord in coords:
        # Compare against the last *kept* value, not the previous input.
        if merged and np.isclose(merged[-1], coord, atol=2):
            continue
        merged.append(coord)
    return merged
|
||||
|
||||
|
||||
def parse_layout(pdfname):
    """Collect the vertical and horizontal segments drawn in a PDF.

    Every ``LTLine`` on every page is classified by its end points, and
    each ``LTRect`` contributes its four edges. The module globals
    ``pdf_x`` and ``pdf_y`` are set from ``layout.bbox[2]`` and
    ``layout.bbox[3]`` of each processed page (the last page wins).

    Args:
        pdfname: path of the PDF file to open.

    Returns:
        ``(vertical, horizontal)``: two lists of ``(x0, y0, x1, y1)`` tuples.

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    global pdf_x, pdf_y

    # line is (x0, y0, x1, y1). The two predicates used to be swapped, which
    # put LTLine segments into the wrong list.
    def is_horizontal(line):
        return line[1] == line[3]

    def is_vertical(line):
        return line[0] == line[2]

    vertical, horizontal = [], []
    with open(pdfname, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
            for obj in layout._objs:
                if isinstance(obj, LTLine):
                    line = (obj.x0, obj.y0, obj.x1, obj.y1)
                    # Segments that are neither vertical nor horizontal
                    # are ignored.
                    if is_vertical(line):
                        vertical.append(line)
                    elif is_horizontal(line):
                        horizontal.append(line)
                elif isinstance(obj, LTRect):
                    # Left and right edges, then top and bottom edges.
                    vertical.append((obj.x0, obj.y1, obj.x0, obj.y0))
                    vertical.append((obj.x1, obj.y1, obj.x1, obj.y0))
                    horizontal.append((obj.x0, obj.y1, obj.x1, obj.y1))
                    horizontal.append((obj.x0, obj.y0, obj.x1, obj.y0))
    return vertical, horizontal
|
||||
|
||||
|
||||
def hough_transform(imagename):
    """Return Hough-line positions of an image, scaled to PDF coordinates.

    Sets the module globals ``image_x`` / ``image_y`` from the image shape
    and reads ``pdf_x`` / ``pdf_y`` to scale the pixel positions.

    Args:
        imagename: path of the image to read with OpenCV.

    Returns:
        ``(x, y)``: ``x`` holds the positive scaled x positions in
        ascending order, ``y`` the scaled, vertically flipped y positions
        in descending order; both are thinned by ``remove_coords``.

    Raises:
        IOError: if OpenCV cannot read the image file.
    """
    global pdf_x, pdf_y, image_x, image_y
    img = cv2.imread(imagename)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: {0}".format(imagename))
    image_x, image_y = img.shape[1], img.shape[0]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi/180, 1000)
    if lines is None:
        # HoughLines returns None, not an empty array, when nothing is found.
        lines = []
    x_scale = pdf_x / float(image_x)
    y_scale = pdf_y / float(image_y)
    x = []
    y = []
    for line in lines:
        r, theta = line[0]
        x.append(r * np.cos(theta) * x_scale)
        # Flip vertically: image rows grow downwards, PDF y grows upwards.
        y.append(abs(r * np.sin(theta) - image_y) * y_scale)
    x = remove_coords(sorted(set([x0 for x0 in x if x0 > 0])))
    y = remove_coords(sorted(set(y), reverse=True))
    return x, y
|
||||
|
||||
|
||||
def plot_lines1(vertical, horizontal):
    """Plot every segment longer than ``MIN_LENGTH`` on a 1000 x 1000 canvas.

    Args:
        vertical: ``(x0, y0, x1, y1)`` tuples, filtered by their y extent.
        horizontal: ``(x0, y0, x1, y1)`` tuples, filtered by their x extent.
    """
    figure = plt.figure()
    ax = figure.add_subplot(111, aspect='equal')
    ax.set_xlim(0, 1000)
    ax.set_ylim(0, 1000)

    long_vertical = [seg for seg in vertical
                     if abs(seg[1] - seg[3]) > MIN_LENGTH]
    long_horizontal = [seg for seg in horizontal
                       if abs(seg[0] - seg[2]) > MIN_LENGTH]
    # Vertical segments first, then horizontal ones, as before.
    for seg in long_vertical + long_horizontal:
        ax.plot([seg[0], seg[2]], [seg[1], seg[3]])
    plt.show()
|
||||
|
||||
|
||||
def plot_lines2(imagename, vertical, horizontal):
    """Plot only the PDF segments that match a Hough line of the image.

    Args:
        imagename: image path handed to ``hough_transform``.
        vertical: ``(x0, y0, x1, y1)`` tuples matched on their first x value.
        horizontal: ``(x0, y0, x1, y1)`` tuples matched on their first y value.
    """
    hough_x, hough_y = hough_transform(imagename)
    figure = plt.figure()
    ax = figure.add_subplot(111, aspect='equal')
    ax.set_xlim(0, 1000)
    ax.set_ylim(0, 1000)

    for x0 in hough_x:
        for seg in vertical:
            if not np.isclose(x0, seg[0], atol=2):
                continue
            ax.plot([seg[0], seg[2]], [seg[1], seg[3]])
    for y0 in hough_y:
        for seg in horizontal:
            if not np.isclose(y0, seg[1], atol=2):
                continue
            ax.plot([seg[0], seg[2]], [seg[1], seg[3]])
    plt.show()
|
||||
|
||||
|
||||
@timeit
def main():
    """Plot the segments of the PDF named by ``sys.argv[1]``.

    With one argument only ``plot_lines1`` runs. With a second argument (an
    image path) ``plot_lines2`` runs afterwards. Any other argument count
    parses the PDF but plots nothing.
    """
    vertical, horizontal = parse_layout(sys.argv[1])
    argc = len(sys.argv)
    if argc in (2, 3):
        plot_lines1(vertical, horizontal)
    if argc == 3:
        plot_lines2(sys.argv[2], vertical, horizontal)


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print(__doc__)
    else:
        main()
|
||||
|
|
@ -1,69 +0,0 @@
|
|||
"""
|
||||
usage: python plot_intensity.py file.png threshold
|
||||
|
||||
plots sum of pixel intensities on both axes for an image.
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
from itertools import groupby
|
||||
from operator import itemgetter
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from pylab import barh
|
||||
|
||||
|
||||
def timeit(func):
    """Decorate ``func`` so every call prints its wall-clock duration.

    Args:
        func: the callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the function name with the elapsed seconds, and
        returns ``func``'s result unchanged.
    """
    # Imported locally so the decorator does not depend on the script's
    # top-level import block.
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
def plot_barchart(ar):
    """Show ``ar`` as a red bar chart, one bar per element.

    Args:
        ar: sequence of bar heights.
    """
    positions = np.arange(len(ar))
    bar_width = 0.35
    plt.bar(positions, ar, bar_width, color='r', zorder=1)
    plt.show()
|
||||
|
||||
|
||||
def merge_lines(lines):
    """Replace each run of consecutive integers by its midpoint.

    Args:
        lines: ascending sequence of integer row indices.

    Returns:
        list with one value per run: the floor of the mean of the run's
        first and last element.
    """
    merged = []
    # Within a run of consecutive integers, index - value is constant, so
    # grouping on it splits the input into runs. The explicit subscripts
    # replace a Python-2-only tuple-parameter lambda.
    for _, run in groupby(enumerate(lines), lambda pair: pair[0] - pair[1]):
        members = [value for _, value in run]
        # Floor division matches the old integer "/" on Python 2.
        merged.append((members[0] + members[-1]) // 2)
    return merged
|
||||
|
||||
|
||||
def plot_lines(image, lines):
    """Draw a full-width horizontal line at every y in ``lines`` over ``image``.

    Args:
        image: array passed to ``plt.imshow``; ``image.shape[1]`` is used
            as the right end of every line.
        lines: iterable of y positions.
    """
    for row in lines:
        x_span = [0, image.shape[1]]
        plt.plot(x_span, [row, row])
    plt.imshow(image)
    plt.show()
|
||||
|
||||
|
||||
@timeit
def main():
    """Mark the image rows whose summed intensity is below a threshold.

    ``sys.argv[1]`` is the image path and ``sys.argv[2]`` the integer
    threshold. The inverted grayscale image is adaptively thresholded and
    summed per row; rows below the threshold are merged into runs and
    plotted over the image.

    Raises:
        IOError: if OpenCV cannot read the image file.
    """
    image = cv2.imread(sys.argv[1])
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: {0}".format(sys.argv[1]))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2)
    y_proj = np.sum(threshold, axis=1)
    line_threshold = int(sys.argv[2])
    lines = np.where(y_proj < line_threshold)[0]
    lines = merge_lines(lines)
    plot_lines(image, lines)


if __name__ == '__main__':
    # Both the image path and the threshold are required; show the usage
    # text instead of failing with IndexError when one is missing.
    if len(sys.argv) < 3:
        print(__doc__)
    else:
        main()
|
||||
|
|
@ -1,83 +0,0 @@
|
|||
"""
|
||||
usage: python print_text.py file.pdf
|
||||
|
||||
prints horizontal and vertical text lines present in a pdf file.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pprint import pprint
|
||||
|
||||
from pdfminer.layout import LAParams
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||
from pdfminer.pdfinterp import PDFResourceManager
|
||||
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
||||
from pdfminer.layout import (LAParams, LTChar, LTAnno, LTTextBoxHorizontal,
|
||||
LTTextLineHorizontal, LTTextLineVertical, LTLine)
|
||||
|
||||
|
||||
def timeit(func):
    """Decorate ``func`` so every call prints its wall-clock duration.

    Args:
        func: the callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the function name with the elapsed seconds, and
        returns ``func``'s result unchanged.
    """
    # Imported locally so the decorator does not depend on the script's
    # top-level import block.
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
def extract_text_objects(layout, LTObject, t=None):
    """Recursively collect the instances of ``LTObject`` below ``layout``.

    Args:
        layout: object whose ``_objs`` attribute lists its children;
            objects without that attribute contribute nothing.
        LTObject: class (or tuple of classes) to collect. Matching objects
            are not searched further.
        t: optional list to append to; a new list is created when omitted.

    Returns:
        ``t`` (or the new list) with the matches in depth-first order.
    """
    if t is None:
        t = []
    # Only the attribute lookup may legitimately fail, so it alone is
    # guarded instead of wrapping the whole traversal in try/except.
    children = getattr(layout, '_objs', None)
    if children is None:
        return t
    for obj in children:
        if isinstance(obj, LTObject):
            t.append(obj)
        else:
            t += extract_text_objects(obj, LTObject)
    return t
|
||||
|
||||
|
||||
@timeit
def main():
    """Print the horizontal and vertical text lines of ``sys.argv[1]``.

    For every page the number of ``LTTextLineHorizontal`` and
    ``LTTextLineVertical`` objects is printed, followed by their text.

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    with open(sys.argv[1], 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # NOTE(review): the original comment here read "2.0, 0.5, 0.1";
        # presumably pdfminer's default char/line/word margins - confirm.
        kwargs = {
            'char_margin': 1.0,
            'line_margin': 0.5,
            'word_margin': 0.1,
            'detect_vertical': True
        }
        laparams = LAParams(**kwargs)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            lh = extract_text_objects(layout, LTTextLineHorizontal)
            lv = extract_text_objects(layout, LTTextLineVertical)
            print("number of horizontal text lines -> {0}".format(len(lh)))
            print("horizontal text lines ->")
            pprint([t.get_text() for t in lh])
            print("number of vertical text lines -> {0}".format(len(lv)))
            print("vertical text lines ->")
            pprint([t.get_text() for t in lv])


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print(__doc__)
    else:
        main()
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
"""
|
||||
usage: python threshold.py file.png blocksize threshold_constant
|
||||
|
||||
shows thresholded image.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def timeit(func):
    """Decorate ``func`` so every call prints its wall-clock duration.

    Args:
        func: the callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the function name with the elapsed seconds, and
        returns ``func``'s result unchanged.
    """
    # Imported locally so the decorator does not depend on the script's
    # top-level import block.
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
@timeit
def main():
    """Show the adaptively thresholded version of ``sys.argv[1]``.

    ``sys.argv[2]`` is the integer block size and ``sys.argv[3]`` the float
    constant passed to ``cv2.adaptiveThreshold``.

    Raises:
        IOError: if OpenCV cannot read the image file.
    """
    img = cv2.imread(sys.argv[1])
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: {0}".format(sys.argv[1]))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blocksize = int(sys.argv[2])
    threshold_constant = float(sys.argv[3])
    threshold = cv2.adaptiveThreshold(np.invert(gray), 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, threshold_constant)
    # Display the thresholded result; the unmodified input image used to be
    # shown here, which left `threshold` unused.
    plt.imshow(threshold, cmap='gray')
    plt.show()


if __name__ == '__main__':
    # The image path, block size and constant are all required.
    if len(sys.argv) < 4:
        print(__doc__)
    else:
        main()
|
||||
38
docs/api.rst
38
docs/api.rst
|
|
@ -4,17 +4,37 @@
|
|||
API Reference
|
||||
=============
|
||||
|
||||
Pdf
|
||||
===
|
||||
.. automodule:: camelot.pdf
|
||||
camelot.read_pdf
|
||||
================
|
||||
.. autofunction:: camelot.read_pdf
|
||||
|
||||
Lattice
|
||||
=======
|
||||
.. automodule:: camelot.lattice
|
||||
camelot.handlers.PDFHandler
|
||||
===========================
|
||||
.. autoclass:: camelot.handlers.PDFHandler
|
||||
:members:
|
||||
|
||||
Stream
|
||||
======
|
||||
.. automodule:: camelot.stream
|
||||
camelot.parsers.Stream
|
||||
======================
|
||||
.. autoclass:: camelot.parsers.Stream
|
||||
:members:
|
||||
|
||||
camelot.parsers.Lattice
|
||||
=======================
|
||||
.. autoclass:: camelot.parsers.Lattice
|
||||
:members:
|
||||
|
||||
camelot.core.Cell
|
||||
=================
|
||||
.. autoclass:: camelot.core.Cell
|
||||
:members:
|
||||
|
||||
camelot.core.Table
|
||||
==================
|
||||
.. autoclass:: camelot.core.Table
|
||||
:members:
|
||||
|
||||
camelot.core.TableList
|
||||
======================
|
||||
.. autoclass:: camelot.core.TableList
|
||||
:members:
|
||||
100
docs/index.rst
100
docs/index.rst
|
|
@ -3,11 +3,11 @@
|
|||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
==================================
|
||||
Camelot: pdf parsing made simpler!
|
||||
==================================
|
||||
=====================================
|
||||
Camelot: PDF Table Parsing for Humans
|
||||
=====================================
|
||||
|
||||
Camelot is a Python 2.7 library and command-line tool for getting tables out of pdf files.
|
||||
Camelot is a Python 2.7 library and command-line tool for extracting tabular data from PDF files.
|
||||
|
||||
Why another pdf table parsing library?
|
||||
======================================
|
||||
|
|
@ -32,12 +32,22 @@ Usage
|
|||
|
||||
::
|
||||
|
||||
>>> from camelot.pdf import Pdf
|
||||
>>> from camelot.lattice import Lattice
|
||||
|
||||
>>> manager = Pdf(Lattice(), 'us-030.pdf')
|
||||
>>> tables = manager.extract()
|
||||
>>> print tables['page-1']['table-1']['data']
|
||||
>>> import camelot
|
||||
>>> tables = camelot.read_pdf("foo.pdf")
|
||||
>>> tables
|
||||
<TableList n=2>
|
||||
>>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html
|
||||
>>> tables[0]
|
||||
<Table shape=(3,4)>
|
||||
>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
|
||||
>>> tables[0].parsing_report
|
||||
{
|
||||
"accuracy": 96,
|
||||
"whitespace": 80,
|
||||
"order": 1,
|
||||
"page": 1
|
||||
}
|
||||
>>> df = tables[0].df
|
||||
|
||||
.. csv-table::
|
||||
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
|
||||
|
|
@ -49,45 +59,6 @@ Usage
|
|||
"2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"
|
||||
"4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"
|
||||
|
||||
Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
|
||||
|
||||
::
|
||||
|
||||
Camelot: PDF parsing made simpler!
|
||||
|
||||
usage:
|
||||
camelot [options] <method> [<args>...]
|
||||
|
||||
options:
|
||||
-h, --help Show this screen.
|
||||
-v, --version Show version.
|
||||
-V, --verbose Verbose.
|
||||
-p, --pages <pageno> Comma-separated list of page numbers.
|
||||
Example: -p 1,3-6,10 [default: 1]
|
||||
-P, --parallel Parallelize the parsing process.
|
||||
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
|
||||
-l, --log Log to file.
|
||||
-o, --output <directory> Output directory.
|
||||
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
|
||||
grouped together to form a word. [default: 1.0]
|
||||
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
|
||||
grouped together to form a textbox. [default: 0.5]
|
||||
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
||||
if distance between words is greater than word
|
||||
margin. [default: 0.1]
|
||||
-J, --split_text Split text lines if they span across multiple cells.
|
||||
-K, --flag_size Flag substring if its size differs from the whole string.
|
||||
Useful for super and subscripts.
|
||||
-X, --print-stats List stats on the parsing process.
|
||||
-Y, --save-stats Save stats to a file.
|
||||
-Z, --plot <dist> Plot distributions. (page,all,rc)
|
||||
|
||||
camelot methods:
|
||||
lattice Looks for lines between data.
|
||||
stream Looks for spaces between data.
|
||||
|
||||
See 'camelot <method> -h' for more information on a specific method.
|
||||
|
||||
Installation
|
||||
============
|
||||
|
||||
|
|
@ -95,42 +66,41 @@ Make sure you have the most updated versions for `pip` and `setuptools`. You can
|
|||
|
||||
pip install -U pip setuptools
|
||||
|
||||
The required dependencies include `numpy`_, `OpenCV`_ and `ImageMagick`_.
|
||||
The dependencies include `tk`_ and `ghostscript`_.
|
||||
|
||||
.. _numpy: http://www.numpy.org/
|
||||
.. _OpenCV: http://opencv.org/
|
||||
.. _ImageMagick: http://www.imagemagick.org/script/index.php
|
||||
.. _tk: https://wiki.tcl.tk/3743
|
||||
.. _ghostscript: https://www.ghostscript.com/
|
||||
|
||||
Installing dependencies
|
||||
-----------------------
|
||||
|
||||
numpy can be installed using `pip`. OpenCV and imagemagick can be installed using your system's default package manager.
|
||||
tk and ghostscript can be installed using your system's default package manager.
|
||||
|
||||
Linux
|
||||
^^^^^
|
||||
|
||||
* Arch Linux
|
||||
|
||||
::
|
||||
|
||||
sudo pacman -S opencv imagemagick
|
||||
|
||||
* Ubuntu
|
||||
|
||||
::
|
||||
|
||||
sudo apt-get install libopencv-dev python-opencv imagemagick
|
||||
sudo apt-get install python-opencv python-tk ghostscript
|
||||
|
||||
* Arch Linux
|
||||
|
||||
::
|
||||
|
||||
sudo pacman -S opencv tk ghostscript
|
||||
|
||||
OS X
|
||||
^^^^
|
||||
|
||||
::
|
||||
|
||||
brew install homebrew/science/opencv imagemagick
|
||||
brew install homebrew/science/opencv ghostscript
|
||||
|
||||
Finally, `cd` into the project directory and install by::
|
||||
|
||||
make install
|
||||
python setup.py install
|
||||
|
||||
API Reference
|
||||
=============
|
||||
|
|
@ -150,14 +120,14 @@ You can check the latest sources with the command::
|
|||
Contributing
|
||||
------------
|
||||
|
||||
See :doc:`Contributing doc <contributing>`.
|
||||
See :doc:`Contributing guidelines <contributing>`.
|
||||
|
||||
Testing
|
||||
-------
|
||||
|
||||
::
|
||||
|
||||
make test
|
||||
python setup.py test
|
||||
|
||||
License
|
||||
=======
|
||||
|
|
|
|||
|
|
@ -1,11 +0,0 @@
|
|||
from camelot import Pdf
from camelot import Lattice


# Extract the tables of the two column-span sample PDFs.
extractor = Lattice(Pdf("files/column_span_1.pdf", clean=True), scale=30)
tables = extractor.get_tables()
print(tables)

# NOTE(review): clean=True used to be passed to Lattice here; every other
# example passes it to Pdf, so it was presumably misplaced - confirm.
extractor = Lattice(Pdf("files/column_span_2.pdf", clean=True), scale=30)
tables = extractor.get_tables()
print(tables)
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
from camelot import Pdf
from camelot import Lattice


# Extract the tables of the two row-span sample PDFs.
extractor = Lattice(
    Pdf("files/row_span_1.pdf", clean=True), fill='v', scale=40)
tables = extractor.get_tables()
print(tables)

extractor = Lattice(
    Pdf("files/row_span_2.pdf", clean=True), fill='v', scale=30)
tables = extractor.get_tables()
print(tables)
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
from camelot import Pdf
from camelot import Lattice


# Extract the tables of the two lines-in-background sample PDFs.
extractor = Lattice(Pdf("files/lines_in_background_1.pdf",
                        clean=True), scale=30, invert=True)
tables = extractor.get_tables()
print(tables)

extractor = Lattice(Pdf("files/lines_in_background_2.pdf",
                        clean=True), scale=30, invert=True)
tables = extractor.get_tables()
print(tables)
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
from camelot import Pdf
from camelot import Lattice


# Extract the tables of the left- and right-rotated sample PDFs.
extractor = Lattice(Pdf("files/left_rotated_table.pdf", clean=True), scale=30)
tables = extractor.get_tables()
print(tables)

extractor = Lattice(Pdf("files/right_rotated_table.pdf", clean=True), scale=30)
tables = extractor.get_tables()
print(tables)
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
from camelot import Pdf
from camelot import Lattice


# Extract the tables of the two "twotables" sample PDFs.
extractor = Lattice(Pdf("files/twotables_1.pdf", clean=True), scale=40)
tables = extractor.get_tables()
print(tables)

extractor = Lattice(Pdf("files/twotables_2.pdf", clean=True), scale=30)
tables = extractor.get_tables()
print(tables)
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
from camelot import Pdf
from camelot import Stream


# Extract the tables of the budget sample PDF with the Stream method.
extractor = Stream(Pdf("files/budget_2014-15.pdf",
                       char_margin=1.0, clean=True))
tables = extractor.get_tables()
print(tables)
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
from camelot import Pdf
from camelot import Stream


# Extract tables with Stream, passing explicit column positions and a
# per-file ytol value.
extractor = Stream(Pdf("files/inconsistent_rows.pdf", char_margin=1.0),
                   columns="65,95,285,640,715,780", ytol=10)
tables = extractor.get_tables()
print(tables)

extractor = Stream(Pdf("files/consistent_rows.pdf", char_margin=1.0),
                   columns="28,67,180,230,425,475,700", ytol=5)
tables = extractor.get_tables()
print(tables)
|
||||
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,11 @@
|
|||
click==6.7
|
||||
matplotlib==2.2.3
|
||||
numpy==1.13.3
|
||||
opencv-python==3.4.2.17
|
||||
pandas==0.23.4
|
||||
pdfminer==20140328
|
||||
Pillow==5.2.0
|
||||
PyPDF2==1.26.0
|
||||
pytest==3.8.0
|
||||
pytest-runner==4.2
|
||||
Sphinx==1.8.0b1
|
||||
|
|
@ -1,9 +1,8 @@
|
|||
docopt
|
||||
matplotlib
|
||||
nose
|
||||
pdfminer
|
||||
pyexcel-xlsx
|
||||
Pillow
|
||||
pyocr
|
||||
PyPDF2
|
||||
Sphinx
|
||||
click==6.7
|
||||
matplotlib==2.2.3
|
||||
numpy==1.13.3
|
||||
opencv-python==3.4.2.17
|
||||
pandas==0.23.4
|
||||
pdfminer==20140328
|
||||
Pillow==5.2.0
|
||||
PyPDF2==1.26.0
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
[aliases]
|
||||
test=pytest
|
||||
|
||||
[tool:pytest]
|
||||
addopts = --verbose
|
||||
python_files = tests/test_*.py
|
||||
20
setup.py
20
setup.py
|
|
@ -4,12 +4,12 @@ import camelot
|
|||
|
||||
NAME = 'camelot'
|
||||
VERSION = camelot.__version__
|
||||
DESCRIPTION = 'camelot parses tables from PDFs!'
|
||||
DESCRIPTION = 'PDF Table Parsing for Humans'
|
||||
with open('README.md') as f:
|
||||
LONG_DESCRIPTION = f.read()
|
||||
URL = 'https://github.com/socialcopsdev/camelot'
|
||||
AUTHOR = 'Vinayak Mehta'
|
||||
AUTHOR_EMAIL = 'vinayak@socialcops.com'
|
||||
AUTHOR_EMAIL = 'vmehta94@gmail.com'
|
||||
LICENSE = 'BSD License'
|
||||
|
||||
opencv_min_version = '2.4.8'
|
||||
|
|
@ -48,10 +48,8 @@ def setup_package():
|
|||
author=AUTHOR,
|
||||
author_email=AUTHOR_EMAIL,
|
||||
license=LICENSE,
|
||||
keywords='parse scrape pdf table',
|
||||
packages=['camelot'],
|
||||
install_requires=reqs,
|
||||
scripts=['tools/camelot'])
|
||||
install_requires=reqs)
|
||||
|
||||
try:
|
||||
from setuptools import setup
|
||||
|
|
@ -60,18 +58,14 @@ def setup_package():
|
|||
|
||||
opencv_status = get_opencv_status()
|
||||
opencv_req_str = "camelot requires OpenCV >= {0}.\n".format(opencv_min_version)
|
||||
instructions = ("Installation instructions are available in the README at "
|
||||
"https://github.com/socialcopsdev/camelot")
|
||||
|
||||
if opencv_status['up_to_date'] is False:
|
||||
if opencv_status['version']:
|
||||
raise ImportError("Your installation of OpenCV "
|
||||
"{0} is out-of-date.\n{1}{2}"
|
||||
.format(opencv_status['version'],
|
||||
opencv_req_str, instructions))
|
||||
raise ImportError("Your installation of OpenCV {} is out-of-date.\n{}"
|
||||
.format(opencv_status['version'], opencv_req_str))
|
||||
else:
|
||||
raise ImportError("OpenCV is not installed.\n{0}{1}"
|
||||
.format(opencv_req_str, instructions))
|
||||
raise ImportError("OpenCV is not installed.\n{}"
|
||||
.format(opencv_req_str))
|
||||
|
||||
setup(**metadata)
|
||||
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue