diff --git a/.gitignore b/.gitignore
index 14fc340..4fd453c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,7 @@ build/
dist/
*.egg-info/
.coverage
+
+.pytest_cache/
+_build/
+_static/
diff --git a/Makefile b/Makefile
deleted file mode 100644
index f4183af..0000000
--- a/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-PYTHON ?= python
-NOSETESTS ?= nosetests
-
-help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  clean"
-	@echo "  dev to install in develop mode"
-	@echo "  undev to uninstall develop mode"
-	@echo "  install to install for all users"
-	@echo "  test to run tests"
-	@echo "  test-coverage to run tests with coverage report"
-
-clean:
-	$(PYTHON) setup.py clean
-	rm -rf dist
-
-dev:
-	$(PYTHON) setup.py develop
-
-undev:
-	$(PYTHON) setup.py develop --uninstall
-
-install:
-	$(PYTHON) setup.py install
-
-test:
-	$(NOSETESTS) -s -v
-
-test-coverage:
-	rm -rf coverage .coverage
-	$(NOSETESTS) -s -v --with-coverage
diff --git a/README.md b/README.md
index 5cb4fc0..372cc09 100644
--- a/README.md
+++ b/README.md
@@ -1,67 +1,31 @@
-# camelot
+# Camelot: PDF Table Parsing for Humans

-Camelot is a Python 2.7 library and command-line tool for getting tables out of PDF files.
+Camelot is a Python 2.7 library and command-line tool for extracting tabular data from PDF files.

## Usage
-from camelot.pdf import Pdf
-from camelot.lattice import Lattice
-
-manager = Pdf(Lattice(), "/path/to/pdf")
-tables = manager.extract()
-
- -Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF. - -
-Camelot: PDF parsing made simpler!
-
-usage:
- camelot [options] <method> [<args>...]
-
-options:
- -h, --help                Show this screen.
- -v, --version             Show version.
- -V, --verbose             Verbose.
- -p, --pages <pageno>      Comma-separated list of page numbers.
-                           Example: -p 1,3-6,10  [default: 1]
- -P, --parallel            Parallelize the parsing process.
- -f, --format <format>     Output format. (csv,tsv,html,json,xlsx) [default: csv]
- -l, --log                 Log to file.
- -o, --output <directory>  Output directory.
- -M, --cmargin <cmargin>   Char margin. Chars closer than cmargin are
-                           grouped together to form a word. [default: 2.0]
- -L, --lmargin <lmargin>   Line margin. Lines closer than lmargin are
-                           grouped together to form a textbox. [default: 0.5]
- -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
-                           if distance between words is greater than word
-                           margin. [default: 0.1]
- -J, --split_text          Split text lines if they span across multiple cells.
- -K, --flag_size           Flag substring if its size differs from the whole string.
-                           Useful for super and subscripts.
- -X, --print-stats         List stats on the parsing process.
- -Y, --save-stats          Save stats to a file.
- -Z, --plot <dist>         Plot distributions. (page,all,rc)
-
-camelot methods:
- lattice  Looks for lines between data.
- stream   Looks for spaces between data.
- ocrl     Lattice, but for images.
- ocrs     Stream, but for images.
-
-See 'camelot <method> -h' for more information on a specific method.
+>>> import camelot
+>>> tables = camelot.read_pdf("foo.pdf")
+>>> tables
+<TableList tables=2>
+>>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html
+>>> tables[0]
+<Table shape=(3,4)>
+>>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
+>>> tables[0].parsing_report
+{
+    "accuracy": 96,
+    "whitespace": 80,
+    "order": 1,
+    "page": 1
+}
+>>> df = tables[0].df
 
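Stream is the default parser. Going by the `read_pdf` signature introduced in `camelot/io.py` below (`read_pdf(filepath, pages='1', mesh=False, **kwargs)`), switching to the line-based Lattice parser and selecting pages would presumably look like this; a sketch against this diff's API, not captured output:

>>> tables = camelot.read_pdf("foo.pdf", pages="1,3-4,10-end", mesh=True)  # mesh=True selects Lattice
>>> tables.n  # number of parsed tables, via the TableList.n property

`pages` accepts comma-separated page numbers and ranges, with `end` standing in for the last page, as handled by `PDFHandler._get_pages`.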
## Dependencies

-Currently, camelot works under Python 2.7.
-
-The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ImageMagick](http://www.imagemagick.org/script/index.php).
-
-### Optional
-
-You'll need to install [Tesseract](https://github.com/tesseract-ocr/tesseract) if you want to extract tables from image based pdfs. Also, you'll need a tesseract language pack if your pdf isn't in english.
+The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/).

## Installation

@@ -73,32 +37,32 @@ pip install -U pip setuptools

### Installing dependencies

-numpy can be install using `pip`. OpenCV and imagemagick can be installed using your system's default package manager.
+tk and ghostscript can be installed using your system's default package manager.

#### Linux

-* Arch Linux
-
-sudo pacman -S opencv imagemagick
-
- * Ubuntu
-sudo apt-get install libopencv-dev python-opencv imagemagick
+sudo apt-get install python-opencv python-tk ghostscript
+
+ +* Arch Linux + +
+sudo pacman -S opencv tk ghostscript
 
#### OS X
-brew install homebrew/science/opencv imagemagick
+brew install homebrew/science/opencv ghostscript
 
Finally, `cd` into the project directory and install by running:
-make install
+python setup.py install
 
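Since the diff adds `camelot/__version__.py` and re-exports `__version__` from the package root, a quick post-install sanity check might be (assuming the dependencies above are in place):

python -c "import camelot; print(camelot.__version__)"

This should print `0.1.0`, the version pinned in this diff.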
## Development

@@ -113,14 +77,14 @@ git clone https://github.com/socialcopsdev/camelot.git

### Contributing

-See [Contributing doc]().
+See [Contributing guidelines]().

### Testing
-make test
+python setup.py test
 
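The `.pytest_cache/` entry added to `.gitignore` suggests the suite runs on pytest under the hood; if so, the tests can presumably also be invoked directly:

pip install pytest
pytest -v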
## License

-BSD License
+BSD License
\ No newline at end of file
diff --git a/camelot/__init__.py b/camelot/__init__.py
index 55aee51..6e416e4 100644
--- a/camelot/__init__.py
+++ b/camelot/__init__.py
@@ -1,3 +1,4 @@
-__version__ = '1.2.0'
+from .__version__ import __version__

-__all__ = ['pdf', 'lattice', 'stream', 'ocr']
+from .io import read_pdf
+from .plotting import plot_geometry
\ No newline at end of file
diff --git a/camelot/__version__.py b/camelot/__version__.py
new file mode 100644
index 0000000..b794fd4
--- /dev/null
+++ b/camelot/__version__.py
@@ -0,0 +1 @@
+__version__ = '0.1.0'
diff --git a/camelot/cell.py b/camelot/cell.py
deleted file mode 100644
index 8dfe8d3..0000000
--- a/camelot/cell.py
+++ /dev/null
@@ -1,128 +0,0 @@
-class Cell:
-    """Cell.
-    Defines a cell object with coordinates relative to a left-bottom
-    origin, which is also PDFMiner's coordinate space.
-
-    Parameters
-    ----------
-    x1 : float
-        x-coordinate of left-bottom point.
-
-    y1 : float
-        y-coordinate of left-bottom point.
-
-    x2 : float
-        x-coordinate of right-top point.
-
-    y2 : float
-        y-coordinate of right-top point.
-
-    Attributes
-    ----------
-    lb : tuple
-        Tuple representing left-bottom coordinates.
-
-    lt : tuple
-        Tuple representing left-top coordinates.
-
-    rb : tuple
-        Tuple representing right-bottom coordinates.
-
-    rt : tuple
-        Tuple representing right-top coordinates.
-
-    bbox : tuple
-        Tuple representing the cell's bounding box using the
-        lower-bottom and right-top coordinates.
-
-    left : bool
-        Whether or not cell is bounded on the left.
-
-    right : bool
-        Whether or not cell is bounded on the right.
-
-    top : bool
-        Whether or not cell is bounded on the top.
-
-    bottom : bool
-        Whether or not cell is bounded on the bottom.
-
-    text_objects : list
-        List of text objects assigned to cell.
-
-    text : string
-        Text assigned to cell.
-
-    spanning_h : bool
-        Whether or not cell spans/extends horizontally.
-
-    spanning_v : bool
-        Whether or not cell spans/extends vertically.
-    """
-
-    def __init__(self, x1, y1, x2, y2):
-
-        self.x1 = x1
-        self.y1 = y1
-        self.x2 = x2
-        self.y2 = y2
-        self.lb = (x1, y1)
-        self.lt = (x1, y2)
-        self.rb = (x2, y1)
-        self.rt = (x2, y2)
-        self.bbox = (x1, y1, x2, y2)
-        self.left = False
-        self.right = False
-        self.top = False
-        self.bottom = False
-        self.text_objects = []
-        self.text = ''
-        self.spanning_h = False
-        self.spanning_v = False
-        self.image = None
-
-    def add_text(self, text):
-        """Adds text to cell.
-
-        Parameters
-        ----------
-        text : string
-        """
-        self.text = ''.join([self.text, text])
-
-    def get_text(self):
-        """Returns text assigned to cell.
-
-        Returns
-        -------
-        text : string
-        """
-        return self.text
-
-    def add_object(self, t_object):
-        """Adds PDFMiner text object to cell.
-
-        Parameters
-        ----------
-        t_object : object
-        """
-        self.text_objects.append(t_object)
-
-    def get_objects(self):
-        """Returns list of text objects assigned to cell.
-
-        Returns
-        -------
-        text_objects : list
-        """
-        return self.text_objects
-
-    def get_bounded_edges(self):
-        """Returns the number of edges by which a cell is bounded.
-
-        Returns
-        -------
-        bounded_edges : int
-        """
-        self.bounded_edges = self.top + self.bottom + self.left + self.right
-        return self.bounded_edges
diff --git a/camelot/cli.py b/camelot/cli.py
new file mode 100644
index 0000000..302830e
--- /dev/null
+++ b/camelot/cli.py
@@ -0,0 +1 @@
+import click
\ No newline at end of file
diff --git a/camelot/core.py b/camelot/core.py
new file mode 100644
index 0000000..f400fe8
--- /dev/null
+++ b/camelot/core.py
@@ -0,0 +1,491 @@
+import os
+import json
+import zipfile
+import tempfile
+
+import numpy as np
+import pandas as pd
+
+
+class Cell(object):
+    """Defines a cell in a table with coordinates relative to a
+    left-bottom origin. (pdf coordinate space)
+
+    Parameters
+    ----------
+    x1 : float
+        x-coordinate of left-bottom point.
+    y1 : float
+        y-coordinate of left-bottom point.
+    x2 : float
+        x-coordinate of right-top point.
+    y2 : float
+        y-coordinate of right-top point.
+
+    Attributes
+    ----------
+    lb : tuple
+        Tuple representing left-bottom coordinates.
+    lt : tuple
+        Tuple representing left-top coordinates.
+    rb : tuple
+        Tuple representing right-bottom coordinates.
+    rt : tuple
+        Tuple representing right-top coordinates.
+    left : bool
+        Whether or not cell is bounded on the left.
+    right : bool
+        Whether or not cell is bounded on the right.
+    top : bool
+        Whether or not cell is bounded on the top.
+    bottom : bool
+        Whether or not cell is bounded on the bottom.
+    hspan : bool
+        Whether or not cell spans horizontally.
+    vspan : bool
+        Whether or not cell spans vertically.
+    text : string
+        Text assigned to cell.
+    bound
+
+    """
+
+    def __init__(self, x1, y1, x2, y2):
+        self.x1 = x1
+        self.y1 = y1
+        self.x2 = x2
+        self.y2 = y2
+        self.lb = (x1, y1)
+        self.lt = (x1, y2)
+        self.rb = (x2, y1)
+        self.rt = (x2, y2)
+        self.left = False
+        self.right = False
+        self.top = False
+        self.bottom = False
+        self.hspan = False
+        self.vspan = False
+        self._text = ''
+
+    def __repr__(self):
+        return '<Cell x1={} y1={} x2={} y2={}>'.format(
+            self.x1, self.y1, self.x2, self.y2)
+
+    @property
+    def text(self):
+        return self._text
+
+    @text.setter
+    def text(self, t):
+        self._text = ''.join([self._text, t])
+
+    @property
+    def bound(self):
+        """The number of sides on which the cell is bounded.
+        """
+        return self.top + self.bottom + self.left + self.right
+
+
+class Table(object):
+    """Defines a table with coordinates relative to a left-bottom
+    origin. (pdf coordinate space)
+
+    Parameters
+    ----------
+    cols : list
+        List of tuples representing column x-coordinates in increasing
+        order.
+    rows : list
+        List of tuples representing row y-coordinates in decreasing
+        order.
+
+    Attributes
+    ----------
+    df : object
+        pandas.DataFrame
+    shape : tuple
+        Shape of the table.
+    accuracy : float
+        Accuracy with which text was assigned to the cell.
+    whitespace : float
+        Percentage of whitespace in the table.
+    order : int
+        Table number on pdf page.
+    page : int
+        Pdf page number.
+    data
+    parsing_report
+
+    """
+    def __init__(self, cols, rows):
+        self.cols = cols
+        self.rows = rows
+        self.cells = [[Cell(c[0], r[1], c[1], r[0])
+                       for c in cols] for r in rows]
+        self.df = None
+        self.shape = (0, 0)
+        self.accuracy = 0
+        self.whitespace = 0
+        self.order = None
+        self.page = None
+
+    def __repr__(self):
+        return '<{} shape={}>'.format(self.__class__.__name__, self.shape)
+
+    @property
+    def data(self):
+        """Returns two-dimensional list of strings in table.
+        """
+        d = []
+        for row in self.cells:
+            d.append([cell.text.strip() for cell in row])
+        return d
+
+    @property
+    def parsing_report(self):
+        """Returns a parsing report with accuracy, %whitespace,
+        table number on page and page number.
+        """
+        # pretty?
+        report = {
+            'accuracy': self.accuracy,
+            'whitespace': self.whitespace,
+            'order': self.order,
+            'page': self.page
+        }
+        return report
+
+    def set_all_edges(self):
+        """Sets all table edges to True.
+        """
+        for row in self.cells:
+            for cell in row:
+                cell.left = cell.right = cell.top = cell.bottom = True
+        return self
+
+    def set_edges(self, vertical, horizontal, joint_close_tol=2):
+        """Sets a cell's edges to True depending on whether the cell's
+        coordinates overlap with the line's coordinates within a
+        tolerance.
+
+        Parameters
+        ----------
+        vertical : list
+            List of detected vertical lines.
+        horizontal : list
+            List of detected horizontal lines.
+        joint_close_tol : int, optional (default: 2)
+            Tolerance parameter used to decide whether the detected
+            lines and points lie close to each other.
+
+        """
+        for v in vertical:
+            # find closest x coord
+            # iterate over y coords and find closest start and end points
+            i = [i for i, t in enumerate(self.cols)
+                 if np.isclose(v[0], t[0], atol=joint_close_tol)]
+            j = [j for j, t in enumerate(self.rows)
+                 if np.isclose(v[3], t[0], atol=joint_close_tol)]
+            k = [k for k, t in enumerate(self.rows)
+                 if np.isclose(v[1], t[0], atol=joint_close_tol)]
+            if not j:
+                continue
+            J = j[0]
+            if i == [0]:  # only left edge
+                L = i[0]
+                if k:
+                    K = k[0]
+                    while J < K:
+                        self.cells[J][L].left = True
+                        J += 1
+                else:
+                    K = len(self.rows)
+                    while J < K:
+                        self.cells[J][L].left = True
+                        J += 1
+            elif i == []:  # only right edge
+                L = len(self.cols) - 1
+                if k:
+                    K = k[0]
+                    while J < K:
+                        self.cells[J][L].right = True
+                        J += 1
+                else:
+                    K = len(self.rows)
+                    while J < K:
+                        self.cells[J][L].right = True
+                        J += 1
+            else:  # both left and right edges
+                L = i[0]
+                if k:
+                    K = k[0]
+                    while J < K:
+                        self.cells[J][L].left = True
+                        self.cells[J][L - 1].right = True
+                        J += 1
+                else:
+                    K = len(self.rows)
+                    while J < K:
+                        self.cells[J][L].left = True
+                        self.cells[J][L - 1].right = True
+                        J += 1
+
+        for h in horizontal:
+            # find closest y coord
+            # iterate over x coords and find closest start and end points
+            i = [i for i, t in enumerate(self.rows)
+                 if np.isclose(h[1], t[0], atol=joint_close_tol)]
+            j = [j for j, t in enumerate(self.cols)
+                 if np.isclose(h[0], t[0], atol=joint_close_tol)]
+            k = [k for k, t in enumerate(self.cols)
+                 if np.isclose(h[2], t[0], atol=joint_close_tol)]
+            if not j:
+                continue
+            J = j[0]
+            if i == [0]:  # only top edge
+                L = i[0]
+                if k:
+                    K = k[0]
+                    while J < K:
+                        self.cells[L][J].top = True
+                        J += 1
+                else:
+                    K = len(self.cols)
+                    while J < K:
+                        self.cells[L][J].top = True
+                        J += 1
+            elif i == []:  # only bottom edge
+                L = len(self.rows) - 1
+                if k:
+                    K = k[0]
+                    while J < K:
+                        self.cells[L][J].bottom = True
+                        J += 1
+                else:
+                    K = len(self.cols)
+                    while J < K:
+                        self.cells[L][J].bottom = True
+                        J += 1
+            else:  # both top and bottom edges
+                L = i[0]
+                if k:
+                    K = k[0]
+                    while J < K:
+                        self.cells[L][J].top = True
+                        self.cells[L - 1][J].bottom = True
+                        J += 1
+                else:
+                    K = len(self.cols)
+                    while J < K:
+                        self.cells[L][J].top = True
+                        self.cells[L - 1][J].bottom = True
+                        J += 1
+
+        return self
+
+    def set_border(self):
+        """Sets table border edges to True.
+ """ + for r in range(len(self.rows)): + self.cells[r][0].left = True + self.cells[r][len(self.cols) - 1].right = True + for c in range(len(self.cols)): + self.cells[0][c].top = True + self.cells[len(self.rows) - 1][c].bottom = True + return self + + def set_span(self): + """Sets a cell's hspan or vspan attribute to True depending + on whether the cell spans horizontally or vertically. + """ + for row in self.cells: + for cell in row: + left = cell.left + right = cell.right + top = cell.top + bottom = cell.bottom + if cell.bound == 4: + continue + elif cell.bound == 3: + if not left and (right and top and bottom): + cell.hspan = True + elif not right and (left and top and bottom): + cell.hspan = True + elif not top and (left and right and bottom): + cell.vspan = True + elif not bottom and (left and right and top): + cell.vspan = True + elif cell.bound == 2: + if left and right and (not top and not bottom): + cell.vspan = True + elif top and bottom and (not left and not right): + cell.hspan = True + return self + + def to_csv(self, path, **kwargs): + """Write Table to a comma-separated values (csv) file. + """ + kw = { + 'encoding': 'utf-8', + 'index': False, + 'quoting': 1 + } + kw.update(kwargs) + self.df.to_csv(path, **kw) + + def to_json(self, path, **kwargs): + """Write Table to a JSON file. + """ + kw = { + 'orient': 'records' + } + kw.update(kwargs) + json_string = self.df.to_json(**kw) + with open(path, 'w') as f: + f.write(json_string) + + def to_excel(self, path, **kwargs): + """Write Table to an Excel file. + """ + kw = { + 'sheet_name': 'page-{}-table-{}'.format(self.page, self.order), + 'encoding': 'utf-8' + } + kw.update(kwargs) + writer = pd.ExcelWriter(path) + self.df.to_excel(writer, **kw) + writer.save() + + def to_html(self, path, **kwargs): + """Write Table to an HTML file. + """ + html_string = self.df.to_html(**kwargs) + with open(path, 'w') as f: + f.write(html_string) + + +class TableList(object): + """Defines a list of camelot.core.Table objects. Each table can + be accessed using its index. + + Attributes + ---------- + n : int + Number of tables in the list. + + """ + def __init__(self, tables): + self._tables = tables + + def __repr__(self): + return '<{} tables={}>'.format( + self.__class__.__name__, len(self._tables)) + + def __len__(self): + return len(self._tables) + + def __getitem__(self, idx): + return self._tables[idx] + + @staticmethod + def _format_func(table, f): + return getattr(table, 'to_{}'.format(f)) + + @property + def n(self): + return len(self._tables) + + def _write_file(self, f=None, **kwargs): + dirname = kwargs.get('dirname') + root = kwargs.get('root') + ext = kwargs.get('ext') + for table in self._tables: + filename = os.path.join('{}-page-{}-table-{}{}'.format( + root, table.page, table.order, ext)) + filepath = os.path.join(dirname, filename) + to_format = self._format_func(table, f) + to_format(filepath) + + def _compress_dir(self, **kwargs): + path = kwargs.get('path') + dirname = kwargs.get('dirname') + root = kwargs.get('root') + ext = kwargs.get('ext') + zipname = os.path.join(os.path.dirname(path), root) + '.zip' + with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: + for table in self._tables: + filename = os.path.join('{}-page-{}-table-{}{}'.format( + root, table.page, table.order, ext)) + filepath = os.path.join(dirname, filename) + z.write(filepath, os.path.basename(filepath)) + + def export(self, path, f='csv', compress=False): + """Exports the list of tables to specified file format. 
+ + Parameters + ---------- + path : str + Filepath + f : str + File format. Can be csv, json, excel and html. + compress : bool + Whether or not to add files to a ZIP archive. + + """ + dirname = os.path.dirname(path) + basename = os.path.basename(path) + root, ext = os.path.splitext(basename) + if compress: + dirname = tempfile.mkdtemp() + + kwargs = { + 'path': path, + 'dirname': dirname, + 'root': root, + 'ext': ext + } + + if f in ['csv', 'json', 'html']: + self._write_file(f=f, **kwargs) + if compress: + self._compress_dir(**kwargs) + elif f == 'excel': + filepath = os.path.join(dirname, basename) + writer = pd.ExcelWriter(filepath) + for table in self._tables: + sheet_name = 'page-{}-table-{}'.format(table.page, table.order) + table.df.to_excel(writer, sheet_name=sheet_name, encoding='utf-8') + writer.save() + if compress: + zipname = os.path.join(os.path.dirname(path), root) + '.zip' + with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: + z.write(filepath, os.path.basename(filepath)) + + +class Geometry(object): + def __init__(self): + self.text = [] + self.images = () + self.segments = () + self.tables = [] + + def __repr__(self): + return '<{} text={} images={} segments={} tables={}>'.format( + self.__class__.__name__, + len(self.text), + len(self.images), + len(self.segments), + len(self.tables)) + + +class GeometryList(object): + def __init__(self, geometry): + self.text = [g.text for g in geometry] + self.images = [g.images for g in geometry] + self.segments = [g.segments for g in geometry] + self.tables = [g.tables for g in geometry] + + def __repr__(self): + return '<{} text={} images={} segments={} tables={}>'.format( + self.__class__.__name__, + len(self.text), + len(self.images), + len(self.segments), + len(self.tables)) \ No newline at end of file diff --git a/camelot/handlers.py b/camelot/handlers.py new file mode 100644 index 0000000..8585432 --- /dev/null +++ b/camelot/handlers.py @@ -0,0 +1,144 @@ +import os +import tempfile + +from PyPDF2 import PdfFileReader, PdfFileWriter + +from .core import TableList, GeometryList +from .parsers import Stream, Lattice +from .utils import get_page_layout, get_text_objects, get_rotation + + +class PDFHandler(object): + """Handles all operations like temp directory creation, splitting + file into single page pdfs, parsing each pdf and then removing the + temp directory. + + Parameter + --------- + filename : str + Path to pdf file. + pages : str + Comma-separated page numbers to parse. + Example: 1,3,4 or 1,4-end + + """ + def __init__(self, filename, pages='1'): + self.filename = filename + if not self.filename.endswith('.pdf'): + raise TypeError("File format not supported.") + self.pages = self._get_pages(self.filename, pages) + self.tempdir = tempfile.mkdtemp() + + def _get_pages(self, filename, pages): + """Converts pages string to list of ints. + + Parameters + ---------- + filename : str + Path to pdf file. + pages : str + Comma-separated page numbers to parse. + Example: 1,3,4 or 1,4-end + + Returns + ------- + P : list + List of int page numbers. 
+ + """ + page_numbers = [] + if pages == '1': + page_numbers.append({'start': 1, 'end': 1}) + else: + infile = PdfFileReader(open(filename, 'rb'), strict=False) + if pages == 'all': + page_numbers.append({'start': 1, 'end': infile.getNumPages()}) + else: + for r in pages.split(','): + if '-' in r: + a, b = r.split('-') + if b == 'end': + b = infile.getNumPages() + page_numbers.append({'start': int(a), 'end': int(b)}) + else: + page_numbers.append({'start': int(r), 'end': int(r)}) + P = [] + for p in page_numbers: + P.extend(range(p['start'], p['end'] + 1)) + return sorted(set(P)) + + def _save_page(self, filename, page, temp): + """Saves specified page from pdf into a temporary directory. + + Parameters + ---------- + filename : str + Path to pdf file. + page : int + Page number + temp : str + Tmp directory + + """ + with open(filename, 'rb') as fileobj: + infile = PdfFileReader(fileobj, strict=False) + fpath = os.path.join(temp, 'page-{0}.pdf'.format(page)) + froot, fext = os.path.splitext(fpath) + p = infile.getPage(page - 1) + outfile = PdfFileWriter() + outfile.addPage(p) + with open(fpath, 'wb') as f: + outfile.write(f) + layout, dim = get_page_layout(fpath) + # fix rotated pdf + lttextlh = get_text_objects(layout, ltype="lh") + lttextlv = get_text_objects(layout, ltype="lv") + ltchar = get_text_objects(layout, ltype="char") + rotation = get_rotation(lttextlh, lttextlv, ltchar) + if rotation != '': + fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext]) + os.rename(fpath, fpath_new) + infile = PdfFileReader(open(fpath_new, 'rb'), strict=False) + outfile = PdfFileWriter() + p = infile.getPage(0) + if rotation == 'anticlockwise': + p.rotateClockwise(90) + elif rotation == 'clockwise': + p.rotateCounterClockwise(90) + outfile.addPage(p) + with open(fpath, 'wb') as f: + outfile.write(f) + + def parse(self, mesh=False, **kwargs): + """Extracts tables by calling parser.get_tables on all single + page pdfs. + + Parameters + ---------- + mesh : bool (default: False) + Whether or not to use Lattice method of parsing. Stream + is used by default. + kwargs : dict + See camelot.read_pdf kwargs. + + Returns + ------- + tables : camelot.core.TableList + List of tables found in pdf. + geometry : camelot.core.GeometryList + List of geometry objects (contours, lines, joints) + found in pdf. + + """ + for p in self.pages: + self._save_page(self.filename, p, self.tempdir) + pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p)) + for p in self.pages] + tables = [] + geometry = [] + parser = Stream(**kwargs) if not mesh else Lattice(**kwargs) + for p in pages: + t, g = parser.extract_tables(p) + tables.extend(t) + geometry.append(g) + return TableList(tables), GeometryList(geometry) \ No newline at end of file diff --git a/camelot/imgproc.py b/camelot/image_processing.py similarity index 69% rename from camelot/imgproc.py rename to camelot/image_processing.py index 1621bea..23923b2 100644 --- a/camelot/imgproc.py +++ b/camelot/image_processing.py @@ -1,3 +1,4 @@ +from __future__ import division from itertools import groupby from operator import itemgetter @@ -7,40 +8,38 @@ import numpy as np from .utils import merge_tuples -def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): +def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): """Thresholds an image using OpenCV's adaptiveThreshold. Parameters ---------- imagename : string Path to image file. - - invert : bool - Whether or not to invert the image. 
Useful when pdfs have - tables with lines in background. - (optional, default: False) - - blocksize: int + process_background : bool, optional (default: False) + Whether or not to process lines that are in background. + blocksize : int, optional (default: 15) Size of a pixel neighborhood that is used to calculate a threshold value for the pixel: 3, 5, 7, and so on. - c: float - Constant subtracted from the mean or weighted mean - (see the details below). Normally, it is positive but may be - zero or negative as well. + For more information, refer `OpenCV's adaptiveThreshold `_. + c : int, optional (default: -2) + Constant subtracted from the mean or weighted mean. + Normally, it is positive but may be zero or negative as well. + + For more information, refer `OpenCV's adaptiveThreshold `_. Returns ------- img : object numpy.ndarray representing the original image. - threshold : object numpy.ndarray representing the thresholded image. + """ img = cv2.imread(imagename) gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - if invert: + if process_background: threshold = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c) else: @@ -49,7 +48,7 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): return img, threshold -def find_lines(threshold, direction='horizontal', scale=15, iterations=0): +def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0): """Finds horizontal and vertical lines by applying morphological transformations on an image. @@ -57,38 +56,37 @@ def find_lines(threshold, direction='horizontal', scale=15, iterations=0): ---------- threshold : object numpy.ndarray representing the thresholded image. - - direction : string + direction : string, optional (default: 'horizontal') Specifies whether to find vertical or horizontal lines. - (default: 'horizontal') + line_size_scaling : int, optional (default: 15) + Factor by which the page dimensions will be divided to get + smallest length of lines that should be detected. - scale : int - Used to divide the height/width to get a structuring element - for morph transform. - (optional, default: 15) + The larger this value, smaller the detected lines. Making it + too large will lead to text being detected as lines. + iterations : int, optional (default: 0) + Number of times for erosion/dilation is applied. - iterations : int - Number of iterations for dilation. - (optional, default: 2) + For more information, refer `OpenCV's dilate `_. Returns ------- dmask : object numpy.ndarray representing pixels where vertical/horizontal lines lie. - lines : list List of tuples representing vertical/horizontal lines with coordinates relative to a left-top origin in - OpenCV's coordinate space. + image coordinate space. 
+ """ lines = [] if direction == 'vertical': - size = threshold.shape[0] // scale + size = threshold.shape[0] // line_size_scaling el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) elif direction == 'horizontal': - size = threshold.shape[1] // scale + size = threshold.shape[1] // line_size_scaling el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) elif direction is None: raise ValueError("Specify direction as either 'vertical' or" @@ -110,9 +108,9 @@ def find_lines(threshold, direction='horizontal', scale=15, iterations=0): x1, x2 = x, x + w y1, y2 = y, y + h if direction == 'vertical': - lines.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1)) + lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1)) elif direction == 'horizontal': - lines.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2)) + lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2)) return dmask, lines @@ -124,7 +122,6 @@ def find_table_contours(vertical, horizontal): ---------- vertical : object numpy.ndarray representing pixels where vertical lines lie. - horizontal : object numpy.ndarray representing pixels where horizontal lines lie. @@ -133,7 +130,8 @@ def find_table_contours(vertical, horizontal): cont : list List of tuples representing table boundaries. Each tuple is of the form (x, y, w, h) where (x, y) -> left-top, w -> width and - h -> height in OpenCV's coordinate space. + h -> height in image coordinate space. + """ mask = vertical + horizontal @@ -161,11 +159,9 @@ def find_table_joints(contours, vertical, horizontal): contours : list List of tuples representing table boundaries. Each tuple is of the form (x, y, w, h) where (x, y) -> left-top, w -> width and - h -> height in OpenCV's coordinate space. - + h -> height in image coordinate space. vertical : object numpy.ndarray representing pixels where vertical lines lie. - horizontal : object numpy.ndarray representing pixels where horizontal lines lie. @@ -174,9 +170,9 @@ def find_table_joints(contours, vertical, horizontal): tables : dict Dict with table boundaries as keys and list of intersections in that boundary as their value. - Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb - and (x2, y2) -> rt in OpenCV's coordinate space. + and (x2, y2) -> rt in image coordinate space. + """ joints = np.bitwise_and(vertical, horizontal) tables = {} @@ -194,32 +190,35 @@ def find_table_joints(contours, vertical, horizontal): joint_coords = [] for j in jc: jx, jy, jw, jh = cv2.boundingRect(j) - c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2 + c1, c2 = x + (2 * jx + jw) // 2, y + (2 * jy + jh) // 2 joint_coords.append((c1, c2)) tables[(x, y + h, x + w, y)] = joint_coords return tables -def remove_lines(threshold, line_scale=15): +def remove_lines(threshold, line_size_scaling=15): """Removes lines from a thresholded image. Parameters ---------- threshold : object numpy.ndarray representing the thresholded image. + line_size_scaling : int, optional (default: 15) + Factor by which the page dimensions will be divided to get + smallest length of lines that should be detected. - line_scale : int - Line scaling factor. - (optional, default: 15) + The larger this value, smaller the detected lines. Making it + too large will lead to text being detected as lines. Returns ------- threshold : object numpy.ndarray representing the thresholded image with horizontal and vertical lines removed. 
+ """ - size = threshold.shape[0] // line_scale + size = threshold.shape[0] // line_size_scaling vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) dilate_el = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10)) @@ -235,24 +234,26 @@ def remove_lines(threshold, line_scale=15): return threshold -def find_cuts(threshold, char_scale=200): +def find_cuts(threshold, char_size_scaling=200): """Finds cuts made by text projections on y-axis. Parameters ---------- threshold : object numpy.ndarray representing the thresholded image. + line_size_scaling : int, optional (default: 200) + Factor by which the page dimensions will be divided to get + smallest length of lines that should be detected. - char_scale : int - Char scaling factor. - (optional, default: 200) + The larger this value, smaller the detected lines. Making it + too large will lead to text being detected as lines. Returns ------- y_cuts : list List of cuts on y-axis. """ - size = threshold.shape[0] // char_scale + size = threshold.shape[0] // char_size_scaling char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) threshold = cv2.erode(threshold, char_el) @@ -268,5 +269,5 @@ def find_cuts(threshold, char_scale=200): contours = [cv2.boundingRect(c) for c in contours] y_cuts = [(c[1], c[1] + c[3]) for c in contours] y_cuts = list(merge_tuples(sorted(y_cuts))) - y_cuts = [(y_cuts[i][0] + y_cuts[i - 1][1]) / 2 for i in range(1, len(y_cuts))] + y_cuts = [(y_cuts[i][0] + y_cuts[i - 1][1]) // 2 for i in range(1, len(y_cuts))] return sorted(y_cuts, reverse=True) \ No newline at end of file diff --git a/camelot/io.py b/camelot/io.py new file mode 100644 index 0000000..33007d4 --- /dev/null +++ b/camelot/io.py @@ -0,0 +1,94 @@ +from .handlers import PDFHandler + + +def read_pdf(filepath, pages='1', mesh=False, **kwargs): + """Read PDF and return parsed data tables. + + Note: kwargs annotated with ^ can only be used with mesh=False + and kwargs annotated with * can only be used with mesh=True. + + Parameters + ---------- + filepath : str + Path to pdf file. + pages : str + Comma-separated page numbers to parse. + Example: 1,3,4 or 1,4-end + mesh : bool (default: False) + Whether or not to use Lattice method of parsing. Stream + is used by default. + table_area : list, optional (default: None) + List of table areas to analyze as strings of the form + x1,y1,x2,y2 where (x1, y1) -> left-top and + (x2, y2) -> right-bottom in pdf coordinate space. + columns^ : list, optional (default: None) + List of column x-coordinates as strings where the coordinates + are comma-separated. + split_text : bool, optional (default: False) + Whether or not to split a text line if it spans across + multiple cells. + flag_size : bool, optional (default: False) + Whether or not to highlight a substring using + if its size is different from rest of the string, useful for + super and subscripts. + row_close_tol^ : int, optional (default: 2) + Rows will be formed by combining text vertically + within this tolerance. + col_close_tol^ : int, optional (default: 0) + Columns will be formed by combining text horizontally + within this tolerance. + process_background* : bool, optional (default: False) + Whether or not to process lines that are in background. + line_size_scaling* : int, optional (default: 15) + Factor by which the page dimensions will be divided to get + smallest length of lines that should be detected. + + The larger this value, smaller the detected lines. 
Making it + too large will lead to text being detected as lines. + copy_text* : list, optional (default: None) + {'h', 'v'} + Select one or more strings from above and pass them as a list + to specify the direction in which text should be copied over + when a cell spans multiple rows or columns. + shift_text* : list, optional (default: ['l', 't']) + {'l', 'r', 't', 'b'} + Select one or more strings from above and pass them as a list + to specify where the text in a spanning cell should flow. + line_close_tol* : int, optional (default: 2) + Tolerance parameter used to merge vertical and horizontal + detected lines which lie close to each other. + joint_close_tol* : int, optional (default: 2) + Tolerance parameter used to decide whether the detected lines + and points lie close to each other. + threshold_blocksize : int, optional (default: 15) + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + + For more information, refer `OpenCV's adaptiveThreshold `_. + threshold_constant : int, optional (default: -2) + Constant subtracted from the mean or weighted mean. + Normally, it is positive but may be zero or negative as well. + + For more information, refer `OpenCV's adaptiveThreshold `_. + iterations : int, optional (default: 0) + Number of times for erosion/dilation is applied. + + For more information, refer `OpenCV's dilate `_. + margins : tuple + PDFMiner margins. (char_margin, line_margin, word_margin) + + For for information, refer `PDFMiner docs `_. + debug : bool, optional (default: False) + Whether or not to return all text objects on the page + which can be used to generate a matplotlib plot, to get + values for table_area(s) and debugging. + + Returns + ------- + tables : camelot.core.TableList + + """ + # validate kwargs? + p = PDFHandler(filepath, pages) + tables, __ = p.parse(mesh=mesh, **kwargs) + return tables \ No newline at end of file diff --git a/camelot/lattice.py b/camelot/lattice.py deleted file mode 100644 index 40803f6..0000000 --- a/camelot/lattice.py +++ /dev/null @@ -1,382 +0,0 @@ -from __future__ import division -import os -import sys -import copy -import types -import logging -import copy_reg -import warnings -import subprocess - -from .imgproc import (adaptive_threshold, find_lines, find_table_contours, - find_table_joints) -from .table import Table -from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox, - merge_close_values, get_table_index, get_score, count_empty, - encode_list, get_text_objects, get_page_layout) - - -__all__ = ['Lattice'] -logger = logging.getLogger('app_logger') - - -def _reduce_method(m): - if m.im_self is None: - return getattr, (m.im_class, m.im_func.func_name) - else: - return getattr, (m.im_self, m.im_func.func_name) -copy_reg.pickle(types.MethodType, _reduce_method) - - -def _reduce_index(t, idx, shift_text): - """Reduces index of a text object if it lies within a spanning - cell. - - Parameters - ---------- - table : object - camelot.table.Table - - idx : list - List of tuples of the form (r_idx, c_idx, text). - - shift_text : list - {'l', 'r', 't', 'b'} - Select one or more from above and pass them as a list to - specify where the text in a spanning cell should flow. - - Returns - ------- - indices : list - List of tuples of the form (idx, text) where idx is the reduced - index of row/column and text is the an lttextline substring. 
- """ - indices = [] - for r_idx, c_idx, text in idx: - for d in shift_text: - if d == 'l': - if t.cells[r_idx][c_idx].spanning_h: - while not t.cells[r_idx][c_idx].left: - c_idx -= 1 - if d == 'r': - if t.cells[r_idx][c_idx].spanning_h: - while not t.cells[r_idx][c_idx].right: - c_idx += 1 - if d == 't': - if t.cells[r_idx][c_idx].spanning_v: - while not t.cells[r_idx][c_idx].top: - r_idx -= 1 - if d == 'b': - if t.cells[r_idx][c_idx].spanning_v: - while not t.cells[r_idx][c_idx].bottom: - r_idx += 1 - indices.append((r_idx, c_idx, text)) - return indices - - -def _fill_spanning(t, fill=None): - """Fills spanning cells. - - Parameters - ---------- - t : object - camelot.table.Table - - fill : list - {'h', 'v'} - Specify to fill spanning cells in horizontal or vertical - direction. - (optional, default: None) - - Returns - ------- - t : object - camelot.table.Table - """ - for f in fill: - if f == "h": - for i in range(len(t.cells)): - for j in range(len(t.cells[i])): - if t.cells[i][j].get_text().strip() == '': - if t.cells[i][j].spanning_h and not t.cells[i][j].left: - t.cells[i][j].add_text(t.cells[i][j - 1].get_text()) - elif f == "v": - for i in range(len(t.cells)): - for j in range(len(t.cells[i])): - if t.cells[i][j].get_text().strip() == '': - if t.cells[i][j].spanning_v and not t.cells[i][j].top: - t.cells[i][j].add_text(t.cells[i - 1][j].get_text()) - return t - - -class Lattice: - """Lattice looks for lines in the pdf to form a table. - - If you want to give fill and mtol for each table when specifying - multiple table areas, make sure that the length of fill and mtol - is equal to the length of table_area. Mapping between them is based - on index. - - Parameters - ---------- - table_area : list - List of strings of the form x1,y1,x2,y2 where - (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's - coordinate space, denoting table areas to analyze. - (optional, default: None) - - fill : list - List of strings specifying directions to fill spanning cells. - {'h', 'v'} to fill spanning cells in horizontal or vertical - direction. - (optional, default: None) - - mtol : list - List of ints specifying m-tolerance parameters. - (optional, default: [2]) - - jtol : list - List of ints specifying j-tolerance parameters. - (optional, default: [2]) - - blocksize : int - Size of a pixel neighborhood that is used to calculate a - threshold value for the pixel: 3, 5, 7, and so on. - (optional, default: 15) - - threshold_constant : float - Constant subtracted from the mean or weighted mean - (see the details below). Normally, it is positive but may be - zero or negative as well. - (optional, default: -2) - - scale : int - Used to divide the height/width of a pdf to get a structuring - element for image processing. - (optional, default: 15) - - iterations : int - Number of iterations for dilation. - (optional, default: 0) - - invert : bool - Whether or not to invert the image. Useful when pdfs have - tables with lines in background. - (optional, default: False) - - margins : tuple - PDFMiner margins. (char_margin, line_margin, word_margin) - (optional, default: (1.0, 0.5, 0.1)) - - split_text : bool - Whether or not to split a text line if it spans across - different cells. - (optional, default: False) - - flag_size : bool - Whether or not to highlight a substring using - if its size is different from rest of the string, useful for - super and subscripts. 
- (optional, default: True) - - shift_text : list - {'l', 'r', 't', 'b'} - Select one or more from above and pass them as a list to - specify where the text in a spanning cell should flow. - (optional, default: ['l', 't']) - - debug : string - {'contour', 'line', 'joint', 'table'} - Set to one of the above values to generate a matplotlib plot - of detected contours, lines, joints and the table generated. - (optional, default: None) - """ - def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2], - blocksize=15, threshold_constant=-2, scale=15, iterations=0, - invert=False, margins=(1.0, 0.5, 0.1), split_text=False, - flag_size=True, shift_text=['l', 't'], debug=None): - - self.method = 'lattice' - self.table_area = table_area - self.fill = fill - self.mtol = mtol - self.jtol = jtol - self.blocksize = blocksize - self.threshold_constant = threshold_constant - self.scale = scale - self.iterations = iterations - self.invert = invert - self.char_margin, self.line_margin, self.word_margin = margins - self.split_text = split_text - self.flag_size = flag_size - self.shift_text = shift_text - self.debug = debug - - def get_tables(self, pdfname): - """Expects a single page pdf as input with rotation corrected. - - Parameters - ---------- - pdfname : string - Path to single page pdf file. - - Returns - ------- - page : dict - """ - layout, dim = get_page_layout(pdfname, char_margin=self.char_margin, - line_margin=self.line_margin, word_margin=self.word_margin) - lttextlh = get_text_objects(layout, ltype="lh") - lttextlv = get_text_objects(layout, ltype="lv") - ltchar = get_text_objects(layout, ltype="char") - width, height = dim - bname, __ = os.path.splitext(pdfname) - logger.info('Processing {0}.'.format(os.path.basename(bname))) - if not ltchar: - warnings.warn("{0}: Page contains no text.".format( - os.path.basename(bname))) - return {os.path.basename(bname): None} - - imagename = ''.join([bname, '.png']) - gs_call = [ - "-q", "-sDEVICE=png16m", "-o", imagename, "-r600", pdfname - ] - if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower(): - gs_call.insert(0, "gs") - else: - gs_call.insert(0, "gsc") - subprocess.call(gs_call, stdout=open(os.devnull, 'w'), - stderr=subprocess.STDOUT) - - img, threshold = adaptive_threshold(imagename, invert=self.invert, - blocksize=self.blocksize, c=self.threshold_constant) - pdf_x = width - pdf_y = height - img_x = img.shape[1] - img_y = img.shape[0] - sc_x_image = img_x / float(pdf_x) - sc_y_image = img_y / float(pdf_y) - sc_x_pdf = pdf_x / float(img_x) - sc_y_pdf = pdf_y / float(img_y) - factors_image = (sc_x_image, sc_y_image, pdf_y) - factors_pdf = (sc_x_pdf, sc_y_pdf, img_y) - - vmask, v_segments = find_lines(threshold, direction='vertical', - scale=self.scale, iterations=self.iterations) - hmask, h_segments = find_lines(threshold, direction='horizontal', - scale=self.scale, iterations=self.iterations) - - if self.table_area is not None: - areas = [] - for area in self.table_area: - x1, y1, x2, y2 = area.split(",") - x1 = float(x1) - y1 = float(y1) - x2 = float(x2) - y2 = float(y2) - x1, y1, x2, y2 = scale_to_image((x1, y1, x2, y2), factors_image) - areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) - table_bbox = find_table_joints(areas, vmask, hmask) - else: - contours = find_table_contours(vmask, hmask) - table_bbox = find_table_joints(contours, vmask, hmask) - - if len(self.mtol) == 1 and self.mtol[0] == 2: - mtolerance = copy.deepcopy(self.mtol) * len(table_bbox) - else: - mtolerance = copy.deepcopy(self.mtol) - - if 
len(self.jtol) == 1 and self.jtol[0] == 2: - jtolerance = copy.deepcopy(self.jtol) * len(table_bbox) - else: - jtolerance = copy.deepcopy(self.jtol) - - if self.debug: - self.debug_images = (img, table_bbox) - - table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments, - h_segments, factors_pdf) - - if self.debug: - self.debug_segments = (v_segments, h_segments) - self.debug_tables = [] - - page = {} - tables = {} - # sort tables based on y-coord - for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)): - # select elements which lie within table_bbox - table_data = {} - t_bbox = {} - v_s, h_s = segments_bbox(k, v_segments, h_segments) - t_bbox['horizontal'] = text_in_bbox(k, lttextlh) - t_bbox['vertical'] = text_in_bbox(k, lttextlv) - char_bbox = text_in_bbox(k, ltchar) - table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar))) - for direction in t_bbox: - t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) - cols, rows = zip(*table_bbox[k]) - cols, rows = list(cols), list(rows) - cols.extend([k[0], k[2]]) - rows.extend([k[1], k[3]]) - # sort horizontal and vertical segments - cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no]) - rows = merge_close_values( - sorted(rows, reverse=True), mtol=mtolerance[table_no]) - # make grid using x and y coord of shortlisted rows and cols - cols = [(cols[i], cols[i + 1]) - for i in range(0, len(cols) - 1)] - rows = [(rows[i], rows[i + 1]) - for i in range(0, len(rows) - 1)] - - table = Table(cols, rows) - # set table edges to True using ver+hor lines - table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no]) - nouse = table.nocont_ / (len(v_s) + len(h_s)) - table_data['line_p'] = 100 * (1 - nouse) - # set spanning cells to True - table = table.set_spanning() - # set table border edges to True - table = table.set_border_edges() - - if self.debug: - self.debug_tables.append(table) - - assignment_errors = [] - table_data['split_text'] = [] - table_data['superscript'] = [] - for direction in ['vertical', 'horizontal']: - for t in t_bbox[direction]: - indices, error = get_table_index( - table, t, direction, split_text=self.split_text, - flag_size=self.flag_size) - if indices[:2] != (-1, -1): - assignment_errors.append(error) - indices = _reduce_index(table, indices, shift_text=self.shift_text) - if len(indices) > 1: - table_data['split_text'].append(indices) - for r_idx, c_idx, text in indices: - if all(s in text for s in ['', '']): - table_data['superscript'].append((r_idx, c_idx, text)) - table.cells[r_idx][c_idx].add_text(text) - score = get_score([[100, assignment_errors]]) - table_data['score'] = score - - if self.fill is not None: - table = _fill_spanning(table, fill=self.fill) - ar = table.get_list() - ar = encode_list(ar) - table_data['data'] = ar - empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) - table_data['empty_p'] = empty_p - table_data['r_nempty_cells'] = r_nempty_cells - table_data['c_nempty_cells'] = c_nempty_cells - table_data['nrows'] = len(ar) - table_data['ncols'] = len(ar[0]) - tables['table-{0}'.format(table_no + 1)] = table_data - page[os.path.basename(bname)] = tables - - if self.debug: - return None - - return page \ No newline at end of file diff --git a/camelot/ocr.py b/camelot/ocr.py deleted file mode 100644 index 48d1983..0000000 --- a/camelot/ocr.py +++ /dev/null @@ -1,331 +0,0 @@ -import os -import copy -import logging -import subprocess - -import pyocr -from PIL import Image - -from .table import Table -from .imgproc import 
(adaptive_threshold, find_lines, find_table_contours, - find_table_joints, remove_lines, find_cuts) -from .utils import merge_close_values, encode_list - - -__all__ = ['OCRLattice', 'OCRStream'] -logger = logging.getLogger('app_logger') - - -class OCRLattice: - """Lattice, but for images. - - Parameters - ---------- - table_area : list - List of strings of the form x1,y1,x2,y2 where - (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's - coordinate space, denoting table areas to analyze. - (optional, default: None) - - mtol : list - List of ints specifying m-tolerance parameters. - (optional, default: [2]) - - blocksize : int - Size of a pixel neighborhood that is used to calculate a - threshold value for the pixel: 3, 5, 7, and so on. - (optional, default: 15) - - threshold_constant : float - Constant subtracted from the mean or weighted mean - (see the details below). Normally, it is positive but may be - zero or negative as well. - (optional, default: -2) - - dpi : int - Dots per inch. - (optional, default: 300) - - layout : int - Tesseract page segmentation mode. - (optional, default: 7) - - lang : string - Language to be used for OCR. - (optional, default: 'eng') - - scale : int - Used to divide the height/width of a pdf to get a structuring - element for image processing. - (optional, default: 15) - - iterations : int - Number of iterations for dilation. - (optional, default: 0) - - debug : string - {'contour', 'line', 'joint', 'table'} - Set to one of the above values to generate a matplotlib plot - of detected contours, lines, joints and the table generated. - (optional, default: None) - """ - def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2, - dpi=300, layout=7, lang="eng", scale=15, iterations=0, debug=None): - - self.method = 'ocrl' - self.table_area = table_area - self.mtol = mtol - self.blocksize = blocksize - self.threshold_constant = threshold_constant - self.tool = pyocr.get_available_tools()[0] # fix this - self.dpi = dpi - self.layout = layout - self.lang = lang - self.scale = scale - self.iterations = iterations - self.debug = debug - - def get_tables(self, pdfname): - if self.tool is None: - return None - - bname, __ = os.path.splitext(pdfname) - imagename = ''.join([bname, '.png']) - logger.info('Processing {0}.'.format(os.path.basename(bname))) - - gs_call = [ - "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi), - pdfname - ] - if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower(): - gs_call.insert(0, "gs") - else: - gs_call.insert(0, "gsc") - subprocess.call(gs_call, stdout=open(os.devnull, 'w'), - stderr=subprocess.STDOUT) - - img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize, - c=self.threshold_constant) - vmask, v_segments = find_lines(threshold, direction='vertical', - scale=self.scale, iterations=self.iterations) - hmask, h_segments = find_lines(threshold, direction='horizontal', - scale=self.scale, iterations=self.iterations) - - if self.table_area is not None: - areas = [] - for area in self.table_area: - x1, y1, x2, y2 = area.split(",") - x1 = int(float(x1)) - y1 = int(float(y1)) - x2 = int(float(x2)) - y2 = int(float(y2)) - areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) - table_bbox = find_table_joints(areas, vmask, hmask) - else: - contours = find_table_contours(vmask, hmask) - table_bbox = find_table_joints(contours, vmask, hmask) - - if self.debug: - self.debug_images = (img, table_bbox) - self.debug_segments = (v_segments, h_segments) - 
self.debug_tables = [] - - if len(self.mtol) == 1 and self.mtol[0] == 2: - mtolerance = copy.deepcopy(self.mtol) * len(table_bbox) - else: - mtolerance = copy.deepcopy(self.mtol) - - page = {} - tables = {} - table_no = 0 - for k in sorted(table_bbox.keys(), key=lambda x: x[1]): - table_data = {} - cols, rows = zip(*table_bbox[k]) - cols, rows = list(cols), list(rows) - cols.extend([k[0], k[2]]) - rows.extend([k[1], k[3]]) - cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no]) - rows = merge_close_values(sorted(rows, reverse=True), mtol=mtolerance[table_no]) - cols = [(cols[i], cols[i + 1]) - for i in range(0, len(cols) - 1)] - rows = [(rows[i], rows[i + 1]) - for i in range(0, len(rows) - 1)] - table = Table(cols, rows) - if self.debug: - self.debug_tables.append(table) - table.image = img[k[3]:k[1],k[0]:k[2]] - for i in range(len(table.cells)): - for j in range(len(table.cells[i])): - x1 = int(table.cells[i][j].x1) - y1 = int(table.cells[i][j].y1) - x2 = int(table.cells[i][j].x2) - y2 = int(table.cells[i][j].y2) - table.cells[i][j].image = img[y1:y2,x1:x2] - text = self.tool.image_to_string( - Image.fromarray(table.cells[i][j].image), - lang=self.lang, - builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout) - ) - table.cells[i][j].add_text(text) - ar = table.get_list() - ar.reverse() - ar = encode_list(ar) - table_data['data'] = ar - tables['table-{0}'.format(table_no + 1)] = table_data - table_no += 1 - page[os.path.basename(bname)] = tables - - if self.debug: - return None - - return page - - -class OCRStream: - """Stream, but for images. - - Parameters - ---------- - table_area : list - List of strings of the form x1,y1,x2,y2 where - (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's - coordinate space, denoting table areas to analyze. - (optional, default: None) - - columns : list - List of strings where each string is comma-separated values of - x-coordinates in OpenCV's coordinate space. - (optional, default: None) - - blocksize : int - Size of a pixel neighborhood that is used to calculate a - threshold value for the pixel: 3, 5, 7, and so on. - (optional, default: 15) - - threshold_constant : float - Constant subtracted from the mean or weighted mean - (see the details below). Normally, it is positive but may be - zero or negative as well. - (optional, default: -2) - - dpi : int - Dots per inch. - (optional, default: 300) - - layout : int - Tesseract page segmentation mode. - (optional, default: 7) - - lang : string - Language to be used for OCR. - (optional, default: 'eng') - - line_scale : int - Line scaling factor. - (optional, default: 15) - - char_scale : int - Char scaling factor. 
- (optional, default: 200) - """ - def __init__(self, table_area=None, columns=None, blocksize=15, - threshold_constant=-2, dpi=300, layout=7, lang="eng", - line_scale=15, char_scale=200, debug=False): - - self.method = 'ocrs' - self.table_area = table_area - self.columns = columns - self.blocksize = blocksize - self.threshold_constant = threshold_constant - self.tool = pyocr.get_available_tools()[0] # fix this - self.dpi = dpi - self.layout = layout - self.lang = lang - self.line_scale = line_scale - self.char_scale = char_scale - self.debug = debug - - def get_tables(self, pdfname): - if self.tool is None: - return None - - bname, __ = os.path.splitext(pdfname) - imagename = ''.join([bname, '.png']) - logger.info('Processing {0}.'.format(os.path.basename(bname))) - - gs_call = [ - "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi), - pdfname - ] - if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower(): - gs_call.insert(0, "gs") - else: - gs_call.insert(0, "gsc") - subprocess.call(gs_call, stdout=open(os.devnull, 'w'), - stderr=subprocess.STDOUT) - - img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize, - c=self.threshold_constant) - threshold = remove_lines(threshold, line_scale=self.line_scale) - height, width = threshold.shape - if self.debug: - self.debug_images = img - return None - - if self.table_area is not None: - if self.columns is not None: - if len(self.table_area) != len(self.columns): - raise ValueError("{0}: Length of table area and columns" - " should be equal.".format(os.path.basename(bname))) - - table_bbox = {} - for area in self.table_area: - x1, y1, x2, y2 = area.split(",") - x1 = int(float(x1)) - y1 = int(float(y1)) - x2 = int(float(x2)) - y2 = int(float(y2)) - table_bbox[(x1, y1, x2, y2)] = None - else: - table_bbox = {(0, 0, width, height): None} - - page = {} - tables = {} - table_no = 0 - for k in sorted(table_bbox.keys(), key=lambda x: x[1]): - if self.columns is None: - raise NotImplementedError - else: - table_data = {} - table_image = threshold[k[1]:k[3],k[0]:k[2]] - cols = self.columns[table_no].split(',') - cols = [float(c) for c in cols] - cols.insert(0, k[0]) - cols.append(k[2]) - cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)] - y_cuts = find_cuts(table_image, char_scale=self.char_scale) - rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)] - table = Table(cols, rows) - for i in range(len(table.cells)): - for j in range(len(table.cells[i])): - x1 = int(table.cells[i][j].x1) - y1 = int(table.cells[i][j].y1) - x2 = int(table.cells[i][j].x2) - y2 = int(table.cells[i][j].y2) - table.cells[i][j].image = table_image[y1:y2,x1:x2] - cell_image = Image.fromarray(table.cells[i][j].image) - text = self.tool.image_to_string( - cell_image, - lang=self.lang, - builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout) - ) - table.cells[i][j].add_text(text) - ar = table.get_list() - ar.reverse() - ar = encode_list(ar) - table_data['data'] = ar - tables['table-{0}'.format(table_no + 1)] = table_data - table_no += 1 - page[os.path.basename(bname)] = tables - - return page \ No newline at end of file diff --git a/camelot/parsers/__init__.py b/camelot/parsers/__init__.py new file mode 100644 index 0000000..e046b46 --- /dev/null +++ b/camelot/parsers/__init__.py @@ -0,0 +1,2 @@ +from .stream import Stream +from .lattice import Lattice \ No newline at end of file diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py new file mode 100644 index 0000000..3ffe146 
--- /dev/null +++ b/camelot/parsers/base.py @@ -0,0 +1,21 @@ +import os + +from ..core import Geometry +from ..utils import get_page_layout, get_text_objects + + +class BaseParser(object): + """Defines a base parser. + """ + def _generate_layout(self, filename): + self.filename = filename + self.layout, self.dimensions = get_page_layout( + self.filename, + char_margin=self.char_margin, + line_margin=self.line_margin, + word_margin=self.word_margin) + self.horizontal_text = get_text_objects(self.layout, ltype="lh") + self.vertical_text = get_text_objects(self.layout, ltype="lv") + self.pdf_width, self.pdf_height = self.dimensions + self.rootname, __ = os.path.splitext(self.filename) + self.g = Geometry() \ No newline at end of file diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py new file mode 100644 index 0000000..40a9040 --- /dev/null +++ b/camelot/parsers/lattice.py @@ -0,0 +1,336 @@ +from __future__ import division +import os +import copy +import logging +import subprocess + +import numpy as np +import pandas as pd + +from .base import BaseParser +from ..core import Table +from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, + merge_close_lines, get_table_index, compute_accuracy, + compute_whitespace, setup_logging, encode_) +from ..image_processing import (adaptive_threshold, find_lines, + find_table_contours, find_table_joints) + + +logger = setup_logging(__name__) + + +class Lattice(BaseParser): + """Lattice method of parsing looks for lines between text + to form a table. + + Parameters + ---------- + table_area : list, optional (default: None) + List of table areas to analyze as strings of the form + x1,y1,x2,y2 where (x1, y1) -> left-top and + (x2, y2) -> right-bottom in pdf coordinate space. + process_background : bool, optional (default: False) + Whether or not to process lines that are in the background. + line_size_scaling : int, optional (default: 15) + Factor by which the page dimensions will be divided to get + the smallest length of lines that should be detected. + + The larger this value, the smaller the detected lines. Making it + too large will lead to text being detected as lines. + copy_text : list, optional (default: None) + {'h', 'v'} + Select one or more strings from above and pass them as a list + to specify the direction in which text should be copied over + when a cell spans multiple rows or columns. + shift_text : list, optional (default: ['l', 't']) + {'l', 'r', 't', 'b'} + Select one or more strings from above and pass them as a list + to specify where the text in a spanning cell should flow. + split_text : bool, optional (default: False) + Whether or not to split a text line if it spans across + multiple cells. + flag_size : bool, optional (default: False) + Whether or not to highlight a substring using <s></s> + if its size is different from the rest of the string, useful for + super and subscripts. + line_close_tol : int, optional (default: 2) + Tolerance parameter used to merge vertical and horizontal + detected lines which lie close to each other. + joint_close_tol : int, optional (default: 2) + Tolerance parameter used to decide whether the detected lines + and points lie close to each other. + threshold_blocksize : int, optional (default: 15) + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + + For more information, refer `OpenCV's adaptiveThreshold `_. + threshold_constant : int, optional (default: -2) + Constant subtracted from the mean or weighted mean.
+ Normally, it is positive but may be zero or negative as well. + + For more information, refer `OpenCV's adaptiveThreshold `_. + iterations : int, optional (default: 0) + Number of times erosion/dilation is applied. + + For more information, refer `OpenCV's dilate `_. + margins : tuple, optional (default: (1.0, 0.5, 0.1)) + PDFMiner margins. (char_margin, line_margin, word_margin) + + For more information, refer `PDFMiner docs `_. + debug : bool, optional (default: False) + Whether or not to return all text objects on the page + which can be used to generate a matplotlib plot, to get + values for table_area(s) and debugging. + + """ + def __init__(self, table_area=None, process_background=False, + line_size_scaling=15, copy_text=None, shift_text=['l', 't'], + split_text=False, flag_size=False, line_close_tol=2, + joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, + iterations=0, margins=(1.0, 0.5, 0.1), debug=False): + self.table_area = table_area + self.process_background = process_background + self.line_size_scaling = line_size_scaling + self.copy_text = copy_text + self.shift_text = shift_text + self.split_text = split_text + self.flag_size = flag_size + self.line_close_tol = line_close_tol + self.joint_close_tol = joint_close_tol + self.threshold_blocksize = threshold_blocksize + self.threshold_constant = threshold_constant + self.iterations = iterations + self.char_margin, self.line_margin, self.word_margin = margins + self.debug = debug + + @staticmethod + def _reduce_index(t, idx, shift_text): + """Reduces index of a text object if it lies within a spanning + cell. + + Parameters + ---------- + t : camelot.core.Table + idx : list + List of tuples of the form (r_idx, c_idx, text). + shift_text : list + {'l', 'r', 't', 'b'} + Select one or more strings from above and pass them as a + list to specify where the text in a spanning cell should + flow. + + Returns + ------- + indices : list + List of tuples of the form (r_idx, c_idx, text) where + r_idx and c_idx are new row and column indices for text. + + """ + indices = [] + for r_idx, c_idx, text in idx: + for d in shift_text: + if d == 'l': + if t.cells[r_idx][c_idx].hspan: + while not t.cells[r_idx][c_idx].left: + c_idx -= 1 + if d == 'r': + if t.cells[r_idx][c_idx].hspan: + while not t.cells[r_idx][c_idx].right: + c_idx += 1 + if d == 't': + if t.cells[r_idx][c_idx].vspan: + while not t.cells[r_idx][c_idx].top: + r_idx -= 1 + if d == 'b': + if t.cells[r_idx][c_idx].vspan: + while not t.cells[r_idx][c_idx].bottom: + r_idx += 1 + indices.append((r_idx, c_idx, text)) + return indices + + @staticmethod + def _copy_spanning_text(t, copy_text=None): + """Copies over text in empty spanning cells. + + Parameters + ---------- + t : camelot.core.Table + copy_text : list, optional (default: None) + {'h', 'v'} + Select one or more strings from above and pass them as a list + to specify the direction in which text should be copied over + when a cell spans multiple rows or columns.
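To make `copy_text` and `shift_text` concrete, here is a minimal sketch of driving the new `Lattice` parser directly on a single-page PDF, the way the handler does after splitting pages; the filename is a placeholder and follows the `page-N.pdf` naming that `_generate_table` expects when it derives `table.page` from the root name.

```python
# Minimal sketch, assuming a single-page PDF saved as 'page-1.pdf'
# (the handler normally splits multi-page files first).
from camelot.parsers import Lattice

# Copy text into vertically spanning cells; let text in spanning
# cells flow towards the left/top edges (the defaults).
parser = Lattice(copy_text=['v'], shift_text=['l', 't'])
tables, geometry = parser.extract_tables('page-1.pdf')
for table in tables:
    print table.df        # parsed cells as a pandas DataFrame
    print table.accuracy  # score computed from text-assignment errors
```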
+ + Returns + ------- + t : camelot.core.Table + + """ + for f in copy_text: + if f == "h": + for i in range(len(t.cells)): + for j in range(len(t.cells[i])): + if t.cells[i][j].text.strip() == '': + if t.cells[i][j].hspan and not t.cells[i][j].left: + t.cells[i][j].text = t.cells[i][j - 1].text + elif f == "v": + for i in range(len(t.cells)): + for j in range(len(t.cells[i])): + if t.cells[i][j].text.strip() == '': + if t.cells[i][j].vspan and not t.cells[i][j].top: + t.cells[i][j].text = t.cells[i - 1][j].text + return t + + def _generate_image(self): + self.imagename = ''.join([self.rootname, '.png']) + gs_call = [ + "-q", "-sDEVICE=png16m", "-o", self.imagename, "-r600", self.filename + ] + if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower(): + gs_call.insert(0, "gs") + else: + gs_call.insert(0, "gsc") + subprocess.call(gs_call, stdout=open(os.devnull, 'w'), + stderr=subprocess.STDOUT) + + def _generate_table_bbox(self): + self.image, self.threshold = adaptive_threshold(self.imagename, process_background=self.process_background, + blocksize=self.threshold_blocksize, c=self.threshold_constant) + image_width = self.image.shape[1] + image_height = self.image.shape[0] + image_width_scaler = image_width / float(self.pdf_width) + image_height_scaler = image_height / float(self.pdf_height) + pdf_width_scaler = self.pdf_width / float(image_width) + pdf_height_scaler = self.pdf_height / float(image_height) + image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) + pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) + + vertical_mask, vertical_segments = find_lines( + self.threshold, direction='vertical', + line_size_scaling=self.line_size_scaling, iterations=self.iterations) + horizontal_mask, horizontal_segments = find_lines( + self.threshold, direction='horizontal', + line_size_scaling=self.line_size_scaling, iterations=self.iterations) + + if self.table_area is not None: + areas = [] + for area in self.table_area: + x1, y1, x2, y2 = area.split(",") + x1 = float(x1) + y1 = float(y1) + x2 = float(x2) + y2 = float(y2) + x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers) + areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) + table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask) + else: + contours = find_table_contours(vertical_mask, horizontal_mask) + table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask) + + self.table_bbox_unscaled = copy.deepcopy(table_bbox) + + self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image( + table_bbox, vertical_segments, horizontal_segments, pdf_scalers) + + def _generate_columns_and_rows(self, table_idx, tk): + # select elements which lie within table_bbox + t_bbox = {} + v_s, h_s = segments_in_bbox( + tk, self.vertical_segments, self.horizontal_segments) + t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text) + t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text) + self.t_bbox = t_bbox + + for direction in t_bbox: + t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) + + cols, rows = zip(*self.table_bbox[tk]) + cols, rows = list(cols), list(rows) + cols.extend([tk[0], tk[2]]) + rows.extend([tk[1], tk[3]]) + # sort horizontal and vertical segments + cols = merge_close_lines( + sorted(cols), line_close_tol=self.line_close_tol) + rows = merge_close_lines( + sorted(rows, reverse=True), line_close_tol=self.line_close_tol) + # make grid using x and y coord of shortlisted rows and cols + cols = [(cols[i], cols[i + 1]) + for i in 
range(0, len(cols) - 1)] + rows = [(rows[i], rows[i + 1]) + for i in range(0, len(rows) - 1)] + + return cols, rows, v_s, h_s + + def _generate_table(self, table_idx, cols, rows, **kwargs): + v_s = kwargs.get('v_s') + h_s = kwargs.get('h_s') + if v_s is None or h_s is None: + raise ValueError('No segments found on {}'.format(self.rootname)) + + table = Table(cols, rows) + # set table edges to True using ver+hor lines + table = table.set_edges(v_s, h_s, joint_close_tol=self.joint_close_tol) + # set table border edges to True + table = table.set_border() + # set spanning cells to True + table = table.set_span() + + pos_errors = [] + for direction in self.t_bbox: + for t in self.t_bbox[direction]: + indices, error = get_table_index( + table, t, direction, split_text=self.split_text, + flag_size=self.flag_size) + if indices[:2] != (-1, -1): + pos_errors.append(error) + indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text) + for r_idx, c_idx, text in indices: + table.cells[r_idx][c_idx].text = text + accuracy = compute_accuracy([[100, pos_errors]]) + + if self.copy_text is not None: + table = Lattice._copy_spanning_text(table, copy_text=self.copy_text) + + data = table.data + data = encode_(data) + table.df = pd.DataFrame(data) + table.shape = table.df.shape + + whitespace = compute_whitespace(data) + table.accuracy = accuracy + table.whitespace = whitespace + table.order = table_idx + 1 + table.page = int(os.path.basename(self.rootname).replace('page-', '')) + + return table + + def extract_tables(self, filename): + logger.info('Processing {}'.format(os.path.basename(filename))) + self._generate_layout(filename) + + if not self.horizontal_text: + logger.info("No tables found on {}".format( + os.path.basename(self.rootname))) + return [], self.g + + self._generate_image() + self._generate_table_bbox() + + _tables = [] + # sort tables based on y-coord + for table_idx, tk in enumerate(sorted(self.table_bbox.keys(), + key=lambda x: x[1], reverse=True)): + cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk) + table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) + _tables.append(table) + + if self.debug: + text = [] + text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) + text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) + self.g.text = text + self.g.images = (self.image, self.table_bbox_unscaled) + self.g.segments = (self.vertical_segments, self.horizontal_segments) + self.g.tables = _tables + + return _tables, self.g \ No newline at end of file diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py new file mode 100644 index 0000000..f547bf0 --- /dev/null +++ b/camelot/parsers/stream.py @@ -0,0 +1,370 @@ +from __future__ import division +import os +import logging + +import numpy as np +import pandas as pd + +from .base import BaseParser +from ..core import Table +from ..utils import (text_in_bbox, get_table_index, compute_accuracy, + compute_whitespace, setup_logging, encode_) + + +logger = setup_logging(__name__) + + +class Stream(BaseParser): + """Stream method of parsing looks for spaces between text + to form a table. + + If you want to specify columns when specifying multiple table + areas, make sure that the lengths of both lists are equal. + + Parameters + ---------- + table_area : list, optional (default: None) + List of table areas to analyze as strings of the form + x1,y1,x2,y2 where (x1, y1) -> left-top and + (x2, y2) -> right-bottom in pdf coordinate space.
+ columns : list, optional (default: None) + List of column x-coordinates as strings where the coordinates + are comma-separated. + split_text : bool, optional (default: False) + Whether or not to split a text line if it spans across + multiple cells. + flag_size : bool, optional (default: False) + Whether or not to highlight a substring using <s></s> + if its size is different from the rest of the string, useful for + super and subscripts. + row_close_tol : int, optional (default: 2) + Rows will be formed by combining text vertically + within this tolerance. + col_close_tol : int, optional (default: 0) + Columns will be formed by combining text horizontally + within this tolerance. + margins : tuple, optional (default: (1.0, 0.5, 0.1)) + PDFMiner margins. (char_margin, line_margin, word_margin) + + For more information, refer `PDFMiner docs `_. + debug : bool, optional (default: False) + Whether or not to return all text objects on the page + which can be used to generate a matplotlib plot, to get + values for table_area(s), columns and debugging. + + """ + def __init__(self, table_area=None, columns=None, split_text=False, + flag_size=False, row_close_tol=2, col_close_tol=0, + margins=(1.0, 0.5, 0.1), debug=False): + self.table_area = table_area + self.columns = columns + self._validate_columns() + self.split_text = split_text + self.flag_size = flag_size + self.row_close_tol = row_close_tol + self.col_close_tol = col_close_tol + self.char_margin, self.line_margin, self.word_margin = margins + self.debug = debug + + @staticmethod + def _text_bbox(t_bbox): + """Returns bounding box for the text present on a page. + + Parameters + ---------- + t_bbox : dict + Dict with two keys 'horizontal' and 'vertical' with lists of + LTTextLineHorizontals and LTTextLineVerticals respectively. + + Returns + ------- + text_bbox : tuple + Tuple (x0, y0, x1, y1) in pdf coordinate space. + + """ + xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]]) + ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]]) + xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]]) + ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]]) + text_bbox = (xmin, ymin, xmax, ymax) + return text_bbox + + @staticmethod + def _group_rows(text, row_close_tol=2): + """Groups PDFMiner text objects into rows vertically + within a tolerance. + + Parameters + ---------- + text : list + List of PDFMiner text objects. + row_close_tol : int, optional (default: 2) + + Returns + ------- + rows : list + Two-dimensional list of text objects grouped into rows. + + """ + row_y = 0 + rows = [] + temp = [] + for t in text: + # is checking for upright necessary? + # if t.get_text().strip() and all([obj.upright for obj in t._objs if + # type(obj) is LTChar]): + if t.get_text().strip(): + if not np.isclose(row_y, t.y0, atol=row_close_tol): + rows.append(sorted(temp, key=lambda t: t.x0)) + temp = [] + row_y = t.y0 + temp.append(t) + rows.append(sorted(temp, key=lambda t: t.x0)) + __ = rows.pop(0) # hacky + return rows + + @staticmethod + def _merge_columns(l, col_close_tol=0): + """Merges column boundaries horizontally if they overlap + or lie within a tolerance. + + Parameters + ---------- + l : list + List of column x-coordinate tuples. + col_close_tol : int, optional (default: 0) + + Returns + ------- + merged : list + List of merged column x-coordinate tuples.
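The merging rule is easier to follow on concrete numbers. Below is a simplified stand-in for the non-negative-tolerance branch of `_merge_columns`, with `np.isclose` replaced by an absolute-difference check; `merge_columns_sketch` is a hypothetical name used only for illustration.

```python
# Sketch of the non-negative-tolerance branch of _merge_columns:
# overlapping or near-touching x-intervals collapse into one column.
def merge_columns_sketch(intervals, tol=0):
    merged = []
    for higher in sorted(intervals):
        if merged and (higher[0] <= merged[-1][1] or
                       abs(higher[0] - merged[-1][1]) <= tol):
            lower = merged[-1]
            merged[-1] = (min(lower[0], higher[0]), max(lower[1], higher[1]))
        else:
            merged.append(higher)
    return merged

# (10, 50) and (48, 90) overlap, so three candidates become two columns.
assert merge_columns_sketch([(10, 50), (48, 90), (120, 150)]) == [(10, 90), (120, 150)]
```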
+ + """ + merged = [] + for higher in l: + if not merged: + merged.append(higher) + else: + lower = merged[-1] + if col_close_tol >= 0: + if (higher[0] <= lower[1] or + np.isclose(higher[0], lower[1], atol=col_close_tol)): + upper_bound = max(lower[1], higher[1]) + lower_bound = min(lower[0], higher[0]) + merged[-1] = (lower_bound, upper_bound) + else: + merged.append(higher) + elif col_close_tol < 0: + if higher[0] <= lower[1]: + if np.isclose(higher[0], lower[1], atol=abs(col_close_tol)): + merged.append(higher) + else: + upper_bound = max(lower[1], higher[1]) + lower_bound = min(lower[0], higher[0]) + merged[-1] = (lower_bound, upper_bound) + else: + merged.append(higher) + return merged + + @staticmethod + def _join_rows(rows_grouped, text_y_max, text_y_min): + """Makes row coordinates continuous. + + Parameters + ---------- + rows_grouped : list + Two-dimensional list of text objects grouped into rows. + text_y_max : int + text_y_min : int + + Returns + ------- + rows : list + List of continuous row y-coordinate tuples. + + """ + row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) + if len(r) > 0 else 0 for r in rows_grouped] + rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] + rows.insert(0, text_y_max) + rows.append(text_y_min) + rows = [(rows[i], rows[i + 1]) + for i in range(0, len(rows) - 1)] + return rows + + @staticmethod + def _add_columns(cols, text, row_close_tol): + """Adds columns to existing list by taking into account + the text that lies outside the current column x-coordinates. + + Parameters + ---------- + cols : list + List of column x-coordinate tuples. + text : list + List of PDFMiner text objects. + ytol : int + + Returns + ------- + cols : list + Updated list of column x-coordinate tuples. + + """ + if text: + text = Stream._group_rows(text, row_close_tol=row_close_tol) + elements = [len(r) for r in text] + new_cols = [(t.x0, t.x1) + for r in text if len(r) == max(elements) for t in r] + cols.extend(Stream._merge_columns(sorted(new_cols))) + return cols + + @staticmethod + def _join_columns(cols, text_x_min, text_x_max): + """Makes column coordinates continuous. + + Parameters + ---------- + cols : list + List of column x-coordinate tuples. + text_x_min : int + text_y_max : int + + Returns + ------- + cols : list + Updated list of column x-coordinate tuples. 
+ + """ + cols = sorted(cols) + cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] + cols.insert(0, text_x_min) + cols.append(text_x_max) + cols = [(cols[i], cols[i + 1]) + for i in range(0, len(cols) - 1)] + return cols + + def _validate_columns(self): + if self.table_area is not None and self.columns is not None: + if len(self.table_area) != len(self.columns): + raise ValueError("Length of table_area and columns" + " should be equal") + + def _generate_table_bbox(self): + if self.table_area is not None: + table_bbox = {} + for area in self.table_area: + x1, y1, x2, y2 = area.split(",") + x1 = float(x1) + y1 = float(y1) + x2 = float(x2) + y2 = float(y2) + table_bbox[(x1, y2, x2, y1)] = None + else: + table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} + self.table_bbox = table_bbox + + def _generate_columns_and_rows(self, table_idx, tk): + # select elements which lie within table_bbox + t_bbox = {} + t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text) + t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text) + self.t_bbox = t_bbox + + for direction in self.t_bbox: + self.t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) + + text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) + rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_close_tol=self.row_close_tol) + rows = self._join_rows(rows_grouped, text_y_max, text_y_min) + elements = [len(r) for r in rows_grouped] + + if self.columns is not None and self.columns[table_idx] != "": + # user has to input boundary columns too + # take (0, pdf_width) by default + # similar to else condition + # len can't be 1 + cols = self.columns[table_idx].split(',') + cols = [float(c) for c in cols] + cols.insert(0, text_x_min) + cols.append(text_x_max) + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + else: + ncols = max(set(elements), key=elements.count) + if ncols == 1: + logger.info("No tables found on {}".format( + os.path.basename(self.rootname))) + cols = [(t.x0, t.x1) + for r in rows_grouped if len(r) == ncols for t in r] + cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol) + inner_text = [] + for i in range(1, len(cols)): + left = cols[i - 1][1] + right = cols[i][0] + inner_text.extend([t for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > left and t.x1 < right]) + outer_text = [t for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] + inner_text.extend(outer_text) + cols = self._add_columns(cols, inner_text, self.row_close_tol) + cols = self._join_columns(cols, text_x_min, text_x_max) + + return cols, rows + + def _generate_table(self, table_idx, cols, rows, **kwargs): + table = Table(cols, rows) + table = table.set_all_edges() + pos_errors = [] + for direction in self.t_bbox: + for t in self.t_bbox[direction]: + indices, error = get_table_index( + table, t, direction, split_text=self.split_text, + flag_size=self.flag_size) + if indices[:2] != (-1, -1): + pos_errors.append(error) + for r_idx, c_idx, text in indices: + table.cells[r_idx][c_idx].text = text + accuracy = compute_accuracy([[100, pos_errors]]) + + data = table.data + data = encode_(data) + table.df = pd.DataFrame(data) + table.shape = table.df.shape + + whitespace = compute_whitespace(data) + table.accuracy = accuracy + table.whitespace = whitespace + table.order = table_idx + 1 + table.page = int(os.path.basename(self.rootname).replace('page-', '')) + + return table + + def 
extract_tables(self, filename): + logger.info('Processing {}'.format(os.path.basename(filename))) + self._generate_layout(filename) + + if not self.horizontal_text: + logger.info("No tables found on {}".format( + os.path.basename(self.rootname))) + return [], self.g + + self._generate_table_bbox() + + _tables = [] + # sort tables based on y-coord + for table_idx, tk in enumerate(sorted(self.table_bbox.keys(), + key=lambda x: x[1], reverse=True)): + cols, rows = self._generate_columns_and_rows(table_idx, tk) + table = self._generate_table(table_idx, cols, rows) + _tables.append(table) + + if self.debug: + text = [] + text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) + text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) + self.g.text = text + self.g.tables = _tables + + return _tables, self.g \ No newline at end of file diff --git a/camelot/pdf.py b/camelot/pdf.py deleted file mode 100644 index 08fd26c..0000000 --- a/camelot/pdf.py +++ /dev/null @@ -1,268 +0,0 @@ -import os -import shutil -import tempfile -import itertools -import multiprocessing as mp -from functools import partial - -import cv2 -from PyPDF2 import PdfFileReader, PdfFileWriter - -from .utils import get_page_layout, get_text_objects, get_rotation - - -__all__ = ['Pdf'] - - -def _parse_page_numbers(pagenos): - """Converts list of dicts to list of ints. - - Parameters - ---------- - pagenos : list - List of dicts representing page ranges. A dict must have only - two keys named 'start' and 'end' having int as their value. - - Returns - ------- - page_numbers : list - List of int page numbers. - """ - page_numbers = [] - for p in pagenos: - page_numbers.extend(range(p['start'], p['end'] + 1)) - page_numbers = sorted(set(page_numbers)) - return page_numbers - - -def _save_page(temp, pdfname, pageno): - with open(pdfname, 'rb') as pdffile: - infile = PdfFileReader(pdffile, strict=False) - sp_path = os.path.join(temp, 'page-{0}.pdf'.format(pageno)) - sp_name, sp_ext = os.path.splitext(sp_path) - page = infile.getPage(pageno - 1) - outfile = PdfFileWriter() - outfile.addPage(page) - with open(sp_path, 'wb') as f: - outfile.write(f) - layout, dim = get_page_layout(sp_path) - lttextlh = get_text_objects(layout, ltype="lh") - lttextlv = get_text_objects(layout, ltype="lv") - ltchar = get_text_objects(layout, ltype="char") - rotation = get_rotation(lttextlh, lttextlv, ltchar) - if rotation != '': - sp_new_path = ''.join([sp_name.replace('page', 'p'), '_rotated', sp_ext]) - os.rename(sp_path, sp_new_path) - sp_in = PdfFileReader(open(sp_new_path, 'rb'), - strict=False) - sp_out = PdfFileWriter() - sp_page = sp_in.getPage(0) - if rotation == 'left': - sp_page.rotateClockwise(90) - elif rotation == 'right': - sp_page.rotateCounterClockwise(90) - sp_out.addPage(sp_page) - with open(sp_path, 'wb') as pdf_out: - sp_out.write(pdf_out) - - -class Pdf: - """Pdf manager. - Handles all operations like temp directory creation, splitting file - into single page pdfs, running extraction using multiple processes - and removing the temp directory. - - Parameters - ---------- - extractor : object - camelot.stream.Stream or camelot.lattice.Lattice extractor - object. - - pdfname : string - Path to pdf file. - - pagenos : list - List of dicts representing page ranges. A dict must have only - two keys named 'start' and 'end' having int as their value. - (optional, default: [{'start': 1, 'end': 1}]) - - parallel : bool - Whether or not to run using multiple processes. 
- (optional, default: False) - - clean : bool - Whether or not to remove the temp directory. - (optional, default: False) - """ - - def __init__(self, extractor, pdfname, pagenos=[{'start': 1, 'end': 1}], - parallel=False, clean=False): - - self.extractor = extractor - self.pdfname = pdfname - if not self.pdfname.endswith('.pdf'): - raise TypeError("File format not supported.") - self.pagenos = _parse_page_numbers(pagenos) - self.parallel = parallel - if self.parallel: - self.cpu_count = mp.cpu_count() - self.pool = mp.Pool(processes=self.cpu_count) - self.clean = clean - self.temp = tempfile.mkdtemp() - - def split(self): - """Splits file into single page pdfs. - """ - if self.parallel: - pfunc = partial(_save_page, self.temp, self.pdfname) - self.pool.map(pfunc, self.pagenos) - else: - for p in self.pagenos: - _save_page(self.temp, self.pdfname, p) - - - def extract(self): - """Runs table extraction by calling extractor.get_tables - on all single page pdfs. - """ - self.split() - pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p)) - for p in self.pagenos] - if self.parallel: - tables = self.pool.map(self.extractor.get_tables, pages) - tables = {k: v for d in tables if d is not None for k, v in d.items()} - else: - tables = {} - if self.extractor.debug: - if self.extractor.method == 'stream': - self.debug = self.extractor.debug - self.debug_text = [] - elif self.extractor.method in ['lattice', 'ocrl']: - self.debug = self.extractor.debug - self.debug_images = [] - self.debug_segments = [] - self.debug_tables = [] - elif self.extractor.method == 'ocrs': - self.debug = self.extractor.debug - self.debug_images = [] - for p in pages: - table = self.extractor.get_tables(p) - if table is not None: - tables.update(table) - if self.extractor.debug: - if self.extractor.method == 'stream': - self.debug_text.append(self.extractor.debug_text) - elif self.extractor.method in ['lattice', 'ocr']: - self.debug_images.append(self.extractor.debug_images) - self.debug_segments.append(self.extractor.debug_segments) - self.debug_tables.append(self.extractor.debug_tables) - elif self.extractor.method == 'ocrs': - self.debug_images.append(self.extractor.debug_images) - if self.clean: - self.remove_tempdir() - return tables - - def remove_tempdir(self): - """Removes temporary directory that was created to save single - page pdfs and their images. - """ - shutil.rmtree(self.temp) - - def debug_plot(self): - """Generates a matplotlib plot based on the selected extractor - debug option. 
- """ - import matplotlib.pyplot as plt - import matplotlib.patches as patches - - if self.debug is True: - if hasattr(self, 'debug_text'): - for text in self.debug_text: - fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') - xs, ys = [], [] - for t in text: - xs.extend([t[0], t[1]]) - ys.extend([t[2], t[3]]) - ax.add_patch( - patches.Rectangle( - (t[0], t[1]), - t[2] - t[0], - t[3] - t[1] - ) - ) - ax.set_xlim(min(xs) - 10, max(xs) + 10) - ax.set_ylim(min(ys) - 10, max(ys) + 10) - plt.show() - elif hasattr(self, 'debug_images'): - for img in self.debug_images: - plt.imshow(img) - plt.show() - elif self.debug == 'contour': - try: - for img, table_bbox in self.debug_images: - for t in table_bbox.keys(): - cv2.rectangle(img, (t[0], t[1]), - (t[2], t[3]), (255, 0, 0), 3) - plt.imshow(img) - plt.show() - except AttributeError: - raise ValueError("This option can only be used with Lattice.") - elif self.debug == 'joint': - try: - for img, table_bbox in self.debug_images: - x_coord = [] - y_coord = [] - for k in table_bbox.keys(): - for coord in table_bbox[k]: - x_coord.append(coord[0]) - y_coord.append(coord[1]) - max_x, max_y = max(x_coord), max(y_coord) - plt.plot(x_coord, y_coord, 'ro') - plt.axis([0, max_x + 100, max_y + 100, 0]) - plt.imshow(img) - plt.show() - except AttributeError: - raise ValueError("This option can only be used with Lattice.") - elif self.debug == 'line': - try: - for v_s, h_s in self.debug_segments: - for v in v_s: - plt.plot([v[0], v[2]], [v[1], v[3]]) - for h in h_s: - plt.plot([h[0], h[2]], [h[1], h[3]]) - plt.show() - except AttributeError: - raise ValueError("This option can only be used with Lattice.") - elif self.debug == 'table': - try: - for tables in self.debug_tables: - for table in tables: - for r in range(len(table.rows)): - for c in range(len(table.cols)): - if table.cells[r][c].left: - plt.plot([table.cells[r][c].lb[0], - table.cells[r][c].lt[0]], - [table.cells[r][c].lb[1], - table.cells[r][c].lt[1]]) - if table.cells[r][c].right: - plt.plot([table.cells[r][c].rb[0], - table.cells[r][c].rt[0]], - [table.cells[r][c].rb[1], - table.cells[r][c].rt[1]]) - if table.cells[r][c].top: - plt.plot([table.cells[r][c].lt[0], - table.cells[r][c].rt[0]], - [table.cells[r][c].lt[1], - table.cells[r][c].rt[1]]) - if table.cells[r][c].bottom: - plt.plot([table.cells[r][c].lb[0], - table.cells[r][c].rb[0]], - [table.cells[r][c].lb[1], - table.cells[r][c].rb[1]]) - plt.show() - except AttributeError: - raise ValueError("This option can only be used with Lattice.") - else: - raise UserWarning("This method can only be called after" - " debug has been specified.") \ No newline at end of file diff --git a/camelot/plotting.py b/camelot/plotting.py new file mode 100644 index 0000000..2d0bb3c --- /dev/null +++ b/camelot/plotting.py @@ -0,0 +1,174 @@ +import cv2 +import matplotlib.pyplot as plt +import matplotlib.patches as patches + +from .handlers import PDFHandler + + +def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs): + """Plot geometry found on pdf page based on type specified, + useful for debugging and playing with different parameters to get + the best output. + + Note: kwargs annotated with ^ can only be used with mesh=False + and kwargs annotated with * can only be used with mesh=True. + + Parameters + ---------- + filepath : str + Path to pdf file. + pages : str + Comma-separated page numbers to parse. + Example: 1,3,4 or 1,4-end + mesh : bool (default: False) + Whether or not to use Lattice method of parsing. 
Stream + is used by default. + geometry_type : str, optional (default: 'text') + 'text' : Plot text objects found on page, useful to get + table_area and columns coordinates. + 'table' : Plot parsed table. + 'contour'* : Plot detected rectangles. + 'joint'* : Plot detected line intersections. + 'line'* : Plot detected lines. + table_area : list, optional (default: None) + List of table areas to analyze as strings of the form + x1,y1,x2,y2 where (x1, y1) -> left-top and + (x2, y2) -> right-bottom in pdf coordinate space. + columns^ : list, optional (default: None) + List of column x-coordinates as strings where the coordinates + are comma-separated. + split_text : bool, optional (default: False) + Whether or not to split a text line if it spans across + multiple cells. + flag_size : bool, optional (default: False) + Whether or not to highlight a substring using <s></s> + if its size is different from the rest of the string, useful for + super and subscripts. + row_close_tol^ : int, optional (default: 2) + Rows will be formed by combining text vertically + within this tolerance. + col_close_tol^ : int, optional (default: 0) + Columns will be formed by combining text horizontally + within this tolerance. + process_background* : bool, optional (default: False) + Whether or not to process lines that are in the background. + line_size_scaling* : int, optional (default: 15) + Factor by which the page dimensions will be divided to get + the smallest length of lines that should be detected. + + The larger this value, the smaller the detected lines. Making it + too large will lead to text being detected as lines. + copy_text* : list, optional (default: None) + {'h', 'v'} + Select one or more strings from above and pass them as a list + to specify the direction in which text should be copied over + when a cell spans multiple rows or columns. + shift_text* : list, optional (default: ['l', 't']) + {'l', 'r', 't', 'b'} + Select one or more strings from above and pass them as a list + to specify where the text in a spanning cell should flow. + line_close_tol* : int, optional (default: 2) + Tolerance parameter used to merge vertical and horizontal + detected lines which lie close to each other. + joint_close_tol* : int, optional (default: 2) + Tolerance parameter used to decide whether the detected lines + and points lie close to each other. + threshold_blocksize : int, optional (default: 15) + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + + For more information, refer `OpenCV's adaptiveThreshold `_. + threshold_constant : int, optional (default: -2) + Constant subtracted from the mean or weighted mean. + Normally, it is positive but may be zero or negative as well. + + For more information, refer `OpenCV's adaptiveThreshold `_. + iterations : int, optional (default: 0) + Number of times erosion/dilation is applied. + + For more information, refer `OpenCV's dilate `_. + margins : tuple, optional (default: (1.0, 0.5, 0.1)) + PDFMiner margins. (char_margin, line_margin, word_margin) + + For more information, refer `PDFMiner docs `_. + debug : bool, optional (default: False) + Whether or not to return all text objects on the page + which can be used to generate a matplotlib plot, to get + values for table_area(s) and debugging. + + """ + # validate kwargs?
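For reference, a hypothetical debugging session with this helper, using the signature defined above; 'foo.pdf' is a placeholder path.

```python
# Hypothetical debugging session with the new plotting helper.
from camelot.plotting import plot_geometry

# Plot text objects to pick table_area and columns for Stream.
plot_geometry('foo.pdf', pages='1', geometry_type='text')

# Plot detected line intersections to sanity-check Lattice.
plot_geometry('foo.pdf', pages='1', mesh=True, geometry_type='joint')
```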
+ p = PDFHandler(filepath, pages) + debug = True if geometry_type else False + kwargs.update({'debug': debug}) + __, geometry = p.parse(mesh=mesh, **kwargs) + + if geometry_type == 'text': + for text in geometry.text: + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + xs, ys = [], [] + for t in text: + xs.extend([t[0], t[1]]) + ys.extend([t[2], t[3]]) + ax.add_patch( + patches.Rectangle( + (t[0], t[1]), + t[2] - t[0], + t[3] - t[1] + ) + ) + ax.set_xlim(min(xs) - 10, max(xs) + 10) + ax.set_ylim(min(ys) - 10, max(ys) + 10) + plt.show() + elif geometry_type == 'table': + for tables in geometry.tables: + for table in tables: + for row in table.cells: + for cell in row: + if cell.left: + plt.plot([cell.lb[0], cell.lt[0]], + [cell.lb[1], cell.lt[1]]) + if cell.right: + plt.plot([cell.rb[0], cell.rt[0]], + [cell.rb[1], cell.rt[1]]) + if cell.top: + plt.plot([cell.lt[0], cell.rt[0]], + [cell.lt[1], cell.rt[1]]) + if cell.bottom: + plt.plot([cell.lb[0], cell.rb[0]], + [cell.lb[1], cell.rb[1]]) + plt.show() + elif geometry_type == 'contour': + if not mesh: + raise ValueError("Use mesh=True") + for img, table_bbox in geometry.images: + for t in table_bbox.keys(): + cv2.rectangle(img, (t[0], t[1]), + (t[2], t[3]), (255, 0, 0), 3) + plt.imshow(img) + plt.show() + elif geometry_type == 'joint': + if not mesh: + raise ValueError("Use mesh=True") + for img, table_bbox in geometry.images: + x_coord = [] + y_coord = [] + for k in table_bbox.keys(): + for coord in table_bbox[k]: + x_coord.append(coord[0]) + y_coord.append(coord[1]) + max_x, max_y = max(x_coord), max(y_coord) + plt.plot(x_coord, y_coord, 'ro') + plt.axis([0, max_x + 100, max_y + 100, 0]) + plt.imshow(img) + plt.show() + elif geometry_type == 'line': + if not mesh: + raise ValueError("Use mesh=True") + for v_s, h_s in geometry.segments: + for v in v_s: + plt.plot([v[0], v[2]], [v[1], v[3]]) + for h in h_s: + plt.plot([h[0], h[2]], [h[1], h[3]]) + plt.show() \ No newline at end of file diff --git a/camelot/stream.py b/camelot/stream.py deleted file mode 100644 index e794d6a..0000000 --- a/camelot/stream.py +++ /dev/null @@ -1,428 +0,0 @@ -from __future__ import division -import os -import copy -import types -import logging -import copy_reg -import warnings - -import numpy as np - -from .table import Table -from .utils import (text_in_bbox, get_table_index, get_score, count_empty, - encode_list, get_text_objects, get_page_layout) - - -__all__ = ['Stream'] -logger = logging.getLogger('app_logger') - - -def _reduce_method(m): - if m.im_self is None: - return getattr, (m.im_class, m.im_func.func_name) - else: - return getattr, (m.im_self, m.im_func.func_name) -copy_reg.pickle(types.MethodType, _reduce_method) - - -def _text_bbox(t_bbox): - """Returns bounding box for the text present on a page. - - Parameters - ---------- - t_bbox : dict - Dict with two keys 'horizontal' and 'vertical' with lists of - LTTextLineHorizontals and LTTextLineVerticals respectively. - - Returns - ------- - text_bbox : tuple - Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate - space. 
- """ - xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]]) - ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]]) - xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]]) - ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]]) - text_bbox = (xmin, ymin, xmax, ymax) - return text_bbox - - -def _group_rows(text, ytol=2): - """Groups PDFMiner text objects into rows using their - y-coordinates taking into account some tolerance ytol. - - Parameters - ---------- - text : list - List of PDFMiner text objects. - - ytol : int - Tolerance parameter. - (optional, default: 2) - - Returns - ------- - rows : list - Two-dimensional list of text objects grouped into rows. - """ - row_y = 0 - rows = [] - temp = [] - for t in text: - # is checking for upright necessary? - # if t.get_text().strip() and all([obj.upright for obj in t._objs if - # type(obj) is LTChar]): - if t.get_text().strip(): - if not np.isclose(row_y, t.y0, atol=ytol): - rows.append(sorted(temp, key=lambda t: t.x0)) - temp = [] - row_y = t.y0 - temp.append(t) - rows.append(sorted(temp, key=lambda t: t.x0)) - __ = rows.pop(0) # hacky - return rows - - -def _merge_columns(l, mtol=0): - """Merges column boundaries if they overlap or lie within some - tolerance mtol. - - Parameters - ---------- - l : list - List of column coordinate tuples. - - mtol : int - TODO - (optional, default: 0) - - Returns - ------- - merged : list - List of merged column coordinate tuples. - """ - merged = [] - for higher in l: - if not merged: - merged.append(higher) - else: - lower = merged[-1] - if mtol >= 0: - if (higher[0] <= lower[1] or - np.isclose(higher[0], lower[1], atol=mtol)): - upper_bound = max(lower[1], higher[1]) - lower_bound = min(lower[0], higher[0]) - merged[-1] = (lower_bound, upper_bound) - else: - merged.append(higher) - elif mtol < 0: - if higher[0] <= lower[1]: - if np.isclose(higher[0], lower[1], atol=abs(mtol)): - merged.append(higher) - else: - upper_bound = max(lower[1], higher[1]) - lower_bound = min(lower[0], higher[0]) - merged[-1] = (lower_bound, upper_bound) - else: - merged.append(higher) - return merged - - -def _join_rows(rows_grouped, text_y_max, text_y_min): - """Makes row coordinates continuous. - - Parameters - ---------- - rows_grouped : list - Two-dimensional list of text objects grouped into rows. - - text_y_max : int - - text_y_min : int - - Returns - ------- - rows : list - List of continuous row coordinate tuples. - """ - row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) - if len(r) > 0 else 0 for r in rows_grouped] - rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] - rows.insert(0, text_y_max) - rows.append(text_y_min) - rows = [(rows[i], rows[i + 1]) - for i in range(0, len(rows) - 1)] - return rows - - -def _join_columns(cols, text_x_min, text_x_max): - """Makes column coordinates continuous. - - Parameters - ---------- - cols : list - List of column coordinate tuples. - - text_x_min : int - - text_y_max : int - - Returns - ------- - cols : list - Updated list of column coordinate tuples. - """ - cols = sorted(cols) - cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] - cols.insert(0, text_x_min) - cols.append(text_x_max) - cols = [(cols[i], cols[i + 1]) - for i in range(0, len(cols) - 1)] - return cols - - -def _add_columns(cols, text, ytol): - """Adds columns to existing list by taking into account - the text that lies outside the current column coordinates. 
- - Parameters - ---------- - cols : list - List of column coordinate tuples. - - text : list - List of PDFMiner text objects. - - ytol : int - Tolerance parameter. - - Returns - ------- - cols : list - Updated list of column coordinate tuples. - """ - if text: - text = _group_rows(text, ytol=ytol) - elements = [len(r) for r in text] - new_cols = [(t.x0, t.x1) - for r in text if len(r) == max(elements) for t in r] - cols.extend(_merge_columns(sorted(new_cols))) - return cols - - -class Stream: - """Stream looks for spaces between text elements to form a table. - - If you want to give columns, ytol or mtol for each table - when specifying multiple table areas, make sure that their length - is equal to the length of table_area. Mapping between them is based - on index. - - If you don't want to specify columns for the some tables in a pdf - page having multiple tables, pass them as empty strings. - For example: ['', 'x1,x2,x3,x4', ''] - - Parameters - ---------- - table_area : list - List of strings of the form x1,y1,x2,y2 where - (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's - coordinate space, denoting table areas to analyze. - (optional, default: None) - - columns : list - List of strings where each string is comma-separated values of - x-coordinates in PDFMiner's coordinate space. - (optional, default: None) - - ytol : list - List of ints specifying the y-tolerance parameters. - (optional, default: [2]) - - mtol : list - List of ints specifying the m-tolerance parameters. - (optional, default: [0]) - - margins : tuple - PDFMiner margins. (char_margin, line_margin, word_margin) - (optional, default: (1.0, 0.5, 0.1)) - - split_text : bool - Whether or not to split a text line if it spans across - different cells. - (optional, default: False) - - flag_size : bool - Whether or not to highlight a substring using - if its size is different from rest of the string, useful for - super and subscripts. - (optional, default: True) - - debug : bool - Set to True to generate a matplotlib plot of - LTTextLineHorizontals in order to select table_area, columns. - (optional, default: False) - """ - def __init__(self, table_area=None, columns=None, ytol=[2], mtol=[0], - margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True, - debug=False): - - self.method = 'stream' - self.table_area = table_area - self.columns = columns - self.ytol = ytol - self.mtol = mtol - self.char_margin, self.line_margin, self.word_margin = margins - self.split_text = split_text - self.flag_size = flag_size - self.debug = debug - - def get_tables(self, pdfname): - """Expects a single page pdf as input with rotation corrected. - - Parameters - --------- - pdfname : string - Path to single page pdf file. 
- - Returns - ------- - page : dict - """ - layout, dim = get_page_layout(pdfname, char_margin=self.char_margin, - line_margin=self.line_margin, word_margin=self.word_margin) - lttextlh = get_text_objects(layout, ltype="lh") - lttextlv = get_text_objects(layout, ltype="lv") - ltchar = get_text_objects(layout, ltype="char") - width, height = dim - bname, __ = os.path.splitext(pdfname) - logger.info('Processing {0}.'.format(os.path.basename(bname))) - if not lttextlh: - warnings.warn("{0}: Page contains no text.".format( - os.path.basename(bname))) - return {os.path.basename(bname): None} - - if self.debug: - self.debug_text = [] - self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh]) - self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv]) - return None - - if self.table_area is not None: - if self.columns is not None: - if len(self.table_area) != len(self.columns): - raise ValueError("{0}: Length of table area and columns" - " should be equal.".format(os.path.basename(bname))) - - table_bbox = {} - for area in self.table_area: - x1, y1, x2, y2 = area.split(",") - x1 = float(x1) - y1 = float(y1) - x2 = float(x2) - y2 = float(y2) - table_bbox[(x1, y2, x2, y1)] = None - else: - table_bbox = {(0, 0, width, height): None} - - if len(self.ytol) == 1 and self.ytol[0] == 2: - ytolerance = copy.deepcopy(self.ytol) * len(table_bbox) - else: - ytolerance = copy.deepcopy(self.ytol) - - if len(self.mtol) == 1 and self.mtol[0] == 0: - mtolerance = copy.deepcopy(self.mtol) * len(table_bbox) - else: - mtolerance = copy.deepcopy(self.mtol) - - page = {} - tables = {} - # sort tables based on y-coord - for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)): - # select elements which lie within table_bbox - table_data = {} - t_bbox = {} - t_bbox['horizontal'] = text_in_bbox(k, lttextlh) - t_bbox['vertical'] = text_in_bbox(k, lttextlv) - char_bbox = text_in_bbox(k, ltchar) - table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar))) - for direction in t_bbox: - t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) - text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox) - rows_grouped = _group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no]) - rows = _join_rows(rows_grouped, text_y_max, text_y_min) - elements = [len(r) for r in rows_grouped] - - guess = False - if self.columns is not None and self.columns[table_no] != "": - # user has to input boundary columns too - # take (0, width) by default - # similar to else condition - # len can't be 1 - cols = self.columns[table_no].split(',') - cols = [float(c) for c in cols] - cols.insert(0, text_x_min) - cols.append(text_x_max) - cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] - else: - guess = True - ncols = max(set(elements), key=elements.count) - len_non_mode = len(filter(lambda x: x != ncols, elements)) - if ncols == 1: - # no tables detected - warnings.warn("{0}: Page contains no tables.".format( - os.path.basename(bname))) - cols = [(t.x0, t.x1) - for r in rows_grouped if len(r) == ncols for t in r] - cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no]) - inner_text = [] - for i in range(1, len(cols)): - left = cols[i - 1][1] - right = cols[i][0] - inner_text.extend([t for direction in t_bbox - for t in t_bbox[direction] - if t.x0 > left and t.x1 < right]) - outer_text = [t for direction in t_bbox - for t in t_bbox[direction] - if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] - inner_text.extend(outer_text) - cols = _add_columns(cols, inner_text, 
ytolerance[table_no]) - cols = _join_columns(cols, text_x_min, text_x_max) - - table = Table(cols, rows) - table = table.set_all_edges() - assignment_errors = [] - table_data['split_text'] = [] - table_data['superscript'] = [] - for direction in t_bbox: - for t in t_bbox[direction]: - indices, error = get_table_index( - table, t, direction, split_text=self.split_text, - flag_size=self.flag_size) - assignment_errors.append(error) - if len(indices) > 1: - table_data['split_text'].append(indices) - for r_idx, c_idx, text in indices: - if all(s in text for s in ['', '']): - table_data['superscript'].append((r_idx, c_idx, text)) - table.cells[r_idx][c_idx].add_text(text) - if guess: - score = get_score([[66, assignment_errors], [34, [len_non_mode / len(elements)]]]) - else: - score = get_score([[100, assignment_errors]]) - - table_data['score'] = score - ar = table.get_list() - ar = encode_list(ar) - table_data['data'] = ar - empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) - table_data['empty_p'] = empty_p - table_data['r_nempty_cells'] = r_nempty_cells - table_data['c_nempty_cells'] = c_nempty_cells - table_data['nrows'] = len(ar) - table_data['ncols'] = len(ar[0]) - tables['table-{0}'.format(table_no + 1)] = table_data - page[os.path.basename(bname)] = tables - - return page \ No newline at end of file diff --git a/camelot/table.py b/camelot/table.py deleted file mode 100644 index fc1a45e..0000000 --- a/camelot/table.py +++ /dev/null @@ -1,236 +0,0 @@ -import numpy as np - -from .cell import Cell - - -class Table: - """Table. - Defines a table object with coordinates relative to a left-bottom - origin, which is also PDFMiner's coordinate space. - - Parameters - ---------- - cols : list - List of tuples representing column x-coordinates in increasing - order. - - rows : list - List of tuples representing row y-coordinates in decreasing - order. - - Attributes - ---------- - cells : list - List of cell objects with row-major ordering. - - nocont_ : int - Number of lines that did not contribute to setting cell edges. - """ - - def __init__(self, cols, rows): - - self.cols = cols - self.rows = rows - self.cells = [[Cell(c[0], r[1], c[1], r[0]) - for c in cols] for r in rows] - self.nocont_ = 0 - self.image = None - - def set_all_edges(self): - """Sets all table edges to True. - """ - for r in range(len(self.rows)): - for c in range(len(self.cols)): - self.cells[r][c].left = True - self.cells[r][c].right = True - self.cells[r][c].top = True - self.cells[r][c].bottom = True - return self - - def set_border_edges(self): - """Sets table border edges to True. - """ - for r in range(len(self.rows)): - self.cells[r][0].left = True - self.cells[r][len(self.cols) - 1].right = True - for c in range(len(self.cols)): - self.cells[0][c].top = True - self.cells[len(self.rows) - 1][c].bottom = True - return self - - def set_edges(self, vertical, horizontal, jtol=2): - """Sets a cell's edges to True depending on whether they - overlap with lines found by imgproc. - - Parameters - ---------- - vertical : list - List of vertical lines detected by imgproc. Coordinates - scaled and translated to the PDFMiner's coordinate space. - - horizontal : list - List of horizontal lines detected by imgproc. Coordinates - scaled and translated to the PDFMiner's coordinate space. 
- """ - for v in vertical: - # find closest x coord - # iterate over y coords and find closest points - i = [i for i, t in enumerate(self.cols) - if np.isclose(v[0], t[0], atol=jtol)] - j = [j for j, t in enumerate(self.rows) - if np.isclose(v[3], t[0], atol=jtol)] - k = [k for k, t in enumerate(self.rows) - if np.isclose(v[1], t[0], atol=jtol)] - if not j: - self.nocont_ += 1 - continue - J = j[0] - if i == [0]: # only left edge - I = i[0] - if k: - K = k[0] - while J < K: - self.cells[J][I].left = True - J += 1 - else: - K = len(self.rows) - while J < K: - self.cells[J][I].left = True - J += 1 - elif i == []: # only right edge - I = len(self.cols) - 1 - if k: - K = k[0] - while J < K: - self.cells[J][I].right = True - J += 1 - else: - K = len(self.rows) - while J < K: - self.cells[J][I].right = True - J += 1 - else: # both left and right edges - I = i[0] - if k: - K = k[0] - while J < K: - self.cells[J][I].left = True - self.cells[J][I - 1].right = True - J += 1 - else: - K = len(self.rows) - while J < K: - self.cells[J][I].left = True - self.cells[J][I - 1].right = True - J += 1 - - for h in horizontal: - # find closest y coord - # iterate over x coords and find closest points - i = [i for i, t in enumerate(self.rows) - if np.isclose(h[1], t[0], atol=jtol)] - j = [j for j, t in enumerate(self.cols) - if np.isclose(h[0], t[0], atol=jtol)] - k = [k for k, t in enumerate(self.cols) - if np.isclose(h[2], t[0], atol=jtol)] - if not j: - self.nocont_ += 1 - continue - J = j[0] - if i == [0]: # only top edge - I = i[0] - if k: - K = k[0] - while J < K: - self.cells[I][J].top = True - J += 1 - else: - K = len(self.cols) - while J < K: - self.cells[I][J].top = True - J += 1 - elif i == []: # only bottom edge - I = len(self.rows) - 1 - if k: - K = k[0] - while J < K: - self.cells[I][J].bottom = True - J += 1 - else: - K = len(self.cols) - while J < K: - self.cells[I][J].bottom = True - J += 1 - else: # both top and bottom edges - I = i[0] - if k: - K = k[0] - while J < K: - self.cells[I][J].top = True - self.cells[I - 1][J].bottom = True - J += 1 - else: - K = len(self.cols) - while J < K: - self.cells[I][J].top = True - self.cells[I - 1][J].bottom = True - J += 1 - - return self - - def set_spanning(self): - """Sets a cell's spanning_h or spanning_v attribute to True - depending on whether the cell spans/extends horizontally or - vertically. 
- """ - for r in range(len(self.rows)): - for c in range(len(self.cols)): - bound = self.cells[r][c].get_bounded_edges() - if bound == 4: - continue - elif bound == 3: - if not self.cells[r][c].left: - if (self.cells[r][c].right and - self.cells[r][c].top and - self.cells[r][c].bottom): - self.cells[r][c].spanning_h = True - elif not self.cells[r][c].right: - if (self.cells[r][c].left and - self.cells[r][c].top and - self.cells[r][c].bottom): - self.cells[r][c].spanning_h = True - elif not self.cells[r][c].top: - if (self.cells[r][c].left and - self.cells[r][c].right and - self.cells[r][c].bottom): - self.cells[r][c].spanning_v = True - elif not self.cells[r][c].bottom: - if (self.cells[r][c].left and - self.cells[r][c].right and - self.cells[r][c].top): - self.cells[r][c].spanning_v = True - elif bound == 2: - if self.cells[r][c].left and self.cells[r][c].right: - if (not self.cells[r][c].top and - not self.cells[r][c].bottom): - self.cells[r][c].spanning_v = True - elif self.cells[r][c].top and self.cells[r][c].bottom: - if (not self.cells[r][c].left and - not self.cells[r][c].right): - self.cells[r][c].spanning_h = True - - return self - - def get_list(self): - """Returns a two-dimensional list of text assigned to each - cell. - - Returns - ------- - ar : list - """ - ar = [] - for r in range(len(self.rows)): - ar.append([self.cells[r][c].get_text().strip() - for c in range(len(self.cols))]) - return ar diff --git a/camelot/utils.py b/camelot/utils.py index 3640b37..3e87fe5 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -18,18 +18,47 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, LTTextLineVertical) +def setup_logging(name): + """Sets up a logger with StreamHandler. + + Parameters + ---------- + name : str + + Returns + ------- + logger : logging.Logger + + """ + logger = logging.getLogger(name) + + format_string = '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s' + formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S') + + handler = logging.StreamHandler() + handler.setLevel(logging.INFO) + handler.setFormatter(formatter) + + logger.addHandler(handler) + + return logger + + +logger = setup_logging(__name__) + + def translate(x1, x2): """Translates x2 by x1. Parameters ---------- x1 : float - x2 : float Returns ------- x2 : float + """ x2 += x1 return x2 @@ -41,12 +70,12 @@ def scale(x, s): Parameters ---------- x : float - s : float Returns ------- x : float + """ x *= s return x @@ -58,21 +87,17 @@ def rotate(x1, y1, x2, y2, angle): Parameters ---------- x1 : float - y1 : float - x2 : float - y2 : float - angle : float Angle in radians. Returns ------- xnew : float - ynew : float + """ s = np.sin(angle) c = np.cos(angle) @@ -85,17 +110,16 @@ def rotate(x1, y1, x2, y2, angle): return xnew, ynew -def scale_to_image(k, factors): - """Translates and scales PDFMiner coordinates to OpenCV's coordinate - space. +def scale_pdf(k, factors): + """Translates and scales pdf coordinate space to image + coordinate space. Parameters ---------- k : tuple Tuple (x1, y1, x2, y2) representing table bounding box where - (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner's coordinate + (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate space. 
- factors : tuple Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the first two elements are scaling factors and pdf_y is height of @@ -105,8 +129,9 @@ def scale_to_image(k, factors): ------- knew : tuple Tuple (x1, y1, x2, y2) representing table bounding box where - (x1, y1) -> lt and (x2, y2) -> rb in OpenCV's coordinate + (x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate space. + """ x1, y1, x2, y2 = k scaling_factor_x, scaling_factor_y, pdf_y = factors @@ -118,22 +143,19 @@ def scale_to_image(k, factors): return knew -def scale_to_pdf(tables, v_segments, h_segments, factors): - """Translates and scales OpenCV coordinates to PDFMiner's coordinate - space. +def scale_image(tables, v_segments, h_segments, factors): + """Translates and scales image coordinate space to pdf + coordinate space. Parameters ---------- tables : dict Dict with table boundaries as keys and list of intersections - in that boundary as their value. - + in that boundary as value. v_segments : list List of vertical line segments. - h_segments : list List of horizontal line segments. - factors : tuple Tuple (scaling_factor_x, scaling_factor_y, img_y) where the first two elements are scaling factors and img_y is height of @@ -142,10 +164,9 @@ def scale_to_pdf(tables, v_segments, h_segments, factors): Returns ------- tables_new : dict - v_segments_new : dict - h_segments_new : dict + """ scaling_factor_x, scaling_factor_y, img_y = factors tables_new = {} @@ -178,54 +199,26 @@ def scale_to_pdf(tables, v_segments, h_segments, factors): return tables_new, v_segments_new, h_segments_new -def setup_logging(log_filepath): - """Setup logging - Args: - log_filepath (string): Path to log file - Returns: - logging.Logger: Logger object - """ - logger = logging.getLogger("app_logger") - logger.setLevel(logging.DEBUG) - # Log File Handler (Associating one log file per webservice run) - log_file_handler = logging.FileHandler(log_filepath, - mode='a', - encoding='utf-8') - log_file_handler.setLevel(logging.DEBUG) - format_string = '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s' - formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S') - log_file_handler.setFormatter(formatter) - logger.addHandler(log_file_handler) - # Stream Log Handler (For console) - stream_log_handler = logging.StreamHandler() - stream_log_handler.setLevel(logging.INFO) - formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S') - stream_log_handler.setFormatter(formatter) - logger.addHandler(stream_log_handler) - return logger - - def get_rotation(lttextlh, lttextlv, ltchar): - """Detects if text in table is vertical or not using the current + """Detects if text in table is rotated or not using the current transformation matrix (CTM) and returns its orientation. Parameters ---------- lttextlh : list List of PDFMiner LTTextLineHorizontal objects. - lttextlv : list List of PDFMiner LTTextLineVertical objects. - ltchar : list List of PDFMiner LTChar objects. Returns ------- rotation : string - {'', 'left', 'right'} - '' if text in table is upright, 'left' if rotated 90 degree - anti-clockwise and 'right' if rotated 90 degree clockwise. + '' if text in table is upright, 'anticlockwise' if + rotated 90 degrees anticlockwise and 'clockwise' if + rotated 90 degrees clockwise.
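The orientation check boils down to the sign pattern of the rotation part of pdfminer's (a, b, c, d, e, f) matrix, where rotating by theta gives (cos theta, sin theta, -sin theta, cos theta, ...). A small sketch of that sign test, run on an idealized CTM rather than real LTChar objects:

    import math

    def rotation_of(matrix):
        # matrix[1] is sin(theta), matrix[2] is -sin(theta)
        if matrix[1] > 0 and matrix[2] < 0:
            return 'anticlockwise'
        if matrix[1] < 0 and matrix[2] > 0:
            return 'clockwise'
        return ''

    theta = math.pi / 2  # 90 degrees anticlockwise
    print(rotation_of((math.cos(theta), math.sin(theta),
                       -math.sin(theta), math.cos(theta), 0, 0)))  # anticlockwise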
+ """ rotation = '' hlen = len([t for t in lttextlh if t.get_text().strip()]) @@ -233,23 +226,21 @@ def get_rotation(lttextlh, lttextlv, ltchar): if hlen < vlen: clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar) - rotation = 'left' if clockwise < anticlockwise else 'right' + rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise' return rotation -def segments_bbox(bbox, v_segments, h_segments): - """Returns all line segments present inside a - table's bounding box. +def segments_in_bbox(bbox, v_segments, h_segments): + """Returns all line segments present inside a bounding box. Parameters ---------- bbox : tuple - Tuple (x1, y1, x2, y2) representing table bounding box where - (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space. - + Tuple (x1, y1, x2, y2) representing a bounding box where + (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate + space. v_segments : list List of vertical line segments. - h_segments : list List of vertical horizontal segments. @@ -257,9 +248,9 @@ def segments_bbox(bbox, v_segments, h_segments): ------- v_s : list List of vertical line segments that lie inside table. - h_s : list List of horizontal line segments that lie inside table. + """ lb = (bbox[0], bbox[1]) rt = (bbox[2], bbox[3]) @@ -271,45 +262,43 @@ def segments_bbox(bbox, v_segments, h_segments): def text_in_bbox(bbox, text): - """Returns all text objects present inside a - table's bounding box. + """Returns all text objects present inside a bounding box. Parameters ---------- bbox : tuple - Tuple (x1, y1, x2, y2) representing table bounding box where - (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space. - - text : list - List of PDFMiner text objects. + Tuple (x1, y1, x2, y2) representing a bounding box where + (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate + space. + text : List of PDFMiner text objects. Returns ------- t_bbox : list List of PDFMiner text objects that lie inside table. + """ lb = (bbox[0], bbox[1]) rt = (bbox[2], bbox[3]) t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 - <= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 - <= rt[1] + 2] + <= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 + <= rt[1] + 2] return t_bbox -def remove_close_values(ar, mtol=2): - """Removes values which are within a tolerance of mtol of another value - present in list. +def remove_close_lines(ar, line_close_tol=2): + """Removes lines which are within a tolerance, based on their x or + y axis projections. Parameters ---------- ar : list - - mtol : int - (optional, default: 2) + line_close_tol : int, optional (default: 2) Returns ------- ret : list + """ ret = [] for a in ar: @@ -317,27 +306,26 @@ def remove_close_values(ar, mtol=2): ret.append(a) else: temp = ret[-1] - if np.isclose(temp, a, atol=mtol): + if np.isclose(temp, a, atol=line_close_tol): pass else: ret.append(a) return ret -def merge_close_values(ar, mtol=2): - """Merges values which are within a tolerance of mtol by calculating - a moving mean. +def merge_close_lines(ar, line_close_tol=2): + """Merges lines which are within a tolerance by calculating a + moving mean, based on their x or y axis projections. 
Parameters ---------- ar : list - - mtol : int - (optional, default: 2) + line_close_tol : int, optional (default: 2) Returns ------- ret : list + """ ret = [] for a in ar: @@ -345,7 +333,7 @@ ret.append(a) else: temp = ret[-1] - if np.isclose(temp, a, atol=mtol): + if np.isclose(temp, a, atol=line_close_tol): temp = (temp + a) / 2.0 ret[-1] = temp else: @@ -353,22 +341,21 @@ return ret -def flag_on_size(textline, direction): - """Flags a super/subscript by enclosing it with <s></s>. May give - false positives. +def flag_font_size(textline, direction): + """Flags super/subscripts in text by enclosing them with <s></s>. + May give false positives. Parameters ---------- textline : list List of PDFMiner LTChar objects. - direction : string - {'horizontal', 'vertical'} Direction of the PDFMiner LTTextLine object. Returns ------- fstring : string + """ if direction == 'horizontal': d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)] @@ -395,33 +382,28 @@ return fstring -def split_textline(table, textline, direction, flag_size=True): +def split_textline(table, textline, direction, flag_size=False): """Splits PDFMiner LTTextLine into substrings if it spans across multiple rows/columns. Parameters ---------- - table : object - camelot.pdf.Pdf - + table : camelot.core.Table textline : object PDFMiner LTTextLine object. - direction : string - {'horizontal', 'vertical'} Direction of the PDFMiner LTTextLine object. - - flag_size : bool + flag_size : bool, optional (default: False) Whether or not to highlight a substring using <s></s> if its size is different from rest of the string, useful for super and subscripts. - (optional, default: True) Returns ------- grouped_chars : list List of tuples of the form (idx, text) where idx is the index of row/column and text is an lttextline substring. + """ idx = 0 cut_text = [] @@ -466,46 +448,37 @@ grouped_chars = [] for key, chars in groupby(cut_text, itemgetter(0, 1)): if flag_size: - grouped_chars.append((key[0], key[1], flag_on_size([t[2] for t in chars], direction))) + grouped_chars.append((key[0], key[1], flag_font_size([t[2] for t in chars], direction))) else: gchars = [t[2].get_text() for t in chars] grouped_chars.append((key[0], key[1], ''.join(gchars).strip('\n'))) return grouped_chars -def get_table_index(table, t, direction, split_text=False, flag_size=True): - """Gets indices of the cell where given text object lies by +def get_table_index(table, t, direction, split_text=False, flag_size=False): + """Gets indices of the table cell where given text object lies by comparing their y and x-coordinates. Parameters ---------- - table : object - camelot.table.Table - + table : camelot.core.Table t : object PDFMiner LTTextLine object. - direction : string - {'horizontal', 'vertical'} Direction of the PDFMiner LTTextLine object. - - split_text : bool + split_text : bool, optional (default: False) Whether or not to split a text line if it spans across multiple cells. - (optional, default: False) - - flag_size : bool + flag_size : bool, optional (default: False) Whether or not to highlight a substring using <s></s> if its size is different from rest of the string, useful for super and subscripts.
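The flagging is easiest to see on a toy line. A simplified, self-contained sketch of the idea behind flag_font_size (the real input is a list of LTChar objects; the (text, size) pairs here are hypothetical):

    from itertools import groupby

    def flag_sketch(chars):
        # chars: list of (text, size) pairs; wrap runs whose size
        # differs from the most common size in <s></s>
        sizes = [size for _, size in chars]
        common = max(set(sizes), key=sizes.count)
        out = []
        for size, group in groupby(chars, key=lambda c: c[1]):
            text = ''.join(t for t, _ in group)
            out.append(text if size == common else '<s>' + text + '</s>')
        return ''.join(out)

    print(flag_sketch([('C', 9.0), ('O', 9.0), ('2', 6.0)]))  # CO<s>2</s>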
- (optional, default: True) Returns ------- indices : list - List of tuples of the form (idx, text) where idx is the index - of row/column and text is the an lttextline substring. - + List of tuples of the form (r_idx, c_idx, text) where r_idx + and c_idx are row and column indices. error : float Assignment error, percentage of text area that lies outside a cell. @@ -514,6 +487,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True): | [Text bounding box] | | +-------+ + """ r_idx, c_idx = [-1] * 2 for r in range(len(table.rows)): @@ -528,7 +502,11 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True): else: lt_col_overlap.append(-1) if len(filter(lambda x: x != -1, lt_col_overlap)) == 0: - logging.warning("Text did not fit any column.") + text = t.get_text().strip('\n') + text_range = (t.x0, t.x1) + col_range = (table.cols[0][0], table.cols[-1][1]) + logger.info("{} {} does not lie in column range {}".format( + text, text_range, col_range)) r_idx = r c_idx = lt_col_overlap.index(max(lt_col_overlap)) break @@ -552,14 +530,14 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True): return split_textline(table, t, direction, flag_size=flag_size), error else: if flag_size: - return [(r_idx, c_idx, flag_on_size(t._objs, direction))], error + return [(r_idx, c_idx, flag_font_size(t._objs, direction))], error else: return [(r_idx, c_idx, t.get_text().strip('\n'))], error -def get_score(error_weights): - """Calculates score based on weights assigned to various parameters, - and their error percentages. +def compute_accuracy(error_weights): + """Calculates a score based on weights assigned to various + parameters and their error percentages. Parameters ---------- @@ -571,6 +549,7 @@ def get_score(error_weights): Returns ------- score : float + """ SCORE_VAL = 100 try: @@ -586,6 +565,30 @@ def get_score(error_weights): return score +def compute_whitespace(d): + """Calculates the percentage of empty strings in a + two-dimensional list. + + Parameters + ---------- + d : list + + Returns + ------- + whitespace : float + Percentage of empty cells. + + """ + whitespace = 0 + r_nempty_cells, c_nempty_cells = [], [] + for i in d: + for j in i: + if j.strip() == '': + whitespace += 1 + whitespace = 100 * (whitespace / float(len(d) * len(d[0]))) + return whitespace + + def remove_empty(d): """Removes empty rows and columns from a two-dimensional list. @@ -596,6 +599,7 @@ def remove_empty(d): Returns ------- d : list + """ for i, row in enumerate(d): if row == [''] * len(row): @@ -606,50 +610,8 @@ def remove_empty(d): return d -def count_empty(d): - """Counts empty rows and columns in a two-dimensional list. - - Parameters - ---------- - d : list - - Returns - ------- - n_empty_rows : list - Number of empty rows. - - n_empty_cols : list - Number of empty columns. - - empty_p : float - Percentage of empty cells. - """ - empty_p = 0 - r_nempty_cells, c_nempty_cells = [], [] - for i in d: - for j in i: - if j.strip() == '': - empty_p += 1 - empty_p = 100 * (empty_p / float(len(d) * len(d[0]))) - for row in d: - r_nempty_c = 0 - for r in row: - if r.strip() != '': - r_nempty_c += 1 - r_nempty_cells.append(r_nempty_c) - d = zip(*d) - d = [list(col) for col in d] - for col in d: - c_nempty_c = 0 - for c in col: - if c.strip() != '': - c_nempty_c += 1 - c_nempty_cells.append(c_nempty_c) - return empty_p, r_nempty_cells, c_nempty_cells - - -def encode_list(ar): - """Encodes list of text. 
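compute_accuracy and compute_whitespace above feed the parsing report: the former folds weighted error percentages into a single score, the latter is a plain ratio of empty cells. A doctest-style sketch of the latter on a hypothetical 2x2 grid with three empty cells:

    >>> compute_whitespace([['foo', ''], ['', '']])
    75.0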
+def encode_(ar): + """Encodes two-dimensional list into unicode. Parameters ---------- @@ -658,52 +620,13 @@ def encode_list(ar): Returns ------- ar : list + """ ar = [[r.encode('utf-8') for r in row] for row in ar] return ar -def get_text_objects(layout, ltype="char", t=None): - """Recursively parses pdf layout to get a list of - text objects. - - Parameters - ---------- - layout : object - PDFMiner LTPage object. - - ltype : string - {'char', 'lh', 'lv'} - Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal, - and LTTextLineVertical objects respectively. - - t : list - - Returns - ------- - t : list - List of PDFMiner text objects. - """ - if ltype == "char": - LTObject = LTChar - elif ltype == "lh": - LTObject = LTTextLineHorizontal - elif ltype == "lv": - LTObject = LTTextLineVertical - if t is None: - t = [] - try: - for obj in layout._objs: - if isinstance(obj, LTObject): - t.append(obj) - else: - t += get_text_objects(obj, ltype=ltype) - except AttributeError: - pass - return t - - -def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1, +def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1, detect_vertical=True, all_texts=True): """Returns a PDFMiner LTPage object and page dimension of a single page pdf. See https://euske.github.io/pdfminer/ to get definitions @@ -711,28 +634,23 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1, Parameters ---------- - pname : string + filename : string Path to pdf file. - char_margin : float - line_margin : float - word_margin : float - detect_vertical : bool - all_texts : bool Returns ------- layout : object PDFMiner LTPage object. - dim : tuple - pdf page dimension of the form (width, height). + Dimension of pdf page in the form (width, height). + """ - with open(pname, 'r') as f: + with open(filename, 'r') as f: parser = PDFParser(f) document = PDFDocument(parser) if not document.is_extractable: @@ -754,16 +672,56 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1, return layout, dim +def get_text_objects(layout, ltype="char", t=None): + """Recursively parses pdf layout to get a list of + PDFMiner text objects. + + Parameters + ---------- + layout : object + PDFMiner LTPage object. + ltype : string + Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal, + and LTTextLineVertical objects respectively. + t : list + + Returns + ------- + t : list + List of PDFMiner text objects. + + """ + if ltype == "char": + LTObject = LTChar + elif ltype == "lh": + LTObject = LTTextLineHorizontal + elif ltype == "lv": + LTObject = LTTextLineVertical + if t is None: + t = [] + try: + for obj in layout._objs: + if isinstance(obj, LTObject): + t.append(obj) + else: + t += get_text_objects(obj, ltype=ltype) + except AttributeError: + pass + return t + + def merge_tuples(tuples): """Merges a list of overlapping tuples. Parameters ---------- tuples : list + List of tuples where a tuple is a single axis coordinate pair. + + Yields + ------ + tuple - Returns - ------- - merged : list """ merged = list(tuples[0]) for s, e in tuples: diff --git a/debug/hough_opencv.py b/debug/hough_opencv.py deleted file mode 100644 index 79140f8..0000000 --- a/debug/hough_opencv.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -usage: python hough_opencv.py file.png - -finds lines present in an image using opencv's hough transform. 
-""" - -import sys -import time - -import cv2 -import numpy as np -import matplotlib.pyplot as plt - - -def timeit(func): - def timed(*args, **kw): - start = time.time() - result = func(*args, **kw) - end = time.time() - print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) - return result - return timed - - -@timeit -def main(): - image = cv2.imread(sys.argv[1]) - print "image dimensions -> {0}".format(image.shape) - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - edges = cv2.Canny(gray, 50, 150, apertureSize=3) - - lines = cv2.HoughLines(edges, 1, np.pi / 180, 200) - print "found {0} lines".format(len(lines)) - for line in lines: - r, theta = line[0] - # filter horizontal and vertical lines - if theta == 0 or np.isclose(theta, np.pi / 2): - x0 = r * np.cos(theta) - y0 = r * np.sin(theta) - x1 = int(x0 + 10000 * (-np.sin(theta))) - y1 = int(y0 + 10000 * (np.cos(theta))) - x2 = int(x0 - 10000 * (-np.sin(theta))) - y2 = int(y0 - 10000 * (np.cos(theta))) - cv2.line(image, (x1, y1), (x2, y2), (0, 0, 255), 5) - plt.imshow(image) - plt.show() - - -if __name__ == '__main__': - if len(sys.argv) == 1: - print __doc__ - else: - main() \ No newline at end of file diff --git a/debug/hough_skimage.py b/debug/hough_skimage.py deleted file mode 100644 index 93012fc..0000000 --- a/debug/hough_skimage.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -usage: python hough_skimage.py file.png - -finds lines present in an image using scikit-image's hough transform. -""" - -import sys -import time - -import cv2 -import numpy as np -from scipy.misc import imread -import matplotlib.pyplot as plt -from skimage.transform import hough_line, hough_line_peaks - - -def timeit(func): - def timed(*args, **kw): - start = time.time() - result = func(*args, **kw) - end = time.time() - print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) - return result - return timed - - -@timeit -def main(): - image = cv2.imread(sys.argv[1]) - print "image dimensions -> {0}".format(image.shape) - ret, binary = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY) - binary = np.min(binary, axis=2) - binary = np.where(binary == 255, 0, 255) - rows, cols = binary.shape - pixel = np.zeros(binary.shape) - - fig, ax = plt.subplots(1, 1, figsize=(8,4)) - ax.imshow(image, cmap=plt.cm.gray) - - theta_in = np.linspace(0, np.pi / 2, 10) - h, theta, d = hough_line(binary, theta_in) - for _, angle, dist in zip(*hough_line_peaks(h, theta, d)): - x0 = dist * np.cos(angle) - y0 = dist * np.sin(angle) - x1 = int(x0 + 1000 * (-np.sin(angle))) - y1 = int(y0 + 1000 * (np.cos(angle))) - x2 = int(x0 - 1000 * (-np.sin(angle))) - y2 = int(y0 - 1000 * (np.cos(angle))) - ax.plot((x1, x2), (y1, y2), '-r') - a = np.cos(angle) - b = np.sin(angle) - x = np.arange(binary.shape[1]) - y = np.arange(binary.shape[0]) - x = a * x - y = b * y - R = np.round(np.add(y.reshape((binary.shape[0], 1)), x.reshape((1, binary.shape[1])))) - pixel += np.isclose(R, np.round(dist)) - - pixel = np.clip(pixel, 0, 1) - pixel = np.where(pixel == 1, 0, 1) - binary = np.where(binary == 0, 255, 0) - binary *= pixel.astype(np.int64) - ax.imshow(binary, cmap=plt.cm.gray) - ax.axis((0, cols, rows, 0)) - ax.set_title('Detected lines') - ax.set_axis_off() - ax.set_adjustable('box-forced') - plt.show() - - -if __name__ == '__main__': - if len(sys.argv) == 1: - print __doc__ - else: - main() \ No newline at end of file diff --git a/debug/houghp_skimage.py b/debug/houghp_skimage.py deleted file mode 100644 index c7b9aec..0000000 --- a/debug/houghp_skimage.py +++ /dev/null @@ 
-1,49 +0,0 @@ -""" -usage: python hough_prob.py file.png - -finds lines present in an image using scikit-image's hough transform. -""" - -import sys -import time - -from scipy.misc import imread -import matplotlib.pyplot as plt -from skimage.feature import canny -from skimage.transform import probabilistic_hough_line - - -def timeit(func): - def timed(*args, **kw): - start = time.time() - result = func(*args, **kw) - end = time.time() - print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) - return result - return timed - - -@timeit -def main(): - image = imread(sys.argv[1], mode='L') - edges = canny(image, 2, 1, 25) - lines = probabilistic_hough_line(edges, threshold=1000) - - fig, ax = plt.subplots(1, 1, figsize=(8,4), sharex=True, sharey=True) - ax.imshow(edges * 0) - - for line in lines: - p0, p1 = line - ax.plot((p0[0], p1[0]), (p0[1], p1[1])) - - ax.set_title('Probabilistic Hough') - ax.set_axis_off() - ax.set_adjustable('box-forced') - plt.show() - - -if __name__ == '__main__': - if len(sys.argv) == 1: - print __doc__ - else: - main() \ No newline at end of file diff --git a/debug/morph_transform.py b/debug/morph_transform.py deleted file mode 100644 index cd6a6b9..0000000 --- a/debug/morph_transform.py +++ /dev/null @@ -1,114 +0,0 @@ -""" -usage: python morph_transform.py file.png scale={int} invert={bool} - -finds lines present in an image using opencv's morph transform. -""" - -import sys -import time - -import cv2 -import numpy as np -import matplotlib.pyplot as plt - - -def timeit(func): - def timed(*args, **kw): - start = time.time() - result = func(*args, **kw) - end = time.time() - print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) - return result - return timed - - -def mt(imagename, scale=40, invert=False): - img = cv2.imread(imagename) - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - if invert: - threshold = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2) - else: - threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2) - vertical = threshold - horizontal = threshold - - verticalsize = vertical.shape[0] / scale - horizontalsize = horizontal.shape[1] / scale - - ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) - hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1)) - - vertical = cv2.erode(vertical, ver, (-1, -1)) - vertical = cv2.dilate(vertical, ver, (-1, -1)) - - horizontal = cv2.erode(horizontal, hor, (-1, -1)) - horizontal = cv2.dilate(horizontal, hor, (-1, -1)) - - mask = vertical + horizontal - joints = np.bitwise_and(vertical, horizontal) - contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] - - tables = {} - for c in contours: - x, y, w, h = cv2.boundingRect(c) - x1, x2 = x, x + w - y1, y2 = y, y + h - # find number of non-zero values in joints using what boundingRect returns - roi = joints[y:y+h, x:x+w] - jc, _ = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) - if len(jc) <= 4: # remove contours with less than <=4 joints - continue - joint_coords = [] - for j in jc: - jx, jy, jw, jh = cv2.boundingRect(j) - c1, c2 = x + (2*jx + jw) / 2, y + (2*jy + jh) / 2 - joint_coords.append((c1, c2)) - tables[(x1, y2, x2, y1)] = joint_coords - - vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - for vc in vcontours: - x, y, w, h = cv2.boundingRect(vc) - 
x1, x2 = x, x + w - y1, y2 = y, y + h - plt.plot([(x1 + x2) / 2, (x1 + x2) / 2], [y2, y1]) - - hcontours, _ = cv2.findContours(horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - for hc in hcontours: - x, y, w, h = cv2.boundingRect(hc) - x1, x2 = x, x + w - y1, y2 = y, y + h - plt.plot([x1, x2], [(y1 + y2) / 2, (y1 + y2) / 2]) - - x_coord = [] - y_coord = [] - for k in tables.keys(): - for coord in tables[k]: - x_coord.append(coord[0]) - y_coord.append(coord[1]) - plt.plot(x_coord, y_coord, 'ro') - - plt.imshow(img) - plt.show() - return tables - - -@timeit -def main(): - try: - scale = int(sys.argv[2].split('=')[1]) - except IndexError: - scale = 40 - try: - invert = bool(sys.argv[3].split('=')[1]) - except IndexError: - invert = False - t = mt(sys.argv[1], scale=scale, invert=invert) - print 'tables found: ', len(t.keys()) - - -if __name__ == '__main__': - if len(sys.argv) == 1: - print __doc__ - else: - main() diff --git a/debug/plot_geo.py b/debug/plot_geo.py deleted file mode 100644 index 3f7aaf9..0000000 --- a/debug/plot_geo.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -usage: python plot_geo.py file.pdf - python plot_geo.py file.pdf file.png - -prints lines and rectangles present in a pdf file. -""" - -import sys -import time - -import cv2 -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.patches as patches -from pdfminer.pdfpage import PDFPage -from pdfminer.pdfdevice import PDFDevice -from pdfminer.pdfparser import PDFParser -from pdfminer.pdfdocument import PDFDocument -from pdfminer.converter import PDFPageAggregator -from pdfminer.pdfinterp import PDFResourceManager -from pdfminer.pdfinterp import PDFPageInterpreter -from pdfminer.layout import LAParams, LTLine, LTRect -from pdfminer.pdfpage import PDFTextExtractionNotAllowed - - -MIN_LENGTH = 1 -pdf_x, pdf_y, image_x, image_y = [0] * 4 - - -def timeit(func): - def timed(*args, **kw): - start = time.time() - result = func(*args, **kw) - end = time.time() - print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) - return result - return timed - - -def remove_coords(coords): - merged = [] - for coord in coords: - if not merged: - merged.append(coord) - else: - last = merged[-1] - if np.isclose(last, coord, atol=2): - pass - else: - merged.append(coord) - return merged - - -def parse_layout(pdfname): - global pdf_x, pdf_y - def is_horizontal(line): - if line[0] == line[2]: - return True - return False - - def is_vertical(line): - if line[1] == line[3]: - return True - return False - - vertical, horizontal = [], [] - with open(pdfname, 'rb') as f: - parser = PDFParser(f) - document = PDFDocument(parser) - if not document.is_extractable: - raise PDFTextExtractionNotAllowed - laparams = LAParams() - rsrcmgr = PDFResourceManager() - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - for page in PDFPage.create_pages(document): - interpreter.process_page(page) - layout = device.get_result() - pdf_x, pdf_y = layout.bbox[2], layout.bbox[3] - for obj in layout._objs: - if isinstance(obj, LTLine): - line = (obj.x0, obj.y0, obj.x1, obj.y1) - if is_vertical(line): - vertical.append(line) - elif is_horizontal(line): - horizontal.append(line) - elif isinstance(obj, LTRect): - vertical.append((obj.x0, obj.y1, obj.x0, obj.y0)) - vertical.append((obj.x1, obj.y1, obj.x1, obj.y0)) - horizontal.append((obj.x0, obj.y1, obj.x1, obj.y1)) - horizontal.append((obj.x0, obj.y0, obj.x1, obj.y0)) - return vertical, horizontal - - -def hough_transform(imagename): - 
global pdf_x, pdf_y, image_x, image_y - img = cv2.imread(imagename) - image_x, image_y = img.shape[1], img.shape[0] - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - edges = cv2.Canny(gray, 50, 150, apertureSize=3) - lines = cv2.HoughLines(edges, 1, np.pi/180, 1000) - x = [] - for line in lines: - r, theta = line[0] - x0 = r * np.cos(theta) - x0 *= pdf_x / float(image_x) - x.append(x0) - y = [] - for line in lines: - r, theta = line[0] - y0 = r * np.sin(theta) - y0 = abs(y0 - image_y) - y0 *= pdf_y / float(image_y) - y.append(y0) - x = remove_coords(sorted(set([x0 for x0 in x if x0 > 0]))) - y = remove_coords(sorted(set(y), reverse=True)) - return x, y - - -def plot_lines1(vertical, horizontal): - fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') - ax.set_xlim(0, 1000) - ax.set_ylim(0, 1000) - - vertical = filter(lambda x: abs(x[1] - x[3]) > MIN_LENGTH, vertical) - horizontal = filter(lambda x: abs(x[0] - x[2]) > MIN_LENGTH, horizontal) - for v in vertical: - ax.plot([v[0], v[2]], [v[1], v[3]]) - for h in horizontal: - ax.plot([h[0], h[2]], [h[1], h[3]]) - plt.show() - - -def plot_lines2(imagename, vertical, horizontal): - x, y = hough_transform(imagename) - fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') - ax.set_xlim(0, 1000) - ax.set_ylim(0, 1000) - - for x0 in x: - for v in vertical: - if np.isclose(x0, v[0], atol=2): - ax.plot([v[0], v[2]], [v[1], v[3]]) - for y0 in y: - for h in horizontal: - if np.isclose(y0, h[1], atol=2): - ax.plot([h[0], h[2]], [h[1], h[3]]) - plt.show() - - -@timeit -def main(): - vertical, horizontal = parse_layout(sys.argv[1]) - if len(sys.argv) == 2: - plot_lines1(vertical, horizontal) - elif len(sys.argv) == 3: - plot_lines1(vertical, horizontal) - plot_lines2(sys.argv[2], vertical, horizontal) - - -if __name__ == '__main__': - if len(sys.argv) == 1: - print __doc__ - else: - main() \ No newline at end of file diff --git a/debug/plot_intensity.py b/debug/plot_intensity.py deleted file mode 100644 index 87c386b..0000000 --- a/debug/plot_intensity.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -usage: python plot_intensity.py file.png threshold - -plots sum of pixel intensities on both axes for an image. 
-""" -import sys -import time -from itertools import groupby -from operator import itemgetter - -import cv2 -import numpy as np -import matplotlib.pyplot as plt -from pylab import barh - - -def timeit(func): - def timed(*args, **kw): - start = time.time() - result = func(*args, **kw) - end = time.time() - print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) - return result - return timed - - -def plot_barchart(ar): - n = len(ar) - ind = np.arange(n) - width = 0.35 - plt.bar(ind, ar, width, color='r', zorder=1) - plt.show() - - -def merge_lines(lines): - ranges = [] - for k, g in groupby(enumerate(lines), lambda (i, x): i-x): - group = map(itemgetter(1), g) - ranges.append((group[0], group[-1])) - merged = [] - for r in ranges: - merged.append((r[0] + r[1]) / 2) - return merged - - -def plot_lines(image, lines): - for y in lines: - plt.plot([0, image.shape[1]], [y, y]) - plt.imshow(image) - plt.show() - - -@timeit -def main(): - image = cv2.imread(sys.argv[1]) - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2) - y_proj = np.sum(threshold, axis=1) - line_threshold = int(sys.argv[2]) - lines = np.where(y_proj < line_threshold)[0] - lines = merge_lines(lines) - plot_lines(image, lines) - - -if __name__ == '__main__': - if len(sys.argv) == 1: - print __doc__ - else: - main() diff --git a/debug/print_text.py b/debug/print_text.py deleted file mode 100644 index 1ab83d2..0000000 --- a/debug/print_text.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -usage: python print_text.py file.pdf - -prints horizontal and vertical text lines present in a pdf file. -""" - -import sys -import time -from pprint import pprint - -from pdfminer.layout import LAParams -from pdfminer.pdfpage import PDFPage -from pdfminer.pdfdevice import PDFDevice -from pdfminer.pdfparser import PDFParser -from pdfminer.pdfdocument import PDFDocument -from pdfminer.converter import PDFPageAggregator -from pdfminer.pdfinterp import PDFPageInterpreter -from pdfminer.pdfinterp import PDFResourceManager -from pdfminer.pdfpage import PDFTextExtractionNotAllowed -from pdfminer.layout import (LAParams, LTChar, LTAnno, LTTextBoxHorizontal, - LTTextLineHorizontal, LTTextLineVertical, LTLine) - - -def timeit(func): - def timed(*args, **kw): - start = time.time() - result = func(*args, **kw) - end = time.time() - print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) - return result - return timed - - -def extract_text_objects(layout, LTObject, t=None): - if t is None: - t = [] - try: - for obj in layout._objs: - if isinstance(obj, LTObject): - t.append(obj) - else: - t += extract_text_objects(obj, LTObject) - except AttributeError: - pass - return t - - -@timeit -def main(): - with open(sys.argv[1], 'rb') as f: - parser = PDFParser(f) - document = PDFDocument(parser) - if not document.is_extractable: - raise PDFTextExtractionNotAllowed - # 2.0, 0.5, 0.1 - kwargs = { - 'char_margin': 1.0, - 'line_margin': 0.5, - 'word_margin': 0.1, - 'detect_vertical': True - } - laparams = LAParams(**kwargs) - rsrcmgr = PDFResourceManager() - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - for page in PDFPage.create_pages(document): - interpreter.process_page(page) - layout = device.get_result() - lh = extract_text_objects(layout, LTTextLineHorizontal) - lv = extract_text_objects(layout, LTTextLineVertical) - print "number of horizontal text lines -> 
{0}".format(len(lh)) - print "horizontal text lines ->" - pprint([t.get_text() for t in lh]) - print "number of vertical text lines -> {0}".format(len(lv)) - print "vertical text lines ->" - pprint([t.get_text() for t in lv]) - - -if __name__ == '__main__': - if len(sys.argv) == 1: - print __doc__ - else: - main() \ No newline at end of file diff --git a/debug/threshold.py b/debug/threshold.py deleted file mode 100644 index ea716b2..0000000 --- a/debug/threshold.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -usage: python threshold.py file.png blocksize threshold_constant - -shows thresholded image. -""" - -import sys -import time - -import cv2 -import numpy as np -import matplotlib.pyplot as plt - - -def timeit(func): - def timed(*args, **kw): - start = time.time() - result = func(*args, **kw) - end = time.time() - print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) - return result - return timed - - -@timeit -def main(): - img = cv2.imread(sys.argv[1]) - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - blocksize = int(sys.argv[2]) - threshold_constant = float(sys.argv[3]) - threshold = cv2.adaptiveThreshold(np.invert(gray), 255, - cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, threshold_constant) - plt.imshow(img) - plt.show() - - -if __name__ == '__main__': - if len(sys.argv) == 1: - print __doc__ - else: - main() \ No newline at end of file diff --git a/docs/api.rst b/docs/api.rst index 99a9e7f..3bd0f3d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -4,17 +4,37 @@ API Reference ============= -Pdf -=== -.. automodule:: camelot.pdf +camelot.read_pdf +================ +.. automodule:: camelot.read_pdf :members: -Lattice -======= -.. automodule:: camelot.lattice +camelot.handlers.PDFHandler +=========================== +.. automodule:: camelot.handlers.PDFHandler :members: -Stream -====== -.. automodule:: camelot.stream +camelot.parsers.Stream +====================== +.. automodule:: camelot.parsers.Stream + :members: + +camelot.parsers.Lattice +======================= +.. automodule:: camelot.parsers.Lattice + :members: + +camelot.core.Cell +================= +.. automodule:: camelot.core.Cell + :members: + +camelot.core.Table +================== +.. automodule:: camelot.core.Table + :members: + +camelot.core.TableList +====================== +.. automodule:: camelot.core.TableList :members: \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index b186ad7..4b91c69 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,11 +3,11 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -================================== -Camelot: pdf parsing made simpler! -================================== +===================================== +Camelot: PDF Table Parsing for Humans +===================================== -Camelot is a Python 2.7 library and command-line tool for getting tables out of pdf files. +Camelot is a Python 2.7 library and command-line tool for extracting tabular data from PDF files. Why another pdf table parsing library? 
====================================== @@ -32,12 +32,22 @@ Usage :: - >>> from camelot.pdf import Pdf - >>> from camelot.lattice import Lattice - - >>> manager = Pdf(Lattice(), 'us-030.pdf') - >>> tables = manager.extract() - >>> print tables['page-1']['table-1']['data'] + >>> import camelot + >>> tables = camelot.read_pdf("foo.pdf") + >>> tables + <TableList n=2> + >>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html + >>> tables[0] + <Table shape=(3,4)> + >>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html + >>> tables[0].parsing_report + { + "accuracy": 96, + "whitespace": 80, + "order": 1, + "page": 1 + } + >>> df = tables[0].df .. csv-table:: :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","","" "2012_2","0.17","11.2","21.7%","0.3%","2.7%","1.2%" "2145_1","0.23","24.9","20.7%","0.1%","1.2%","0.5%" "2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%" "4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%" -Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF. - -:: - - Camelot: PDF parsing made simpler! - - usage: - camelot [options] <method> [<args>...] - - options: - -h, --help Show this screen. - -v, --version Show version. - -V, --verbose Verbose. - -p, --pages <pageno> Comma-separated list of page numbers. - Example: -p 1,3-6,10 [default: 1] - -P, --parallel Parallelize the parsing process. - -f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv] - -l, --log Log to file. - -o, --output <directory> Output directory. - -M, --cmargin <cmargin> Char margin. Chars closer than cmargin are - grouped together to form a word. [default: 1.0] - -L, --lmargin <lmargin> Line margin. Lines closer than lmargin are - grouped together to form a textbox. [default: 0.5] - -W, --wmargin <wmargin> Word margin. Insert blank spaces between chars - if distance between words is greater than word - margin. [default: 0.1] - -J, --split_text Split text lines if they span across multiple cells. - -K, --flag_size Flag substring if its size differs from the whole string. - Useful for super and subscripts. - -X, --print-stats List stats on the parsing process. - -Y, --save-stats Save stats to a file. - -Z, --plot <dist> Plot distributions. (page,all,rc) - - camelot methods: - lattice Looks for lines between data. - stream Looks for spaces between data. - - See 'camelot <method> -h' for more information on a specific method. - Installation ============ Make sure you have the most updated versions for `pip` and `setuptools`. You can pip install -U pip setuptools -The required dependencies include `numpy`_, `OpenCV`_ and `ImageMagick`_. +The dependencies include `tk`_ and `ghostscript`_. -.. _numpy: http://www.numpy.org/ -.. _OpenCV: http://opencv.org/ -.. _ImageMagick: http://www.imagemagick.org/script/index.php +.. _tk: https://wiki.tcl.tk/3743 +.. _ghostscript: https://www.ghostscript.com/ Installing dependencies ----------------------- -numpy can be install using `pip`. OpenCV and imagemagick can be installed using your system's default package manager. +tk and ghostscript can be installed using your system's default package manager.
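Once the packages below are in place, a quick smoke test of the install (a sketch; "foo.pdf" stands in for any text-based, table-bearing PDF you have at hand, here yielding a single table)::

    >>> import camelot
    >>> tables = camelot.read_pdf("foo.pdf")
    >>> tables
    <TableList n=1>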
Linux ^^^^^ -* Arch Linux - -:: - - sudo pacman -S opencv imagemagick - * Ubuntu :: - sudo apt-get install libopencv-dev python-opencv imagemagick + sudo apt-get install python-opencv python-tk ghostscript + +* Arch Linux + +:: + + sudo pacman -S opencv tk ghostscript OS X ^^^^ :: - brew install homebrew/science/opencv imagemagick + brew install homebrew/science/opencv ghostscript Finally, `cd` into the project directory and install by:: - make install + python setup.py install API Reference ============= @@ -150,14 +120,14 @@ You can check the latest sources with the command:: Contributing ------------ -See :doc:`Contributing doc `. +See :doc:`Contributing guidelines `. Testing ------- :: - make test + python setup.py test License ======= diff --git a/examples/demo_lattice.py b/examples/demo_lattice.py deleted file mode 100644 index b3ff2ea..0000000 --- a/examples/demo_lattice.py +++ /dev/null @@ -1,11 +0,0 @@ -from camelot import Pdf -from camelot import Lattice - - -extractor = Lattice(Pdf("files/column_span_1.pdf", clean=True), scale=30) -tables = extractor.get_tables() -print tables - -extractor = Lattice(Pdf("files/column_span_2.pdf"), clean=True, scale=30) -tables = extractor.get_tables() -print tables diff --git a/examples/demo_lattice_fill.py b/examples/demo_lattice_fill.py deleted file mode 100644 index 3546b00..0000000 --- a/examples/demo_lattice_fill.py +++ /dev/null @@ -1,13 +0,0 @@ -from camelot import Pdf -from camelot import Lattice - - -extractor = Lattice( - Pdf("files/row_span_1.pdf", clean=True), fill='v', scale=40) -tables = extractor.get_tables() -print tables - -extractor = Lattice( - Pdf("files/row_span_2.pdf", clean=True), fill='v', scale=30) -tables = extractor.get_tables() -print tables diff --git a/examples/demo_lattice_invert.py b/examples/demo_lattice_invert.py deleted file mode 100644 index a0bf41e..0000000 --- a/examples/demo_lattice_invert.py +++ /dev/null @@ -1,13 +0,0 @@ -from camelot import Pdf -from camelot import Lattice - - -extractor = Lattice(Pdf("files/lines_in_background_1.pdf", - clean=True), scale=30, invert=True) -tables = extractor.get_tables() -print tables - -extractor = Lattice(Pdf("files/lines_in_background_2.pdf", - clean=True), scale=30, invert=True) -tables = extractor.get_tables() -print tables diff --git a/examples/demo_lattice_rotation.py b/examples/demo_lattice_rotation.py deleted file mode 100644 index d201cf1..0000000 --- a/examples/demo_lattice_rotation.py +++ /dev/null @@ -1,11 +0,0 @@ -from camelot import Pdf -from camelot import Lattice - - -extractor = Lattice(Pdf("files/left_rotated_table.pdf", clean=True), scale=30) -tables = extractor.get_tables() -print tables - -extractor = Lattice(Pdf("files/right_rotated_table.pdf", clean=True), scale=30) -tables = extractor.get_tables() -print tables diff --git a/examples/demo_lattice_twotables.py b/examples/demo_lattice_twotables.py deleted file mode 100644 index 91c6b93..0000000 --- a/examples/demo_lattice_twotables.py +++ /dev/null @@ -1,11 +0,0 @@ -from camelot import Pdf -from camelot import Lattice - - -extractor = Lattice(Pdf("files/twotables_1.pdf", clean=True), scale=40) -tables = extractor.get_tables() -print tables - -extractor = Lattice(Pdf("files/twotables_2.pdf", clean=True), scale=30) -tables = extractor.get_tables() -print tables diff --git a/examples/demo_stream.py b/examples/demo_stream.py deleted file mode 100644 index baee02f..0000000 --- a/examples/demo_stream.py +++ /dev/null @@ -1,8 +0,0 @@ -from camelot import Pdf -from camelot import Stream - - -extractor = 
Stream(Pdf("files/budget_2014-15.pdf", - char_margin=1.0, clean=True)) -tables = extractor.get_tables() -print tables diff --git a/examples/demo_stream_columns.py b/examples/demo_stream_columns.py deleted file mode 100644 index 79cc6cb..0000000 --- a/examples/demo_stream_columns.py +++ /dev/null @@ -1,13 +0,0 @@ -from camelot import Pdf -from camelot import Stream - - -extractor = Stream(Pdf("files/inconsistent_rows.pdf", char_margin=1.0), - columns="65,95,285,640,715,780", ytol=10) -tables = extractor.get_tables() -print tables - -extractor = Stream(Pdf("files/consistent_rows.pdf", char_margin=1.0), - columns="28,67,180,230,425,475,700", ytol=5) -tables = extractor.get_tables() -print tables diff --git a/examples/files/consistent_rows.pdf b/examples/files/consistent_rows.pdf deleted file mode 100644 index e0213aa..0000000 Binary files a/examples/files/consistent_rows.pdf and /dev/null differ diff --git a/examples/files/inconsistent_rows.pdf b/examples/files/inconsistent_rows.pdf deleted file mode 100644 index 9eb4b63..0000000 Binary files a/examples/files/inconsistent_rows.pdf and /dev/null differ diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..d907a0b --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,11 @@ +click==6.7 +matplotlib==2.2.3 +numpy==1.13.3 +opencv-python==3.4.2.17 +pandas==0.23.4 +pdfminer==20140328 +Pillow==5.2.0 +PyPDF2==1.26.0 +pytest==3.8.0 +pytest-runner==4.2 +Sphinx==1.8.0b1 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 826e271..d1a33b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,8 @@ -docopt -matplotlib -nose -pdfminer -pyexcel-xlsx -Pillow -pyocr -PyPDF2 -Sphinx +click==6.7 +matplotlib==2.2.3 +numpy==1.13.3 +opencv-python==3.4.2.17 +pandas==0.23.4 +pdfminer==20140328 +Pillow==5.2.0 +PyPDF2==1.26.0 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..730f976 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,6 @@ +[aliases] +test=pytest + +[tool:pytest] +addopts = --verbose +python_files = tests/test_*.py diff --git a/setup.py b/setup.py index 1d1d4ec..14c0516 100644 --- a/setup.py +++ b/setup.py @@ -4,12 +4,12 @@ import camelot NAME = 'camelot' VERSION = camelot.__version__ -DESCRIPTION = 'camelot parses tables from PDFs!' 
+DESCRIPTION = 'PDF Table Parsing for Humans' with open('README.md') as f: LONG_DESCRIPTION = f.read() URL = 'https://github.com/socialcopsdev/camelot' AUTHOR = 'Vinayak Mehta' -AUTHOR_EMAIL = 'vinayak@socialcops.com' +AUTHOR_EMAIL = 'vmehta94@gmail.com' LICENSE = 'BSD License' opencv_min_version = '2.4.8' @@ -48,10 +48,8 @@ def setup_package(): author=AUTHOR, author_email=AUTHOR_EMAIL, license=LICENSE, - keywords='parse scrape pdf table', packages=['camelot'], - install_requires=reqs, - scripts=['tools/camelot']) + install_requires=reqs) try: from setuptools import setup @@ -60,18 +58,14 @@ def setup_package(): opencv_status = get_opencv_status() opencv_req_str = "camelot requires OpenCV >= {0}.\n".format(opencv_min_version) - instructions = ("Installation instructions are available in the README at " - "https://github.com/socialcopsdev/camelot") if opencv_status['up_to_date'] is False: if opencv_status['version']: - raise ImportError("Your installation of OpenCV " - "{0} is out-of-date.\n{1}{2}" - .format(opencv_status['version'], - opencv_req_str, instructions)) + raise ImportError("Your installation of OpenCV {} is out-of-date.\n{}" + .format(opencv_status['version'], opencv_req_str)) else: - raise ImportError("OpenCV is not installed.\n{0}{1}" - .format(opencv_req_str, instructions)) + raise ImportError("OpenCV is not installed.\n{}" + .format(opencv_req_str)) setup(**metadata) diff --git a/tests/budget_2014-15.pdf b/tests/budget_2014-15.pdf deleted file mode 100644 index 9466e87..0000000 Binary files a/tests/budget_2014-15.pdf and /dev/null differ diff --git a/tests/column_span_1.pdf b/tests/column_span_1.pdf deleted file mode 100644 index e7c164e..0000000 Binary files a/tests/column_span_1.pdf and /dev/null differ diff --git a/tests/column_span_2.pdf b/tests/column_span_2.pdf deleted file mode 100644 index 5cab903..0000000 Binary files a/tests/column_span_2.pdf and /dev/null differ diff --git a/tests/agstat.pdf b/tests/files/agstat.pdf similarity index 100% rename from tests/agstat.pdf rename to tests/files/agstat.pdf diff --git a/examples/files/left_rotated_table.pdf b/tests/files/anticlockwise_table_1.pdf similarity index 100% rename from examples/files/left_rotated_table.pdf rename to tests/files/anticlockwise_table_1.pdf diff --git a/tests/left_rotated_table_2.pdf b/tests/files/anticlockwise_table_2.pdf similarity index 100% rename from tests/left_rotated_table_2.pdf rename to tests/files/anticlockwise_table_2.pdf diff --git a/tests/assam.pdf b/tests/files/assam.pdf similarity index 100% rename from tests/assam.pdf rename to tests/files/assam.pdf diff --git a/examples/files/lines_in_background_1.pdf b/tests/files/background_lines_1.pdf similarity index 100% rename from examples/files/lines_in_background_1.pdf rename to tests/files/background_lines_1.pdf diff --git a/examples/files/lines_in_background_2.pdf b/tests/files/background_lines_2.pdf similarity index 100% rename from examples/files/lines_in_background_2.pdf rename to tests/files/background_lines_2.pdf diff --git a/examples/files/budget_2014-15.pdf b/tests/files/budget_2014-15.pdf similarity index 100% rename from examples/files/budget_2014-15.pdf rename to tests/files/budget_2014-15.pdf diff --git a/examples/files/right_rotated_table.pdf b/tests/files/clockwise_table_1.pdf similarity index 100% rename from examples/files/right_rotated_table.pdf rename to tests/files/clockwise_table_1.pdf diff --git a/tests/right_rotated_table_2.pdf b/tests/files/clockwise_table_2.pdf similarity index 100% rename from 
tests/right_rotated_table_2.pdf rename to tests/files/clockwise_table_2.pdf diff --git a/examples/files/column_span_1.pdf b/tests/files/column_span_1.pdf similarity index 100% rename from examples/files/column_span_1.pdf rename to tests/files/column_span_1.pdf diff --git a/examples/files/column_span_2.pdf b/tests/files/column_span_2.pdf similarity index 100% rename from examples/files/column_span_2.pdf rename to tests/files/column_span_2.pdf diff --git a/tests/district_health.pdf b/tests/files/district_health.pdf similarity index 100% rename from tests/district_health.pdf rename to tests/files/district_health.pdf diff --git a/tests/electoral_roll.pdf b/tests/files/electoral_roll.pdf similarity index 100% rename from tests/electoral_roll.pdf rename to tests/files/electoral_roll.pdf diff --git a/tests/health.pdf b/tests/files/health.pdf similarity index 100% rename from tests/health.pdf rename to tests/files/health.pdf diff --git a/tests/medicine.pdf b/tests/files/medicine.pdf similarity index 100% rename from tests/medicine.pdf rename to tests/files/medicine.pdf diff --git a/tests/mexican_towns.pdf b/tests/files/mexican_towns.pdf similarity index 100% rename from tests/mexican_towns.pdf rename to tests/files/mexican_towns.pdf diff --git a/examples/files/missing_values.pdf b/tests/files/missing_values.pdf similarity index 100% rename from examples/files/missing_values.pdf rename to tests/files/missing_values.pdf diff --git a/tests/population_growth.pdf b/tests/files/population_growth.pdf similarity index 100% rename from tests/population_growth.pdf rename to tests/files/population_growth.pdf diff --git a/tests/rainfall_distribution.pdf b/tests/files/rainfall_distribution.pdf similarity index 100% rename from tests/rainfall_distribution.pdf rename to tests/files/rainfall_distribution.pdf diff --git a/examples/files/row_span_1.pdf b/tests/files/row_span_1.pdf similarity index 100% rename from examples/files/row_span_1.pdf rename to tests/files/row_span_1.pdf diff --git a/examples/files/row_span_2.pdf b/tests/files/row_span_2.pdf similarity index 100% rename from examples/files/row_span_2.pdf rename to tests/files/row_span_2.pdf diff --git a/tests/row_span_3.pdf b/tests/files/row_span_3.pdf similarity index 100% rename from tests/row_span_3.pdf rename to tests/files/row_span_3.pdf diff --git a/tests/tableception.pdf b/tests/files/tableception.pdf similarity index 100% rename from tests/tableception.pdf rename to tests/files/tableception.pdf diff --git a/tests/tabula_test_pdfs/12s0324.pdf b/tests/files/tabula/12s0324.pdf similarity index 100% rename from tests/tabula_test_pdfs/12s0324.pdf rename to tests/files/tabula/12s0324.pdf diff --git a/tests/tabula_test_pdfs/20.pdf b/tests/files/tabula/20.pdf similarity index 100% rename from tests/tabula_test_pdfs/20.pdf rename to tests/files/tabula/20.pdf diff --git a/tests/tabula_test_pdfs/S2MNCEbirdisland.pdf b/tests/files/tabula/S2MNCEbirdisland.pdf similarity index 100% rename from tests/tabula_test_pdfs/S2MNCEbirdisland.pdf rename to tests/files/tabula/S2MNCEbirdisland.pdf diff --git a/tests/tabula_test_pdfs/arabic.pdf b/tests/files/tabula/arabic.pdf similarity index 100% rename from tests/tabula_test_pdfs/arabic.pdf rename to tests/files/tabula/arabic.pdf diff --git a/tests/tabula_test_pdfs/argentina_diputados_voting_record.pdf b/tests/files/tabula/argentina_diputados_voting_record.pdf similarity index 100% rename from tests/tabula_test_pdfs/argentina_diputados_voting_record.pdf rename to tests/files/tabula/argentina_diputados_voting_record.pdf 
diff --git a/tests/tabula_test_pdfs/campaign_donors.pdf b/tests/files/tabula/campaign_donors.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/campaign_donors.pdf
rename to tests/files/tabula/campaign_donors.pdf
diff --git a/tests/tabula_test_pdfs/china.pdf b/tests/files/tabula/china.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/china.pdf
rename to tests/files/tabula/china.pdf
diff --git a/tests/tabula_test_pdfs/eu-002.pdf b/tests/files/tabula/eu-002.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/eu-002.pdf
rename to tests/files/tabula/eu-002.pdf
diff --git a/tests/tabula_test_pdfs/eu-017.pdf b/tests/files/tabula/eu-017.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/eu-017.pdf
rename to tests/files/tabula/eu-017.pdf
diff --git a/tests/tabula_test_pdfs/failing_sort.pdf b/tests/files/tabula/failing_sort.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/failing_sort.pdf
rename to tests/files/tabula/failing_sort.pdf
diff --git a/tests/tabula_test_pdfs/frx_2012_disclosure.pdf b/tests/files/tabula/frx_2012_disclosure.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/frx_2012_disclosure.pdf
rename to tests/files/tabula/frx_2012_disclosure.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-001-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-001-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-001-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-001-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-001-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-001-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-001-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-001-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-001.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-001.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-001.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-001.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-001.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-001.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-001.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-001.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-002-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-002-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-002-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-002-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-002-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-002-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-002-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-002-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-002.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-002.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-002.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-002.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-002.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-002.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-002.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-002.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-003-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-003-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-003-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-003-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-003-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-003-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-003-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-003-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-003.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-003.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-003.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-003.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-003.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-003.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-003.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-003.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-004-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-004-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-004-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-004-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-004-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-004-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-004-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-004-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-004.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-004.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-004.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-004.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-004.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-004.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-004.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-004.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-005-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-005-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-005-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-005-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-005-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-005-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-005-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-005-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-005.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-005.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-005.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-005.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-005.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-005.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-005.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-005.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-006-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-006-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-006-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-006-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-006-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-006-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-006-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-006-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-006.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-006.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-006.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-006.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-006.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-006.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-006.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-006.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-007-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-007-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-007-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-007-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-007-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-007-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-007-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-007-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-007.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-007.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-007.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-007.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-007.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-007.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-007.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-007.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-008-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-008-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-008-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-008-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-008-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-008-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-008-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-008-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-008.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-008.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-008.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-008.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-008.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-008.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-008.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-008.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-009a-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-009a-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-009a-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-009a-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-009a-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-009a-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-009a-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-009a-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-009a.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-009a.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-009a.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-009a.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-009a.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-009a.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-009a.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-009a.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-009b-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-009b-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-009b-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-009b-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-009b-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-009b-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-009b-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-009b-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-010-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-010-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-010-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-010-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-010-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-010-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-010-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-010-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-010.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-010.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-010.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-010.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-010.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-010.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-010.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-010.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-011-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-011-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-011-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-011-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-011-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-011-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-011-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-011-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-011.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-011.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-011.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-011.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-011.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-011.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-011.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-011.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-012-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-012-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-012-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-012-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-012-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-012-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-012-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-012-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-012.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-012.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-012.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-012.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-012.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-012.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-012.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-012.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-013-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-013-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-013-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-013-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-013-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-013-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-013-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-013-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-013.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-013.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-013.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-013.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-013.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-013.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-013.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-013.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-014-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-014-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-014-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-014-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-014-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-014-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-014-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-014-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-014.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-014.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-014.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-014.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-014.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-014.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-014.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-014.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-015-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-015-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-015-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-015-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-015-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-015-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-015-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-015-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-015.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-015.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-015.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-015.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-015.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-015.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-015.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-015.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-016-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-016-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-016-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-016-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-016-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-016-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-016-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-016-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-016.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-016.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-016.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-016.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-016.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-016.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-016.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-016.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-017-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-017-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-017-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-017-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-017-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-017-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-017-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-017-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-017.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-017.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-017.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-017.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-017.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-017.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-017.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-017.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-018-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-018-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-018-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-018-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-018-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-018-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-018-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-018-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-018.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-018.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-018.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-018.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-018.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-018.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-018.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-018.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-019-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-019-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-019-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-019-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-019-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-019-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-019-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-019-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-019.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-019.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-019.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-019.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-019.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-019.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-019.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-019.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-020-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-020-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-020-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-020-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-020-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-020-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-020-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-020-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-020.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-020.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-020.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-020.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-020.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-020.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-020.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-020.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-021-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-021-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-021-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-021-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-021-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-021-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-021-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-021-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-021.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-021.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-021.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-021.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-021.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-021.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-021.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-021.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-022-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-022-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-022-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-022-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-022-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-022-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-022-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-022-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-022.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-022.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-022.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-022.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-022.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-022.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-022.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-022.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-023-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-023-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-023-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-023-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-023-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-023-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-023-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-023-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-023.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-023.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-023.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-023.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-023.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-023.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-023.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-023.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-024-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-024-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-024-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-024-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-024-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-024-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-024-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-024-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-024.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-024.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-024.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-024.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-024.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-024.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-024.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-024.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-025-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-025-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-025-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-025-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-025-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-025-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-025-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-025-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-025.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-025.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-025.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-025.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-025.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-025.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-025.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-025.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-026-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-026-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-026-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-026-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-026-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-026-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-026-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-026-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-026.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-026.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-026.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-026.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-026.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-026.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-026.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-026.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-027-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-027-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-027-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-027-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-027.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-027.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-027.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-eu/eu-027.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-001-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-001-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-001-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-001-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-001-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-001-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-001-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-001-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-001.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-001.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-001.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-001.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-001.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-001.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-001.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-001.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-002-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-002-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-002-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-002-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-002-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-002-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-002-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-002-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-002.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-002.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-002.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-002.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-002.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-002.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-002.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-002.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-003-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-003-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-003-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-003-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-003-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-003-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-003-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-003-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-003.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-003.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-003.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-003.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-003.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-003.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-003.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-003.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-004-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-004-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-004-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-004-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-004-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-004-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-004-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-004-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-004.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-004.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-004.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-004.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-004.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-004.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-004.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-004.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-005-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-005-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-005-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-005-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-005-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-005-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-005-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-005-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-005.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-005.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-005.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-005.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-005.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-005.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-005.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-005.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-006-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-006-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-006-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-006-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-006-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-006-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-006-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-006-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-006.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-006.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-006.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-006.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-006.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-006.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-006.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-006.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-007-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-007-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-007-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-007-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-007-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-007-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-007-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-007-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-007.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-007.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-007.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-007.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-007.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-007.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-007.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-007.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-008-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-008-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-008-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-008-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-008-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-008-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-008-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-008-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-008.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-008.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-008.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-008.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-008.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-008.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-008.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-008.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-009-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-009-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-009-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-009-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-009-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-009-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-009-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-009-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-009.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-009.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-009.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-009.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-009.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-009.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-009.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-009.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-010-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-010-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-010-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-010-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-010-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-010-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-010-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-010-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-010.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-010.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-010.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-010.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-010.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-010.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-010.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-010.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-011a-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-011a-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-011a-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-011a-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-011a-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-011a-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-011a-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-011a-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-011a.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-011a.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-011a.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-011a.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-011a.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-011a.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-011a.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-011a.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-011b-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-011b-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-011b-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-011b-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-011b-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-011b-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-011b-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-011b-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-012-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-012-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-012-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-012-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-012-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-012-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-012-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-012-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-012.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-012.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-012.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-012.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-012.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-012.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-012.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-012.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-013-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-013-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-013-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-013-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-013-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-013-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-013-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-013-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-013.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-013.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-013.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-013.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-013.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-013.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-013.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-013.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-014-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-014-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-014-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-014-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-014-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-014-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-014-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-014-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-014.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-014.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-014.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-014.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-014.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-014.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-014.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-014.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-015-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-015-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-015-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-015-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-015-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-015-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-015-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-015-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-015.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-015.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-015.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-015.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-015.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-015.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-015.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-015.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-016-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-016-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-016-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-016-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-016-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-016-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-016-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-016-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-016.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-016.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-016.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-016.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-016.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-016.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-016.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-016.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-017-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-017-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-017-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-017-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-017-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-017-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-017-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-017-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-017.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-017.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-017.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-017.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-017.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-017.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-017.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-017.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-018-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-018-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-018-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-018-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-018-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-018-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-018-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-018-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-018.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-018.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-018.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-018.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-018.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-018.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-018.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-018.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-019-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-019-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-019-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-019-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-019-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-019-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-019-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-019-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-019.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-019.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-019.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-019.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-019.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-019.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-019.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-019.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-020-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-020-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-020-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-020-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-020-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-020-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-020-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-020-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-020.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-020.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-020.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-020.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-020.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-020.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-020.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-020.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-021-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-021-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-021-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-021-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-021-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-021-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-021-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-021-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-021.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-021.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-021.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-021.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-021.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-021.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-021.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-021.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-022-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-022-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-022-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-022-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-022-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-022-str.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-022-str.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-022-str.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-022.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-022.json
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-022.json
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-022.json
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-022.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-022.pdf
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-022.pdf
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-022.pdf
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-023-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-023-reg.xml
similarity index 100%
rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-023-reg.xml
rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-023-reg.xml
diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-023-str.xml
b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-023-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-023-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-023-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-023.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-023.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-023.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-023.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-023.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-023.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-023.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-023.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-024-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-024-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-024-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-024-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-024-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-024-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-024-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-024-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-024.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-024.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-024.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-024.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-024.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-024.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-024.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-024.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-025-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-025-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-025-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-025-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-025-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-025-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-025-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-025-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-025.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-025.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-025.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-025.json diff --git 
a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-025.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-025.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-025.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-025.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-026-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-026-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-026-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-026-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-026-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-026-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-026-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-026-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-026.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-026.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-026.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-026.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-026.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-026.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-026.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-026.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-027-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-027-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-027-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-027-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-027-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-027-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-027-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-027-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-027.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-027.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-027.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-027.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-027.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-027.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-027.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-027.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-028-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-028-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-028-reg.xml rename to 
tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-028-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-028-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-028-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-028-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-028-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-028.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-028.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-028.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-028.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-028.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-028.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-028.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-028.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-029-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-029-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-029-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-029-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-029-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-029-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-029-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-029-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-029.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-029.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-029.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-029.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-029.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-029.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-029.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-029.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-030-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-030-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-030-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-030-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-030.json similarity index 100% rename from 
tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-030.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-031a-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-031a-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-031a-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-031a-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-031a-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-031a-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-031a-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-031a-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-031a.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-031a.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-031a.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-031a.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-031a.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-031a.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-031a.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-031a.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-031b-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-031b-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-031b-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-031b-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-031b-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-031b-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-031b-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-031b-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-032-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-032-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-032-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-032-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-032-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-032-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-032-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-032-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-032.json 
b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-032.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-032.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-032.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-032.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-032.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-032.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-032.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-033-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-033-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-033-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-033-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-033-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-033-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-033-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-033-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-033.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-033.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-033.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-033.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-033.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-033.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-033.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-033.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-034-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-034-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-034-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-034-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-034-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-034-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-034-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-034-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-034.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-034.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-034.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-034.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-034.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-034.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-034.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-034.pdf diff --git 
a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-035a-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-035a-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-035a-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-035a-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-035a-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-035a-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-035a-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-035a-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-035a.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-035a.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-035a.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-035a.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-035a.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-035a.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-035a.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-035a.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-035b-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-035b-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-035b-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-035b-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-035b-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-035b-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-035b-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-035b-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-036-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-036-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-036-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-036-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-036-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-036-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-036-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-036-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-036.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-036.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-036.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-036.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-036.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-036.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-036.pdf rename 
to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-036.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-037-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-037-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-037-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-037-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-037-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-037-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-037-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-037-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-037.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-037.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-037.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-037.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-037.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-037.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-037.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-037.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-038-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-038-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-038-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-038-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-038-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-038-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-038-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-038-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-038.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-038.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-038.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-038.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-038.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-038.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-038.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-038.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-039-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-039-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-039-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-039-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-039-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-039-str.xml similarity index 100% rename from 
tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-039-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-039-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-039.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-039.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-039.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-039.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-039.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-039.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-039.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-039.pdf diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-040-reg.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-040-reg.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-040-reg.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-040-reg.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-040-str.xml b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-040-str.xml similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-040-str.xml rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-040-str.xml diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-040.json b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-040.json similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-040.json rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-040.json diff --git a/tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-040.pdf b/tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-040.pdf similarity index 100% rename from tests/tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-040.pdf rename to tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-040.pdf diff --git a/tests/tabula_test_pdfs/indictb1h_14.pdf b/tests/files/tabula/indictb1h_14.pdf similarity index 100% rename from tests/tabula_test_pdfs/indictb1h_14.pdf rename to tests/files/tabula/indictb1h_14.pdf diff --git a/tests/tabula_test_pdfs/labor.pdf b/tests/files/tabula/labor.pdf similarity index 100% rename from tests/tabula_test_pdfs/labor.pdf rename to tests/files/tabula/labor.pdf diff --git a/tests/tabula_test_pdfs/m27.pdf b/tests/files/tabula/m27.pdf similarity index 100% rename from tests/tabula_test_pdfs/m27.pdf rename to tests/files/tabula/m27.pdf diff --git a/tests/tabula_test_pdfs/mednine.pdf b/tests/files/tabula/mednine.pdf similarity index 100% rename from tests/tabula_test_pdfs/mednine.pdf rename to tests/files/tabula/mednine.pdf diff --git a/tests/tabula_test_pdfs/offense.pdf b/tests/files/tabula/offense.pdf similarity index 100% rename from tests/tabula_test_pdfs/offense.pdf rename to tests/files/tabula/offense.pdf diff --git a/tests/tabula_test_pdfs/puertos1.pdf b/tests/files/tabula/puertos1.pdf similarity index 100% rename from tests/tabula_test_pdfs/puertos1.pdf rename to tests/files/tabula/puertos1.pdf diff --git a/tests/tabula_test_pdfs/rotated_page.pdf b/tests/files/tabula/rotated_page.pdf 
similarity index 100% rename from tests/tabula_test_pdfs/rotated_page.pdf rename to tests/files/tabula/rotated_page.pdf diff --git a/tests/tabula_test_pdfs/schools.pdf b/tests/files/tabula/schools.pdf similarity index 100% rename from tests/tabula_test_pdfs/schools.pdf rename to tests/files/tabula/schools.pdf diff --git a/tests/tabula_test_pdfs/should_detect_rulings.pdf b/tests/files/tabula/should_detect_rulings.pdf similarity index 100% rename from tests/tabula_test_pdfs/should_detect_rulings.pdf rename to tests/files/tabula/should_detect_rulings.pdf diff --git a/tests/tabula_test_pdfs/sort_exception.pdf b/tests/files/tabula/sort_exception.pdf similarity index 100% rename from tests/tabula_test_pdfs/sort_exception.pdf rename to tests/files/tabula/sort_exception.pdf diff --git a/tests/tabula_test_pdfs/spanning_cells.pdf b/tests/files/tabula/spanning_cells.pdf similarity index 100% rename from tests/tabula_test_pdfs/spanning_cells.pdf rename to tests/files/tabula/spanning_cells.pdf diff --git a/tests/tabula_test_pdfs/spreadsheet_no_bounding_frame.pdf b/tests/files/tabula/spreadsheet_no_bounding_frame.pdf similarity index 100% rename from tests/tabula_test_pdfs/spreadsheet_no_bounding_frame.pdf rename to tests/files/tabula/spreadsheet_no_bounding_frame.pdf diff --git a/tests/tabula_test_pdfs/sydney_disclosure_contract.pdf b/tests/files/tabula/sydney_disclosure_contract.pdf similarity index 100% rename from tests/tabula_test_pdfs/sydney_disclosure_contract.pdf rename to tests/files/tabula/sydney_disclosure_contract.pdf diff --git a/tests/tabula_test_pdfs/twotables.pdf b/tests/files/tabula/twotables.pdf similarity index 100% rename from tests/tabula_test_pdfs/twotables.pdf rename to tests/files/tabula/twotables.pdf diff --git a/tests/tabula_test_pdfs/us-007.pdf b/tests/files/tabula/us-007.pdf similarity index 100% rename from tests/tabula_test_pdfs/us-007.pdf rename to tests/files/tabula/us-007.pdf diff --git a/tests/tabula_test_pdfs/us-017.pdf b/tests/files/tabula/us-017.pdf similarity index 100% rename from tests/tabula_test_pdfs/us-017.pdf rename to tests/files/tabula/us-017.pdf diff --git a/tests/tabula_test_pdfs/us-024.pdf b/tests/files/tabula/us-024.pdf similarity index 100% rename from tests/tabula_test_pdfs/us-024.pdf rename to tests/files/tabula/us-024.pdf diff --git a/examples/files/twotables_1.pdf b/tests/files/twotables_1.pdf similarity index 100% rename from examples/files/twotables_1.pdf rename to tests/files/twotables_1.pdf diff --git a/examples/files/twotables_2.pdf b/tests/files/twotables_2.pdf similarity index 100% rename from examples/files/twotables_2.pdf rename to tests/files/twotables_2.pdf diff --git a/tests/left_rotated_table_1.pdf b/tests/left_rotated_table_1.pdf deleted file mode 100644 index 8b7a615..0000000 Binary files a/tests/left_rotated_table_1.pdf and /dev/null differ diff --git a/tests/lines_in_background_1.pdf b/tests/lines_in_background_1.pdf deleted file mode 100644 index f23d6b7..0000000 Binary files a/tests/lines_in_background_1.pdf and /dev/null differ diff --git a/tests/lines_in_background_2.pdf b/tests/lines_in_background_2.pdf deleted file mode 100644 index b64b2f2..0000000 Binary files a/tests/lines_in_background_2.pdf and /dev/null differ diff --git a/tests/missing_values.pdf b/tests/missing_values.pdf deleted file mode 100644 index 90b620f..0000000 Binary files a/tests/missing_values.pdf and /dev/null differ diff --git a/tests/right_rotated_table_1.pdf b/tests/right_rotated_table_1.pdf deleted file mode 100644 index 9494465..0000000 Binary files 
a/tests/right_rotated_table_1.pdf and /dev/null differ
diff --git a/tests/row_span_1.pdf b/tests/row_span_1.pdf
deleted file mode 100644
index ef2c7ce..0000000
Binary files a/tests/row_span_1.pdf and /dev/null differ
diff --git a/tests/row_span_2.pdf b/tests/row_span_2.pdf
deleted file mode 100644
index 39bce84..0000000
Binary files a/tests/row_span_2.pdf and /dev/null differ
diff --git a/tests/test_common.py b/tests/test_common.py
new file mode 100644
index 0000000..52f966a
--- /dev/null
+++ b/tests/test_common.py
@@ -0,0 +1,80 @@
+import os
+
+import pandas as pd
+
+import camelot
+
+from test_data import *
+
+testdir = os.path.dirname(os.path.abspath(__file__))
+testdir = os.path.join(testdir, "files")
+
+
+def test_stream():
+    pass
+
+
+def test_stream_table_rotated():
+    df = pd.DataFrame(data_stream_table_rotated)
+
+    filename = os.path.join(testdir, "clockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename)
+    assert df.equals(tables[0].df)
+
+    filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename)
+    assert df.equals(tables[0].df)
+
+
+def test_stream_table_area():
+    df = pd.DataFrame(data_stream_table_area_single)
+
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(filename, table_area=["320,500,573,335"])
+    assert df.equals(tables[0].df)
+
+
+def test_stream_columns():
+    df = pd.DataFrame(data_stream_columns)
+
+    filename = os.path.join(testdir, "mexican_towns.pdf")
+    tables = camelot.read_pdf(
+        filename, columns=["67,180,230,425,475"], row_close_tol=10)
+    assert df.equals(tables[0].df)
+
+
+def test_lattice():
+    df = pd.DataFrame(data_lattice)
+
+    filename = os.path.join(testdir,
+        "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf")
+    tables = camelot.read_pdf(filename, pages="2", mesh=True)
+    assert df.equals(tables[0].df)
+
+
+def test_lattice_table_rotated():
+    df = pd.DataFrame(data_lattice_table_rotated)
+
+    filename = os.path.join(testdir, "clockwise_table_1.pdf")
+    tables = camelot.read_pdf(filename, mesh=True)
+    assert df.equals(tables[0].df)
+
+    filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
+    tables = camelot.read_pdf(filename, mesh=True)
+    assert df.equals(tables[0].df)
+
+
+def test_lattice_process_background():
+    df = pd.DataFrame(data_lattice_process_background)
+
+    filename = os.path.join(testdir, "background_lines_1.pdf")
+    tables = camelot.read_pdf(filename, mesh=True, process_background=True)
+    assert df.equals(tables[1].df)
+
+
+def test_lattice_copy_text():
+    df = pd.DataFrame(data_lattice_copy_text)
+
+    filename = os.path.join(testdir, "row_span_1.pdf")
+    tables = camelot.read_pdf(filename, mesh=True, line_size_scaling=60, copy_text="v")
+    assert df.equals(tables[0].df)
\ No newline at end of file
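Everything in the new suite goes through the single `camelot.read_pdf` entry point, which returns a `TableList` whose members expose their parsed cells as a pandas `DataFrame` via `.df`. A minimal sketch of the same calls outside the test harness (paths are the test fixtures above; keyword values are copied from `test_common.py`, and the Stream-vs-Lattice split is inferred from which tests pass `mesh=True`):

```python
import camelot

# Stream-style parsing (spaces between data) is the default; table_area
# restricts detection to four-number region strings (assumed to be
# "x1,y1,x2,y2" in PDF coordinates).
tables = camelot.read_pdf("tests/files/tabula/us-007.pdf",
                          table_area=["320,500,573,335"])
print(tables[0].df)

# mesh=True selects Lattice-style parsing (lines between data); pages
# takes a page-number string instead of the old pagenos dicts.
tables = camelot.read_pdf(
    "tests/files/tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf",
    pages="2", mesh=True)
print(tables[0].df)
```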
characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"], + ["","Caste/tribe","","","","","","","","","","","","","","","",""], + ["","Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"], + ["","Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"], + ["","Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"], + ["","Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"], + ["","Wealth index","","","","","","","","","","","","","","","",""], + ["","Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"], + ["","Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"], + ["","Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"], + ["","Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"], + ["","Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"], + ["","Number of living children","","","","","","","","","","","","","","","",""], + ["","No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"], + ["","1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"], + ["","1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"], + ["","No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"], + ["","2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"], + ["","1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"], + ["","No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"], + ["","3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"], + ["","1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"], + ["","No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"], + ["","4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"], + ["","1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"], + ["","No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"], + ["","Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"], + ["","NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"], + ["","NFHS-1 
(1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"], + ["","","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""], + ["","not shown separately.","","","","","","","","","","","","","","","",""], + ["","na = Not available","","","","","","","","","","","","","","","",""], + ["","","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""], + ["","( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""], + ["","","","","","","","","54","","","","","","","","",""] +] + + +data_stream_table_area_single = [ + ["","One Withholding"], + ["Payroll Period","Allowance"], + ["Weekly","$71.15"], + ["Biweekly","142.31"], + ["Semimonthly","154.17"], + ["Monthly","308.33"], + ["Quarterly","925.00"], + ["Semiannually","1,850.00"], + ["Annually","3,700.00"], + ["Daily or Miscellaneous","14.23"], + ["(each day of the payroll period)",""] +] + + +data_stream_columns = [ + ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"], + ["Entidad","","Municipio","","Localidad",""], + ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"], + ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"], + ["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"], + ["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"], + ["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"], + ["01","Aguascalientes","001","Aguascalientes","0106","Arellano"], + ["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"], + ["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"], + ["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"], + ["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"], + ["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"], + ["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"], + ["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"], + ["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"], + ["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"], + ["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"], + ["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"], + ["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"], + ["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"], + ["01","Aguascalientes","001","Aguascalientes","0141","Cobos"], + ["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"], + ["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"], + ["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"], + ["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"], + ["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"], + ["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"], + ["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"], + ["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"], + 
["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"], + ["01","Aguascalientes","001","Aguascalientes","0182","Dolores"], + ["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"], + ["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"], + ["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"], + ["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"], + ["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"], + ["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"], + ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"], + ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"], + ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"], + ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"], + ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"], +] + + +data_lattice = [ + ["Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""], + ["","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle"], + ["2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%"], + ["2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%"], + ["4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%"], + ["2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"], + ["4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"] +] + + +data_lattice_table_rotated = [ + ["State","Nutritional Assessment (No. of individuals)","","","","IYCF Practices (No. of mothers: 2011-12)","Blood Pressure (No. of adults: 2011-12)","","Fasting Blood Sugar (No. of adults:2011-12)",""], + ["","1975-79","1988-90","1996-97","2011-12","","Men","Women","Men","Women"], + ["Kerala","5738","6633","8864","8297","245","2161","3195","1645","2391"], + ["Tamil Nadu","7387","10217","5813","7851","413","2134","2858","1119","1739"], + ["Karnataka","6453","8138","12606","8958","428","2467","2894","1628","2028"], + ["Andhra Pradesh","5844","9920","9545","8300","557","1899","2493","1111","1529"], + ["Maharashtra","5161","7796","6883","9525","467","2368","2648","1417","1599"], + ["Gujarat","4403","5374","4866","9645","477","2687","3021","2122","2503"], + ["Madhya Pradesh","*","*","*","7942","470","1965","2150","1579","1709"], + ["Orissa","3756","5540","12024","8473","398","2040","2624","1093","1628"], + ["West Bengal","*","*","*","8047","423","2058","2743","1413","2027"], + ["Uttar Pradesh","*","*","*","9860","581","2139","2415","1185","1366"], + ["Pooled","38742","53618","60601","86898","4459","21918","27041","14312","18519"] +] + + +data_lattice_process_background = [ + ["State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"], + ["Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000"], + ["Rajasthan","2.12.2009 to 19.12.2009","","","","","",""], + ["Gujarat","20.12.2009 to 3.1.2010","6","13","6.03","3,810","2,317","1,453"], + ["Maharashtra","4.01.2010 to 1.2.2010","13","26","1.27","5,680","9,027","4,153"], + ["Karnataka","2.2.2010 to 22.2.2010","11","19","1.80","5,741","3,658","3,183"], + ["Kerala","23.2.2010 to 11.3.2010","9","17","1.42","3,559","2,173","855"], + ["Total","","47","92","11.81","22,455","19,584","10,644"] +] + + +data_lattice_copy_text = [ + ["Plan Type","County","Plan Name","Totals"], + ["GMC","Sacramento","Anthem Blue Cross","164,380"], + ["GMC","Sacramento","Health Net","126,547"], + ["GMC","Sacramento","Kaiser Foundation","74,620"], + 
["GMC","Sacramento","Molina Healthcare","59,989"], + ["GMC","San Diego","Care 1st Health Plan","71,831"], + ["GMC","San Diego","Community Health Group","264,639"], + ["GMC","San Diego","Health Net","72,404"], + ["GMC","San Diego","Kaiser","50,415"], + ["GMC","San Diego","Molina Healthcare","206,430"], + ["GMC","Total GMC Enrollment","","1,091,255"], + ["COHS","Marin","Partnership Health Plan of CA","36,006"], + ["COHS","Mendocino","Partnership Health Plan of CA","37,243"], + ["COHS","Napa","Partnership Health Plan of CA","28,398"], + ["COHS","Solano","Partnership Health Plan of CA","113,220"], + ["COHS","Sonoma","Partnership Health Plan of CA","112,271"], + ["COHS","Yolo","Partnership Health Plan of CA","52,674"], + ["COHS","Del Norte","Partnership Health Plan of CA","11,242"], + ["COHS","Humboldt","Partnership Health Plan of CA","49,911"], + ["COHS","Lake","Partnership Health Plan of CA","29,149"], + ["COHS","Lassen","Partnership Health Plan of CA","7,360"], + ["COHS","Modoc","Partnership Health Plan of CA","2,940"], + ["COHS","Shasta","Partnership Health Plan of CA","61,763"], + ["COHS","Siskiyou","Partnership Health Plan of CA","16,715"], + ["COHS","Trinity","Partnership Health Plan of CA","4,542"], + ["COHS","Merced","Central California Alliance for Health","123,907"], + ["COHS","Monterey","Central California Alliance for Health","147,397"], + ["COHS","Santa Cruz","Central California Alliance for Health","69,458"], + ["COHS","Santa Barbara","CenCal","117,609"], + ["COHS","San Luis Obispo","CenCal","55,761"], + ["COHS","Orange","CalOptima","783,079"], + ["COHS","San Mateo","Health Plan of San Mateo","113,202"], + ["COHS","Ventura","Gold Coast Health Plan","202,217"], + ["COHS","Total COHS Enrollment","","2,176,064"], + ["Subtotal for Two-Plan, Regional Model, GMC and COHS","","","10,132,022"], + ["PCCM","Los Angeles","AIDS Healthcare Foundation","828"], + ["PCCM","San Francisco","Family Mosaic","25"], + ["PCCM","Total PHP Enrollment","","853"], + ["All Models Total Enrollments","","","10,132,875"], + ["Source: Data Warehouse 12/14/15","","",""] +] \ No newline at end of file diff --git a/tests/test_lattice.py b/tests/test_lattice.py deleted file mode 100644 index 818e16a..0000000 --- a/tests/test_lattice.py +++ /dev/null @@ -1,125 +0,0 @@ -# coding: utf8 -import os - -from nose.tools import assert_equal - -from camelot.pdf import Pdf -from camelot.lattice import Lattice - - -testdir = os.path.dirname(os.path.abspath(__file__)) - - -def test_lattice_basic(): - - data = [ - ["Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""], - ["","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle"], - ["2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%"], - ["2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%"], - ["4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%"], - ["2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"], - ["4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"] - ] - pdfname = os.path.join(testdir, - "tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.pdf") - manager = Pdf(Lattice(), pdfname, pagenos=[{'start': 2, 'end': 2}], - clean=True) - tables = manager.extract() - assert_equal(tables['page-2']['table-1']['data'], data) - - -def test_lattice_fill(): - - data = [ - ["Plan Type","County","Plan Name","Totals"], - ["GMC","Sacramento","Anthem Blue Cross","164,380"], - ["GMC","Sacramento","Health Net","126,547"], - ["GMC","Sacramento","Kaiser Foundation","74,620"], - ["GMC","Sacramento","Molina Healthcare","59,989"], - 
["GMC","San Diego","Care 1st Health Plan","71,831"], - ["GMC","San Diego","Community Health Group","264,639"], - ["GMC","San Diego","Health Net","72,404"], - ["GMC","San Diego","Kaiser","50,415"], - ["GMC","San Diego","Molina Healthcare","206,430"], - ["GMC","Total GMC Enrollment","","1,091,255"], - ["COHS","Marin","Partnership Health Plan of CA","36,006"], - ["COHS","Mendocino","Partnership Health Plan of CA","37,243"], - ["COHS","Napa","Partnership Health Plan of CA","28,398"], - ["COHS","Solano","Partnership Health Plan of CA","113,220"], - ["COHS","Sonoma","Partnership Health Plan of CA","112,271"], - ["COHS","Yolo","Partnership Health Plan of CA","52,674"], - ["COHS","Del Norte","Partnership Health Plan of CA","11,242"], - ["COHS","Humboldt","Partnership Health Plan of CA","49,911"], - ["COHS","Lake","Partnership Health Plan of CA","29,149"], - ["COHS","Lassen","Partnership Health Plan of CA","7,360"], - ["COHS","Modoc","Partnership Health Plan of CA","2,940"], - ["COHS","Shasta","Partnership Health Plan of CA","61,763"], - ["COHS","Siskiyou","Partnership Health Plan of CA","16,715"], - ["COHS","Trinity","Partnership Health Plan of CA","4,542"], - ["COHS","Merced","Central California Alliance for Health","123,907"], - ["COHS","Monterey","Central California Alliance for Health","147,397"], - ["COHS","Santa Cruz","Central California Alliance for Health","69,458"], - ["COHS","Santa Barbara","CenCal","117,609"], - ["COHS","San Luis Obispo","CenCal","55,761"], - ["COHS","Orange","CalOptima","783,079"], - ["COHS","San Mateo","Health Plan of San Mateo","113,202"], - ["COHS","Ventura","Gold Coast Health Plan","202,217"], - ["COHS","Total COHS Enrollment","","2,176,064"], - ["Subtotal for Two-Plan, Regional Model, GMC and COHS","","","10,132,022"], - ["PCCM","Los Angeles","AIDS Healthcare Foundation","828"], - ["PCCM","San Francisco","Family Mosaic","25"], - ["PCCM","Total PHP Enrollment","","853"], - ["All Models Total Enrollments","","","10,132,875"], - ["Source: Data Warehouse 12/14/15","","",""] - ] - pdfname = os.path.join(testdir, 'row_span_1.pdf') - manager = Pdf(Lattice(fill='v', scale=40), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables['page-1']['table-1']['data'], data) - - -def test_lattice_invert(): - - data = [ - ["State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"], - ["Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000"], - ["Rajasthan","2.12.2009 to 19.12.2009","","","","","",""], - ["Gujarat","20.12.2009 to 3.1.2010","6","13","6.03","3,810","2,317","1,453"], - ["Maharashtra","4.01.2010 to 1.2.2010","13","26","1.27","5,680","9,027","4,153"], - ["Karnataka","2.2.2010 to 22.2.2010","11","19","1.80","5,741","3,658","3,183"], - ["Kerala","23.2.2010 to 11.3.2010","9","17","1.42","3,559","2,173","855"], - ["Total","","47","92","11.81","22,455","19,584","10,644"] - ] - pdfname = os.path.join(testdir, 'lines_in_background_1.pdf') - manager = Pdf(Lattice(invert=True), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables['page-1']['table-2']['data'], data) - - -def test_lattice_table_rotation(): - - data = [ - ["State","Nutritional Assessment (No. of individuals)","","","","IYCF Practices (No. of mothers: 2011-12)","Blood Pressure (No. of adults: 2011-12)","","Fasting Blood Sugar (No. 
of adults:2011-12)",""], - ["","1975-79","1988-90","1996-97","2011-12","","Men","Women","Men","Women"], - ["Kerala","5738","6633","8864","8297","245","2161","3195","1645","2391"], - ["Tamil Nadu","7387","10217","5813","7851","413","2134","2858","1119","1739"], - ["Karnataka","6453","8138","12606","8958","428","2467","2894","1628","2028"], - ["Andhra Pradesh","5844","9920","9545","8300","557","1899","2493","1111","1529"], - ["Maharashtra","5161","7796","6883","9525","467","2368","2648","1417","1599"], - ["Gujarat","4403","5374","4866","9645","477","2687","3021","2122","2503"], - ["Madhya Pradesh","*","*","*","7942","470","1965","2150","1579","1709"], - ["Orissa","3756","5540","12024","8473","398","2040","2624","1093","1628"], - ["West Bengal","*","*","*","8047","423","2058","2743","1413","2027"], - ["Uttar Pradesh","*","*","*","9860","581","2139","2415","1185","1366"], - ["Pooled","38742","53618","60601","86898","4459","21918","27041","14312","18519"] - ] - pdfname = os.path.join(testdir, 'left_rotated_table_1.pdf') - manager = Pdf(Lattice(), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables['page-1']['table-1']['data'], data) - - pdfname = os.path.join(testdir, 'right_rotated_table_1.pdf') - manager = Pdf(Lattice(), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables['page-1']['table-1']['data'], data) \ No newline at end of file diff --git a/tests/test_stream.py b/tests/test_stream.py deleted file mode 100644 index a947b2b..0000000 --- a/tests/test_stream.py +++ /dev/null @@ -1,220 +0,0 @@ -# coding: utf8 -import os - -from nose.tools import assert_equal - -from camelot.pdf import Pdf -from camelot.stream import Stream - - -testdir = os.path.dirname(os.path.abspath(__file__)) - - -def test_stream_basic(): - - data = [ - ["", "Table 6.", ""], - ["", "U.S. Production, Imports, Exports, and Net Supply of Conventional Pesticides", ""], - ["", "at Producer Level, 1994/95 Estimates.", ""], - ["", "Active Ingredient", "Sales Value"], - ["", "(in billions of lbs.)", "(in billions of dollars)"], - ["Category", "1994/95", "1994/95"], - ["U.S. Production", "1.3", "7.0"], - ["U.S. Imports", "0.2", "2.2"], - ["Total Supply", "1.5", "9.2"], - ["U.S. 
Exports", "0.5", "2.6"], - ["Net Supply/Usage", "1.0", "6.6"], - ["SOURCE:", "EPA estimates based on ACPA Surveys, Department of Commerce Publications, tabulations and other", ""], - ["sources.", "", ""], - ["16\xe2\x80\x9494/95 Pesticides Industry Sales And Usage", "", ""] - ] - - pdfname = os.path.join(testdir, "tabula_test_pdfs/us-024.pdf") - manager = Pdf(Stream(), pdfname, pagenos=[{"start": 1, "end": 1}], - clean=True) - tables = manager.extract() - assert_equal(tables["page-1"]["table-1"]["data"], data) - - -def test_stream_missing_value(): - - data = [ - ["Bhandara - Key Indicators","","","",""], - ["","DLHS-4 (2012-13)","","DLHS-3 (2007-08)",""], - ["Indicators","TOTAL","RURAL","TOTAL","RURAL"], - ["Reported Prevalence of Morbidity","","","",""], - ["Any Injury .....................................................................................................................................","1.9","2.1","",""], - ["Acute Illness .................................................................................................................................","4.5","5.6","",""], - ["Chronic Illness ..............................................................................................................................","5.1","4.1","",""], - ["Reported Prevalence of Chronic Illness during last one year (%)","","","",""], - ["Disease of respiratory system ......................................................................................................","11.7","15.0","",""], - ["Disease of cardiovascular system ................................................................................................","8.9","9.3","",""], - ["Persons suffering from tuberculosis .............................................................................................","2.2","1.5","",""], - ["Anaemia Status by Haemoglobin Level14 (%)","","","",""], - ["Children (6-59 months) having anaemia ......................................................................................","68.5","71.9","",""], - ["Children (6-59 months) having severe anaemia ..........................................................................","6.7","9.4","",""], - ["Children (6-9 Years) having anaemia - Male ................................................................................","67.1","71.4","",""], - ["Children (6-9 Years) having severe anaemia - Male ....................................................................","4.4","2.4","",""], - ["Children (6-9 Years) having anaemia - Female ...........................................................................","52.4","48.8","",""], - ["Children (6-9 Years) having severe anaemia - Female ................................................................","1.2","0.0","",""], - ["Children (6-14 years) having anaemia - Male .............................................................................","50.8","62.5","",""], - ["Children (6-14 years) having severe anaemia - Male ..................................................................","3.7","3.6","",""], - ["Children (6-14 years) having anaemia - Female .........................................................................","48.3","50.0","",""], - ["Children (6-14 years) having severe anaemia - Female ..............................................................","4.3","6.1","",""], - ["Children (10-19 Years15) having anaemia - Male .........................................................................","37.9","51.2","",""], - ["Children (10-19 Years15) having severe anaemia - Male 
.............................................................","3.5","4.0","",""], - ["Children (10-19 Years15) having anaemia - Female .....................................................................","46.6","52.1","",""], - ["Children (10-19 Years15) having severe anaemia - Female .........................................................","6.4","6.5","",""], - ["Adolescents (15-19 years) having anaemia ................................................................................","39.4","46.5","",""], - ["Adolescents (15-19 years) having severe anaemia .....................................................................","5.4","5.1","",""], - ["Pregnant women (15-49 aged) having anaemia ..........................................................................","48.8","51.5","",""], - ["Pregnant women (15-49 aged) having severe anaemia ..............................................................","7.1","8.8","",""], - ["Women (15-49 aged) having anaemia .........................................................................................","45.2","51.7","",""], - ["Women (15-49 aged) having severe anaemia .............................................................................","4.8","5.9","",""], - ["Persons (20 years and above) having anaemia ...........................................................................","37.8","42.1","",""], - ["Persons (20 years and above) having Severe anaemia ..............................................................","4.6","4.8","",""], - ["Blood Sugar Level (age 18 years and above) (%)","","","",""], - ["Blood Sugar Level >140 mg/dl (high) ...........................................................................................","12.9","11.1","",""], - ["Blood Sugar Level >160 mg/dl (very high) ...................................................................................","7.0","5.1","",""], - ["Hypertension (age 18 years and above) (%)","","","",""], - ["Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg ) ..............................","23.8","22.8","",""], - ["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""], - ["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""], - ["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 
15 Excluding age group 19 years","","","",""], - ["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""], - ["4","","","",""] - ] - pdfname = os.path.join(testdir, "missing_values.pdf") - manager = Pdf(Stream(flag_size=False), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables["page-1"]["table-1"]["data"], data) - - -def test_stream_single_table_area(): - - data = [ - ["","One Withholding"], - ["Payroll Period","Allowance"], - ["Weekly","$71.15"], - ["Biweekly","142.31"], - ["Semimonthly","154.17"], - ["Monthly","308.33"], - ["Quarterly","925.00"], - ["Semiannually","1,850.00"], - ["Annually","3,700.00"], - ["Daily or Miscellaneous","14.23"], - ["(each day of the payroll period)",""] - ] - pdfname = os.path.join(testdir, "tabula_test_pdfs/us-007.pdf") - manager = Pdf(Stream(table_area=["320,500,573,335"]), - pdfname, pagenos=[{"start": 1, "end": 1}], clean=True) - tables = manager.extract() - assert_equal(tables["page-1"]["table-1"]["data"], data) - - -def test_stream_columns(): - - data = [ - ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"], - ["Entidad","","Municipio","","Localidad",""], - ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"], - ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"], - ["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"], - ["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"], - ["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"], - ["01","Aguascalientes","001","Aguascalientes","0106","Arellano"], - ["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"], - ["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"], - ["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"], - ["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"], - ["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"], - ["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"], - ["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"], - ["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"], - ["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"], - ["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"], - ["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"], - ["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"], - ["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"], - ["01","Aguascalientes","001","Aguascalientes","0141","Cobos"], - ["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"], - ["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"], - ["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"], - ["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"], - ["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"], - ["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"], - ["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"], - ["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"], - ["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"], - 
["01","Aguascalientes","001","Aguascalientes","0182","Dolores"], - ["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"], - ["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"], - ["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"], - ["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"], - ["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"], - ["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"], - ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"], - ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"], - ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"], - ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"], - ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"], - ] - pdfname = os.path.join(testdir, "mexican_towns.pdf") - manager = Pdf(Stream(columns=["67,180,230,425,475"], ytol=[10]), pdfname, - clean=True) - tables = manager.extract() - assert_equal(tables["page-1"]["table-1"]["data"], data) - - -def test_stream_table_rotation(): - - data = [ - ["","","Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","",""], - ["","","","","","","Modern method","","","","","","","Traditional method","","","",""], - ["","","","Any","","","","","","","Other","Any","","","","Not","","Number"], - ["","","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"], - ["","Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"], - ["","Caste/tribe","","","","","","","","","","","","","","","",""], - ["","Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"], - ["","Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"], - ["","Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"], - ["","Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"], - ["","Wealth index","","","","","","","","","","","","","","","",""], - ["","Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"], - ["","Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"], - ["","Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"], - ["","Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"], - ["","Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"], - ["","Number of living children","","","","","","","","","","","","","","","",""], - ["","No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"], - ["","1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"], - ["","1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"], - ["","No 
sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"], - ["","2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"], - ["","1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"], - ["","No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"], - ["","3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"], - ["","1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"], - ["","No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"], - ["","4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"], - ["","1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"], - ["","No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"], - ["","Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"], - ["","NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"], - ["","NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"], - ["","","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""], - ["","not shown separately.","","","","","","","","","","","","","","","",""], - ["","na = Not available","","","","","","","","","","","","","","","",""], - ["","","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""], - ["","( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""], - ["","","","","","","","","54","","","","","","","","",""] - ] - pdfname = os.path.join(testdir, "left_rotated_table_2.pdf") - manager = Pdf(Stream(flag_size=False), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables["page-1"]["table-1"]["data"], data) - - pdfname = os.path.join(testdir, "right_rotated_table_2.pdf") - manager = Pdf(Stream(flag_size=False), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables["page-1"]["table-1"]["data"], data) \ No newline at end of file diff --git a/tests/twotables_1.pdf b/tests/twotables_1.pdf deleted file mode 100644 index cbbeeda..0000000 Binary files a/tests/twotables_1.pdf and /dev/null differ diff --git a/tests/twotables_2.pdf b/tests/twotables_2.pdf deleted file mode 100644 index 5249887..0000000 Binary files a/tests/twotables_2.pdf and /dev/null differ diff --git a/tools/camelot b/tools/camelot deleted file mode 100755 index 0918fe1..0000000 --- a/tools/camelot +++ /dev/null @@ -1,694 +0,0 @@ -#!/usr/bin/env python2 -from __future__ import print_function -import os -import csv -import sys -import glob -import time -import zipfile -import warnings -import cStringIO - -import numpy as np -from docopt import docopt -from collections import Counter -import matplotlib.pyplot as plt -from PyPDF2 import 
PdfFileReader
-
-from camelot.pdf import Pdf
-from camelot.lattice import Lattice
-from camelot.stream import Stream
-from camelot.ocr import OCRLattice, OCRStream
-from camelot import utils
-
-
-doc = """
-Camelot: PDF parsing made simpler!
-
-usage:
- camelot [options] <method> [<args>...]
-
-options:
- -h, --help                Show this screen.
- -v, --version             Show version.
- -p, --pages <pageno>      Comma-separated list of page numbers.
-                           Example: -p 1,3-6,10  [default: 1]
- -P, --parallel            Parallelize the parsing process.
- -f, --format <format>     Output format. (csv,tsv,zip,html,json,xlsx) [default: csv]
- -l, --log                 Log to file.
- -o, --output <directory>  Output directory.
- -M, --cmargin <cmargin>   Char margin. Chars closer than cmargin are
-                           grouped together to form a word. [default: 1.0]
- -L, --lmargin <lmargin>   Line margin. Lines closer than lmargin are
-                           grouped together to form a textbox. [default: 0.5]
- -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
-                           if distance between words is greater than word
-                           margin. [default: 0.1]
- -J, --split_text          Split text lines if they span across multiple cells.
- -K, --flag_size           Flag substring if its size differs from the whole string.
-                           Useful for super and subscripts.
- -X, --print-stats         List stats on the parsing process.
- -Y, --save-stats          Save stats to a file.
- -Z, --plot <dist>         Plot distributions. (page,all,rc)
-
-camelot methods:
- lattice  Looks for lines between data.
- stream   Looks for spaces between data.
- ocrl     Lattice, but for images.
- ocrs     Stream, but for images.
-
-See 'camelot <method> -h' for more information on a specific method.
-"""
-
-lattice_doc = """
-Lattice method looks for lines between text to form a table.
-
-usage:
- camelot lattice [-t <tarea>...] [-F <fill>...] [-m <mtol>...]
-                 [-j <jtol>...] [options] [--] <file>
-
-options:
- -t, --tarea <tarea>            Specific table areas to analyze.
- -F, --fill <fill>              Fill data in horizontal and/or vertical spanning
-                                cells. Example: -F h, -F v, -F hv
- -m, --mtol <mtol>              Tolerance to account for when merging lines
-                                which are very close. [default: 2]
- -j, --jtol <jtol>              Tolerance to account for when matching line endings
-                                with intersections. [default: 2]
- -b, --blocksize <blocksize>    See adaptive threshold doc. [default: 15]
- -C, --constant <constant>      See adaptive threshold doc. [default: -2]
- -s, --scale <scale>            Scaling factor. Large scaling factor leads to
-                                smaller lines being detected. [default: 15]
- -I, --iterations <iterations>  Number of iterations for dilation. [default: 0]
- -i, --invert                   Invert pdf image to make sure that lines are
-                                in foreground.
- -T, --shift_text <shift_text>  Specify where the text in a spanning cell
-                                should flow, order-sensitive. [default: lt]
- -d, --debug <debug>            Debug by visualizing pdf geometry.
-                                (contour,line,joint,table) Example: -d table
-"""
-
-stream_doc = """
-Stream method looks for whitespaces between text to form a table.
-
-usage:
- camelot stream [-t <tarea>...] [-c <columns>...] [-m <mtol>...]
-                [-y <ytol>...] [options] [--] <file>
-
-options:
- -t, --tarea <tarea>      Specific table areas to analyze.
- -c, --columns <columns>  Comma-separated list of column x-coordinates.
-                          Example: -c 10.1,20.2,30.3
- -m, --mtol <mtol>        Tolerance to account for when merging columns
-                          together. [default: 0]
- -y, --ytol <ytol>        Tolerance to account for when grouping rows
-                          together. [default: 2]
- -d, --debug              Debug by visualizing textboxes.
-"""
-
-
-ocrl_doc = """
-Lattice, but for images.
-
-usage:
- camelot ocrl [-t <tarea>...] [-m <mtol>...] [options] [--] <file>
-
-options:
- -t, --tarea <tarea>            Specific table areas to analyze.
- -m, --mtol <mtol>              Tolerance to account for when merging lines
-                                which are very close. [default: 2]
- -b, --blocksize <blocksize>    See adaptive threshold doc. [default: 15]
- -C, --constant <constant>      See adaptive threshold doc. [default: -2]
- -D, --dpi <dpi>                Dots per inch, specify image quality to be used for OCR.
-                                [default: 300]
- -g, --layout <layout>          Tesseract page segmentation mode. [default: 7]
- -l, --lang <lang>              Specify language to be used for OCR. [default: eng]
- -s, --scale <scale>            Scaling factor. Large scaling factor leads to
-                                smaller lines being detected. [default: 15]
- -I, --iterations <iterations>  Number of iterations for dilation. [default: 0]
- -d, --debug <debug>            Debug by visualizing pdf geometry.
-                                (contour,line,joint,table) Example: -d table
-"""
-
-ocrs_doc = """
-Stream, but for images.
-
-usage:
- camelot ocrs [-t <tarea>...] [-c <columns>...] [options] [--] <file>
-
-options:
- -t, --tarea <tarea>            Specific table areas to analyze.
- -c, --columns <columns>        Comma-separated list of column x-coordinates.
-                                Example: -c 10.1,20.2,30.3
- -b, --blocksize <blocksize>    See adaptive threshold doc. [default: 15]
- -C, --constant <constant>      See adaptive threshold doc. [default: -2]
- -D, --dpi <dpi>                Dots per inch, specify image quality to be used for OCR.
-                                [default: 300]
- -g, --layout <layout>          Tesseract page segmentation mode. [default: 7]
- -l, --lang <lang>              Specify language to be used for OCR. [default: eng]
- -G, --line-scale <line_scale>  Line scaling factor. [default: 15]
- -S, --char-scale <char_scale>  Char scaling factor. [default: 200]
- -d, --debug                    Debug by visualizing image.
-"""
-
-
-def plot_table_barchart(r, c, p, pno, tno):
-    row_idx = [i + 1 for i, row in enumerate(r)]
-    col_idx = [i + 1 for i, col in enumerate(c)]
-    r_index = np.arange(len(r))
-    c_index = np.arange(len(c))
-    width = 0.7
-
-    plt.figure(figsize=(8, 6))
-    plt.subplot(2, 1, 1)
-    plt.title('Percentage of empty cells in table: {0:.2f}'.format(p))
-    plt.xlabel('row index')
-    plt.ylabel('number of non-empty cells in row')
-    plt.bar(r_index, r)
-    plt.xticks(r_index + width * 0.5, row_idx)
-    plt.ylim(0, len(c))
-
-    plt.subplot(2, 1, 2)
-    plt.xlabel('column index')
-    plt.ylabel('number of non-empty cells in column')
-    plt.bar(c_index, c)
-    plt.xticks(c_index + width * 0.5, col_idx)
-    plt.ylim(0, len(r))
-    plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300)
-
-
-def plot_all_barchart(data, output):
-    r_empty_cells = []
-    for page_number in data.keys():
-        page = data[page_number]
-        for table_number in page.keys():
-            table = page[table_number]
-            r_empty_cells.extend([r / float(table['ncols']) for r in table['r_nempty_cells']])
-    c = Counter(r_empty_cells)
-    if 0.0 not in c:
-        c.update({0.0: 0})
-    if 1.0 not in c:
-        c.update({1.0: 0})
-
-    plt.figure(figsize=(8, 6))
-    plt.xlabel('percentage of non-empty cells in a row')
-    plt.ylabel('percentage of rows processed')
-    row_p = [count / float(sum(c.values())) for count in c.values()]
-    plt.bar(c.keys(), row_p, align='center', width=0.05)
-    plt.ylim(0, 1.0)
-    plt.savefig(''.join([output, '_all.png']), dpi=300)
-
-
-def plot_rc_piechart(data, output):
-    from matplotlib import cm
-
-    tables = 0
-    rows, cols = [], []
-    for page_number in data.keys():
-        page = data[page_number]
-        for table_number in page.keys():
-            table = page[table_number]
-            tables += 1
-            rows.append(table['nrows'])
-            cols.append(table['ncols'])
-
-    r = Counter(rows)
-    c = Counter(cols)
-
-    plt.figure(figsize=(8, 6))
-    cs1 = cm.Set1(np.arange(len(r)) / float(len(r)))
-    ax1 = plt.subplot(211, aspect='equal')
-    ax1.pie(r.values(), colors=cs1, labels=r.keys(), startangle=90)
-    ax1.set_title('row distribution across tables')
-
-    cs2 = cm.Set1(np.arange(len(c)) / float(len(c)))
-    ax2 = plt.subplot(212, aspect='equal')
-    ax2.pie(c.values(), colors=cs2, labels=c.keys(), startangle=90)
-    ax2.set_title('column distribution across tables')
-    plt.savefig(''.join([output, '_rc.png']), dpi=300)
-
-
-def print_stats(data, p_time):
-    from operator import itemgetter
-    from itertools import groupby
-
-    scores = []
-    continuous_tables = []
-    total_tables = 0
-    for page_number in data.keys():
-        page = data[page_number]
-        total_tables += len(page.keys())
-        for table_number in page.keys():
-            table = page[table_number]
-            continuous_tables.append((page_number, table_number, table['ncols']))
-            scores.append(table['score'])
-    avg_score = np.mean(scores)
-
-    ct_pages = []
-    header_string = ""
-    if len(continuous_tables) > 1:
-        tables = sorted(continuous_tables, key=lambda x: (int(x[0][5:]), int(x[1][6:])))
-        for k, g in groupby(tables, key=itemgetter(2)):
-            g = list(g)
-            tables_same_ncols = set([int(t[0][5:]) for t in g])
-            tables_same_ncols = sorted(list(tables_same_ncols))
-            for K, G in groupby(enumerate(tables_same_ncols), key=lambda (i, x): i - x):
-                G = list(G)
-                ct_pages.append((str(G[0][1]), str(G[-1][1])))
-
-        result_headers = []
-        for ct in ct_pages:
-            header_idx = {}
-            possible_headers = []
-            ncols = 0
-            for page_number in range(int(ct[0]), int(ct[1]) + 1):
-                page = data['page-{0}'.format(page_number)]
-                for table_number in page.keys():
-                    table = page[table_number]
-                    ncols = table['ncols']
-                    for i, row in enumerate(table['data']):
-                        try:
-                            header_idx[tuple(row)].append(i)
-                        except KeyError:
-                            header_idx[tuple(row)] = [i]
-            possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]), reverse=True)[:10]
-            possible_headers = filter(lambda z: len(z) == ncols,
-                                      [filter(lambda x: x != '', p_h) for p_h in possible_headers])
-            modes = []
-            for p_h in possible_headers:
-                try:
-                    modes.append((p_h, max(set(header_idx[p_h]), key=header_idx[p_h].count)))
-                except KeyError:
-                    pass
-            header = modes[modes.index(min(modes, key=lambda x: x[1]))][0]
-            result_headers.append(header)
-
-        header_string = "Multi-page table headers*:\n"
-        header_string = ''.join([header_string, '\n'.join(['pages {0} -> {1}{2}{3}'.format(
-            '-'.join([cr[0][0], cr[0][1]]), '"', '","'.join(cr[1]), '"') for cr in zip(
-                ct_pages, result_headers)])])
-
-    avg_time = "Time taken per page: {0:.2f} seconds\n".format(
-        p_time / float(len(data))) if len(data) not in [0, 1] else ""
-    equal_ncols = "\nMulti-page tables on*: {0}\n".format(
-        ', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) not in [0, 1] else ""
-    stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols]
-    stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n"
-                   "{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats))
-
-    print(''.join([stat_string, header_string]))
-
-
-def convert_to_html(table):
-    html = ''
-    html = ''.join([html, '<table>\n'])
-    for row in table:
-        html = ''.join([html, '<tr>\n'])
-        for data in row:
-            html = ''.join([html, '<td>', data, '</td>\n'])
-        html = ''.join([html, '</tr>\n'])
-    html = ''.join([html, '</table>\n'])
-    return html
-
-
-def write_to_disk(data, f='csv', output=None, filename=None):
-    # raise something if filename and/or output are None
-    fname = os.path.basename(filename)
-    froot, __ = os.path.splitext(fname)
-    if f in ['csv', 'tsv']:
-        delimiter = ',' if f == 'csv' else '\t'
-        for page_number in sorted(data.keys()):
-            if data[page_number] is not None:
-                for table_number in sorted(data[page_number].keys()):
-                    dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f)
-                    with open(os.path.join(output, dsvname), 'w') as outfile:
-                        writer = csv.writer(
-                            outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
-                        for row in data[page_number][table_number]['data']:
-                            writer.writerow(row)
-    elif f == 'zip':
-        csv_zip = os.path.join(output, '{0}.zip'.format(froot))
-        with zipfile.ZipFile(csv_zip, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) \
-                as zfile:
-            for page_number in sorted(data.keys()):
-                if data[page_number] is not None:
-                    for table_number in sorted(data[page_number].keys()):
-                        csvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), 'csv')
-                        outfile = cStringIO.StringIO()
-                        writer = csv.writer(
-                            outfile, delimiter=',', quoting=csv.QUOTE_ALL)
-                        for row in data[page_number][table_number]['data']:
-                            writer.writerow(row)
-                        zfile.writestr(csvname, outfile.getvalue())
-                        outfile.close()
-    elif f == 'html':
-        htmlname = '{0}.html'.format(froot)
-        for page_number in sorted(data.keys()):
-            for table_number in sorted(data[page_number].keys()):
-                with open(os.path.join(output, htmlname), 'a') as htmlfile:
-                    htmlfile.write(convert_to_html(data[page_number][table_number]['data']))
-    elif f == 'json':
-        import json
-        with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \
-                as jsonfile:
-            json.dump(data, jsonfile)
-    elif f == 'xlsx':
-        try:
-            from pyexcel_xlsx import save_data
-            from collections import OrderedDict
-            xlsx_data = OrderedDict()
-            for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                for table_number in sorted(data[page_number].keys(), key=lambda x: int(x[6:])):
-                    sheet_name = ''.join([page_number, '_', table_number])
-                    xlsx_data.update({sheet_name:
-                        [row for row in data[page_number][table_number]['data']]})
-            save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data)
-        except ImportError:
-            print("link to install docs")
-
-
-if __name__ == '__main__':
-    start_time = time.time()
-
-    args = docopt(doc, version='0.1', options_first=True)
-    argv = [args['<method>']] + args['<args>']
-    if args['<method>'] == 'lattice':
-        args.update(docopt(lattice_doc, argv=argv))
-    elif args['<method>'] == 'stream':
-        args.update(docopt(stream_doc, argv=argv))
-    elif args['<method>'] == 'ocrl':
-        args.update(docopt(ocrl_doc, argv=argv))
-    elif args['<method>'] == 'ocrs':
-        args.update(docopt(ocrs_doc, argv=argv))
-
-    filename = args['<file>']
-    filedir = os.path.dirname(args['<file>'])
-    logname, __ = os.path.splitext(filename)
-    logname = ''.join([logname, '.log'])
-    scorename, __ = os.path.splitext(filename)
-    scorename = ''.join([scorename, '_info.csv'])
-    pngname, __ = os.path.splitext(filename)
-
-    FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
-    if args['--log'] is not None:
-        logger = utils.setup_logging(args['--log'])
-    else:
-        logger = utils.setup_logging(os.path.join(os.getcwd(), 'camelot.log'))
-
-    p = []
-    if args['--pages'] == '1':
-        p.append({'start': 1, 'end': 1})
-    else:
-        infile = PdfFileReader(open(filename, 'rb'), strict=False)
-        if args['--pages'] == 'all':
-            p.append({'start': 1, 'end': infile.getNumPages()})
-        else:
-            for r in args['--pages'].split(','):
-                if '-' in r:
-                    a, b = r.split('-')
-                    if b == 'end':
-                        b = infile.getNumPages()
-                    p.append({'start': int(a), 'end': int(b)})
-                else:
-                    p.append({'start': int(r), 'end': int(r)})
-
-    logger.info('Applying {0} method on {1}'.format(args['<method>'],
-                os.path.basename(filename)))
-    margins = (float(args['--cmargin']), float(args['--lmargin']),
-               float(args['--wmargin']))
-    if args['<method>'] == 'lattice':
-        try:
-            kwargs = {
-                'table_area': args['--tarea'] if args['--tarea'] else None,
-                'fill': args['--fill'] if args['--fill'] else None,
-                'mtol': [int(m) for m in args['--mtol']],
-                'jtol': [int(j) for j in args['--jtol']],
-                'blocksize': int(args['--blocksize']),
-                'threshold_constant': float(args['--constant']),
-                'scale': int(args['--scale']),
-                'iterations': int(args['--iterations']),
-                'invert': args['--invert'],
-                'margins': margins,
-                'split_text': args['--split_text'],
-                'flag_size': args['--flag_size'],
-                'shift_text': list(args['--shift_text']) if args['--shift_text'] else ['l', 't'],
-                'debug': args['--debug']
-            }
-            manager = Pdf(Lattice(**kwargs), filename, pagenos=p, clean=True,
-                          parallel=args['--parallel'])
-            data = manager.extract()
-
-            processing_time = time.time() - start_time
-            logger.info("Finished processing in " + str(processing_time) + " seconds")
-
-            if args['--plot']:
-                if args['--output']:
-                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
-                plot_type = args['--plot'].split(',')
-                if 'page' in plot_type:
-                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                        page = data[page_number]
-                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
-                            table = page[table_number]
-                            plot_table_barchart(table['r_nempty_cells'],
-                                                table['c_nempty_cells'],
-                                                table['empty_p'],
-                                                page_number,
-                                                table_number)
-
-                if 'all' in plot_type:
-                    plot_all_barchart(data, pngname)
-
-                if 'rc' in plot_type:
-                    plot_rc_piechart(data, pngname)
-
-            if args['--print-stats']:
-                print_stats(data, processing_time)
-
-            if args['--save-stats']:
-                if args['--output']:
-                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
-                with open(scorename, 'w') as score_file:
-                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
-                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                        page = data[page_number]
-                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
-                            table = page[table_number]
-                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
-                                ''.join([page_number, '_', table_number]),
-                                table['nrows'],
-                                table['ncols'],
-                                table['empty_p'],
-                                table['line_p'],
-                                table['text_p'],
-                                table['score']))
-            if args['--debug']:
-                manager.debug_plot()
-        except Exception as e:
-            logger.exception(e.message, exc_info=True)
-            sys.exit()
-    elif args['<method>'] == 'stream':
-        try:
-            kwargs = {
-                'table_area': args['--tarea'] if args['--tarea'] else None,
-                'columns': args['--columns'] if args['--columns'] else None,
-                'ytol': [int(y) for y in args['--ytol']],
-                'mtol': [int(m) for m in args['--mtol']],
-                'margins': margins,
-                'split_text': args['--split_text'],
-                'flag_size': args['--flag_size'],
-                'debug': args['--debug']
-            }
-            manager = Pdf(Stream(**kwargs), filename, pagenos=p, clean=True,
-                          parallel=args['--parallel'])
-            data = manager.extract()
-
-            processing_time = time.time() - start_time
-            logger.info("Finished processing in " + str(processing_time) + " seconds")
-
-            if args['--plot']:
-                if args['--output']:
-                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
-                plot_type = args['--plot'].split(',')
-                if 'page' in plot_type:
-                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                        page = data[page_number]
-                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
-                            table = page[table_number]
-                            plot_table_barchart(table['r_nempty_cells'],
-                                                table['c_nempty_cells'],
-                                                table['empty_p'],
-                                                page_number,
-                                                table_number)
-
-                if 'all' in plot_type:
-                    plot_all_barchart(data, pngname)
-
-                if 'rc' in plot_type:
-                    plot_rc_piechart(data, pngname)
-
-            if args['--print-stats']:
-                print_stats(data, processing_time)
-
-            if args['--save-stats']:
-                if args['--output']:
-                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
-                with open(scorename, 'w') as score_file:
-                    score_file.write('table,nrows,ncols,empty_p,score\n')
-                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                        page = data[page_number]
-                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
-                            table = page[table_number]
-                            score_file.write('{0},{1},{2},{3},{4}\n'.format(
-                                ''.join([page_number, '_', table_number]),
-                                table['nrows'],
-                                table['ncols'],
-                                table['empty_p'],
-                                table['score']))
-
-            if args['--debug']:
-                manager.debug_plot()
-        except Exception as e:
-            logger.exception(e.message, exc_info=True)
-            sys.exit()
-    elif args['<method>'] == 'ocrl':
-        try:
-            kwargs = {
-                'table_area': args['--tarea'] if args['--tarea'] else None,
-                'mtol': [int(m) for m in args['--mtol']],
-                'blocksize': int(args['--blocksize']),
-                'threshold_constant': float(args['--constant']),
-                'dpi': int(args['--dpi']),
-                'layout': int(args['--layout']),
-                'lang': args['--lang'],
-                'scale': int(args['--scale']),
-                'iterations': int(args['--iterations']),
-                'debug': args['--debug']
-            }
-            manager = Pdf(OCRLattice(**kwargs), filename, pagenos=p, clean=True,
-                          parallel=args['--parallel'])
-            data = manager.extract()
-
-            processing_time = time.time() - start_time
-            logger.info("Finished processing in " + str(processing_time) + " seconds")
-
-            if args['--plot']:
-                if args['--output']:
-                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
-                plot_type = args['--plot'].split(',')
-                if 'page' in plot_type:
-                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                        page = data[page_number]
-                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
-                            table = page[table_number]
-                            plot_table_barchart(table['r_nempty_cells'],
-                                                table['c_nempty_cells'],
-                                                table['empty_p'],
-                                                page_number,
-                                                table_number)
-
-                if 'all' in plot_type:
-                    plot_all_barchart(data, pngname)
-
-                if 'rc' in plot_type:
-                    plot_rc_piechart(data, pngname)
-
-            if args['--print-stats']:
-                print_stats(data, processing_time)
-
-            if args['--save-stats']:
-                if args['--output']:
-                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
-                with open(scorename, 'w') as score_file:
-                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
-                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                        page = data[page_number]
-                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
-                            table = page[table_number]
-                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
-                                ''.join([page_number, '_', table_number]),
-                                table['nrows'],
-                                table['ncols'],
-                                table['empty_p'],
-                                table['line_p'],
-                                table['text_p'],
-                                table['score']))
-            if args['--debug']:
-                manager.debug_plot()
-        except Exception as e:
-            logger.exception(e.message, exc_info=True)
-            sys.exit()
-    elif args['<method>'] == 'ocrs':
-        try:
-            kwargs = {
-                'table_area': args['--tarea'] if args['--tarea'] else None,
-                'columns': args['--columns'] if args['--columns'] else None,
-                'blocksize': int(args['--blocksize']),
-                'threshold_constant': float(args['--constant']),
-                'dpi': int(args['--dpi']),
-                'layout': int(args['--layout']),
-                'lang': args['--lang'],
-                'line_scale': int(args['--line-scale']),
-                'char_scale': int(args['--char-scale']),
-                'debug': args['--debug']
-            }
-            manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,
-                          parallel=args['--parallel'])
-            data = manager.extract()
-
-            processing_time = time.time() - start_time
-            logger.info("Finished processing in " + str(processing_time) + " seconds")
-
-            if args['--plot']:
-                if args['--output']:
-                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
-                plot_type = args['--plot'].split(',')
-                if 'page' in plot_type:
-                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                        page = data[page_number]
-                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
-                            table = page[table_number]
-                            plot_table_barchart(table['r_nempty_cells'],
-                                                table['c_nempty_cells'],
-                                                table['empty_p'],
-                                                page_number,
-                                                table_number)
-
-                if 'all' in plot_type:
-                    plot_all_barchart(data, pngname)
-
-                if 'rc' in plot_type:
-                    plot_rc_piechart(data, pngname)
-
-            if args['--print-stats']:
-                print_stats(data, processing_time)
-
-            if args['--save-stats']:
-                if args['--output']:
-                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
-                with open(scorename, 'w') as score_file:
-                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
-                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                        page = data[page_number]
-                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
-                            table = page[table_number]
-                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
-                                ''.join([page_number, '_', table_number]),
-                                table['nrows'],
-                                table['ncols'],
-                                table['empty_p'],
-                                table['line_p'],
-                                table['text_p'],
-                                table['score']))
-            if args['--debug']:
-                manager.debug_plot()
-        except Exception as e:
-            logger.exception(e.message, exc_info=True)
-            sys.exit()
-
-    if args.get('--debug') is not None and args['--debug']:
-        print("See 'camelot <method> -h' for various parameters you can tweak.")
-    else:
-        output = filedir if args['--output'] is None else args['--output']
-        write_to_disk(data, f=args['--format'],
-                      output=output, filename=filename)