diff --git a/.gitignore b/.gitignore index 5351894..4fd453c 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ dist/ .coverage .pytest_cache/ +_build/ +_static/ diff --git a/README.md b/README.md index 3dbaa79..372cc09 100644 --- a/README.md +++ b/README.md @@ -23,50 +23,9 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat >>> df = tables[0].df -Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF. - -
-Camelot: PDF parsing made simpler!
-
-usage:
- camelot [options] <method> [<args>...]
-
-options:
- -h, --help                Show this screen.
- -v, --version             Show version.
- -V, --verbose             Verbose.
- -p, --pages <pageno>      Comma-separated list of page numbers.
-                           Example: -p 1,3-6,10  [default: 1]
- -P, --parallel            Parallelize the parsing process.
- -f, --format <format>     Output format. (csv,tsv,html,json,xlsx) [default: csv]
- -l, --log                 Log to file.
- -o, --output <directory>  Output directory.
- -M, --cmargin <cmargin>   Char margin. Chars closer than cmargin are
-                           grouped together to form a word. [default: 2.0]
- -L, --lmargin <lmargin>   Line margin. Lines closer than lmargin are
-                           grouped together to form a textbox. [default: 0.5]
- -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
-                           if distance between words is greater than word
-                           margin. [default: 0.1]
- -J, --split_text          Split text lines if they span across multiple cells.
- -K, --flag_size           Flag substring if its size differs from the whole string.
-                           Useful for super and subscripts.
- -X, --print-stats         List stats on the parsing process.
- -Y, --save-stats          Save stats to a file.
- -Z, --plot <dist>         Plot distributions. (page,all,rc)
-
-camelot methods:
- lattice  Looks for lines between data.
- stream   Looks for spaces between data.
-
-See 'camelot <method> -h' for more information on a specific method.
-
- ## Dependencies -Currently, camelot works under Python 2.7. - -The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ghostscript](https://www.ghostscript.com/). +The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/). ## Installation @@ -78,22 +37,22 @@ pip install -U pip setuptools ### Installing dependencies -numpy can be install using `pip`. OpenCV and ghostscript can be installed using your system's default package manager. +tk and ghostscript can be installed using your system's default package manager. #### Linux -* Arch Linux - -
-sudo pacman -S opencv tk ghostscript
-
- * Ubuntu
 sudo apt-get install python-opencv python-tk ghostscript
 
+* Arch Linux + +
+sudo pacman -S opencv tk ghostscript
+
+ #### OS X
@@ -103,7 +62,7 @@ brew install homebrew/science/opencv ghostscript
 Finally, `cd` into the project directory and install by
 
 
-make install
+python setup.py install
 
## Development @@ -118,14 +77,14 @@ git clone https://github.com/socialcopsdev/camelot.git ### Contributing -See [Contributing doc](). +See [Contributing guidelines](). ### Testing
-make test
+python setup.py test
 
## License -BSD License +BSD License \ No newline at end of file diff --git a/camelot/core.py b/camelot/core.py index 2417291..f400fe8 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -8,9 +8,48 @@ import pandas as pd class Cell(object): - """ + """Defines a cell in a table with coordinates relative to a + left-bottom origin. (pdf coordinate space) + + Parameters + ---------- + x1 : float + x-coordinate of left-bottom point. + y1 : float + y-coordinate of left-bottom point. + x2 : float + x-coordinate of right-top point. + y2 : float + y-coordinate of right-top point. + + Attributes + ---------- + lb : tuple + Tuple representing left-bottom coordinates. + lt : tuple + Tuple representing left-top coordinates. + rb : tuple + Tuple representing right-bottom coordinates. + rt : tuple + Tuple representing right-top coordinates. + left : bool + Whether or not cell is bounded on the left. + right : bool + Whether or not cell is bounded on the right. + top : bool + Whether or not cell is bounded on the top. + bottom : bool + Whether or not cell is bounded on the bottom. + hspan : bool + Whether or not cell spans horizontally. + vspan : bool + Whether or not cell spans vertically. + text : string + Text assigned to cell. + bound """ + def __init__(self, x1, y1, x2, y2): self.x1 = x1 self.y1 = y1 @@ -34,37 +73,48 @@ class Cell(object): @property def text(self): - """ - - Returns - ------- - - """ return self._text @text.setter def text(self, t): - """ - - Parameters - ---------- - t - """ self._text = ''.join([self._text, t]) @property def bound(self): - """ - - Returns - ------- - + """The number of sides on which the cell is bounded. """ return self.top + self.bottom + self.left + self.right class Table(object): - """ + """Defines a table with coordinates relative to a left-bottom + origin. (pdf coordinate space) + + Parameters + ---------- + cols : list + List of tuples representing column x-coordinates in increasing + order. 
+ rows : list + List of tuples representing row y-coordinates in decreasing + order. + + Attributes + ---------- + df : object + pandas.DataFrame + shape : tuple + Shape of the table. + accuracy : float + Accuracy with which text was assigned to the cell. + whitespace : float + Percentage of whitespace in the table. + order : int + Table number on pdf page. + page : int + Pdf page number. + data + parsing_report """ def __init__(self, cols, rows): @@ -84,11 +134,7 @@ class Table(object): @property def data(self): - """ - - Returns - ------- - + """Returns two-dimensional list of strings in table. """ d = [] for row in self.cells: @@ -97,11 +143,8 @@ class Table(object): @property def parsing_report(self): - """ - - Returns - ------- - + """Returns a parsing report with accuracy, %whitespace, + table number on page and page number. """ # pretty? report = { @@ -112,27 +155,8 @@ class Table(object): } return report - def set_border(self): - """ - - Returns - ------- - - """ - for r in range(len(self.rows)): - self.cells[r][0].left = True - self.cells[r][len(self.cols) - 1].right = True - for c in range(len(self.cols)): - self.cells[0][c].top = True - self.cells[len(self.rows) - 1][c].bottom = True - return self - def set_all_edges(self): - """ - - Returns - ------- - + """Sets all table edges to True. """ for row in self.cells: for cell in row: @@ -140,16 +164,16 @@ class Table(object): return self def set_edges(self, vertical, horizontal, joint_close_tol=2): - """ + """Sets a cell's edges to True depending on whether the cell's + coordinates overlap with the line's coordinates within a + tolerance. Parameters ---------- - vertical - horizontal - joint_close_tol - - Returns - ------- + vertical : list + List of detected vertical lines. + horizontal : list + List of detected horizontal lines. """ for v in vertical: @@ -256,12 +280,20 @@ class Table(object): return self - def set_span(self): + def set_border(self): + """Sets table border edges to True. 
""" + for r in range(len(self.rows)): + self.cells[r][0].left = True + self.cells[r][len(self.cols) - 1].right = True + for c in range(len(self.cols)): + self.cells[0][c].top = True + self.cells[len(self.rows) - 1][c].bottom = True + return self - Returns - ------- - + def set_span(self): + """Sets a cell's hspan or vspan attribute to True depending + on whether the cell spans horizontally or vertically. """ for row in self.cells: for cell in row: @@ -288,6 +320,8 @@ class Table(object): return self def to_csv(self, path, **kwargs): + """Write Table to a comma-separated values (csv) file. + """ kw = { 'encoding': 'utf-8', 'index': False, @@ -297,6 +331,8 @@ class Table(object): self.df.to_csv(path, **kw) def to_json(self, path, **kwargs): + """Write Table to a JSON file. + """ kw = { 'orient': 'records' } @@ -306,6 +342,8 @@ class Table(object): f.write(json_string) def to_excel(self, path, **kwargs): + """Write Table to an Excel file. + """ kw = { 'sheet_name': 'page-{}-table-{}'.format(self.page, self.order), 'encoding': 'utf-8' @@ -316,13 +354,21 @@ class Table(object): writer.save() def to_html(self, path, **kwargs): + """Write Table to an HTML file. + """ html_string = self.df.to_html(**kwargs) with open(path, 'w') as f: f.write(html_string) class TableList(object): - """ + """Defines a list of camelot.core.Table objects. Each table can + be accessed using its index. + + Attributes + ---------- + n : int + Number of tables in the list. """ def __init__(self, tables): @@ -371,6 +417,18 @@ class TableList(object): z.write(filepath, os.path.basename(filepath)) def export(self, path, f='csv', compress=False): + """Exports the list of tables to specified file format. + + Parameters + ---------- + path : str + Filepath + f : str + File format. Can be csv, json, excel and html. + compress : bool + Whether or not to add files to a ZIP archive. 
+ + """ dirname = os.path.dirname(path) basename = os.path.basename(path) root, ext = os.path.splitext(basename) @@ -402,9 +460,6 @@ class TableList(object): class Geometry(object): - """ - - """ def __init__(self): self.text = [] self.images = () @@ -421,9 +476,6 @@ class Geometry(object): class GeometryList(object): - """ - - """ def __init__(self, geometry): self.text = [g.text for g in geometry] self.images = [g.images for g in geometry] diff --git a/camelot/handlers.py b/camelot/handlers.py index af4db00..8585432 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -9,18 +9,43 @@ from .utils import get_page_layout, get_text_objects, get_rotation class PDFHandler(object): - """ + """Handles all operations like temp directory creation, splitting + file into single page pdfs, parsing each pdf and then removing the + temp directory. + + Parameters + ---------- + filename : str + Path to pdf file. + pages : str + Comma-separated page numbers to parse. + Example: 1,3,4 or 1,4-end """ def __init__(self, filename, pages='1'): self.filename = filename if not self.filename.endswith('.pdf'): raise TypeError("File format not supported.") - self.pages = self.__get_pages(self.filename, pages) + self.pages = self._get_pages(self.filename, pages) self.tempdir = tempfile.mkdtemp() - def __get_pages(self, filename, pages): - # refactor + def _get_pages(self, filename, pages): + """Converts pages string to list of ints. + + Parameters + ---------- + filename : str + Path to pdf file. + pages : str + Comma-separated page numbers to parse. + Example: 1,3,4 or 1,4-end + + Returns + ------- + P : list + List of int page numbers. 
+ + """ page_numbers = [] if pages == '1': page_numbers.append({'start': 1, 'end': 1}) @@ -42,8 +67,19 @@ class PDFHandler(object): P.extend(range(p['start'], p['end'] + 1)) return sorted(set(P)) - def __save_page(self, filename, page, temp): - # refactor + def _save_page(self, filename, page, temp): + """Saves specified page from pdf into a temporary directory. + + Parameters + ---------- + filename : str + Path to pdf file. + page : int + Page number + temp : str + Tmp directory + + """ with open(filename, 'rb') as fileobj: infile = PdfFileReader(fileobj, strict=False) fpath = os.path.join(temp, 'page-{0}.pdf'.format(page)) @@ -65,28 +101,37 @@ class PDFHandler(object): infile = PdfFileReader(open(fpath_new, 'rb'), strict=False) outfile = PdfFileWriter() p = infile.getPage(0) - if rotation == 'left': + if rotation == 'anticlockwise': p.rotateClockwise(90) - elif rotation == 'right': + elif rotation == 'clockwise': p.rotateCounterClockwise(90) outfile.addPage(p) with open(fpath, 'wb') as f: outfile.write(f) def parse(self, mesh=False, **kwargs): - """ + """Extracts tables by calling parser.get_tables on all single + page pdfs. Parameters ---------- - mesh - kwargs + mesh : bool (default: False) + Whether or not to use Lattice method of parsing. Stream + is used by default. + kwargs : dict + See camelot.read_pdf kwargs. Returns ------- + tables : camelot.core.TableList + List of tables found in pdf. + geometry : camelot.core.GeometryList + List of geometry objects (contours, lines, joints) + found in pdf. 
""" for p in self.pages: - self.__save_page(self.filename, p, self.tempdir) + self._save_page(self.filename, p, self.tempdir) pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p)) for p in self.pages] tables = [] diff --git a/camelot/image_processing.py b/camelot/image_processing.py index ec2b384..23923b2 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -9,17 +9,31 @@ from .utils import merge_tuples def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): - """ + """Thresholds an image using OpenCV's adaptiveThreshold. Parameters ---------- - imagename - process_background - blocksize - c + imagename : string + Path to image file. + process_background : bool, optional (default: False) + Whether or not to process lines that are in background. + blocksize : int, optional (default: 15) + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + + For more information, refer `OpenCV's adaptiveThreshold `_. + c : int, optional (default: -2) + Constant subtracted from the mean or weighted mean. + Normally, it is positive but may be zero or negative as well. + + For more information, refer `OpenCV's adaptiveThreshold `_. Returns ------- + img : object + numpy.ndarray representing the original image. + threshold : object + numpy.ndarray representing the thresholded image. """ img = cv2.imread(imagename) @@ -35,17 +49,35 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0): - """ + """Finds horizontal and vertical lines by applying morphological + transformations on an image. Parameters ---------- - threshold - direction - line_size_scaling - iterations + threshold : object + numpy.ndarray representing the thresholded image. + direction : string, optional (default: 'horizontal') + Specifies whether to find vertical or horizontal lines. 
+ line_size_scaling : int, optional (default: 15) + Factor by which the page dimensions will be divided to get + smallest length of lines that should be detected. + + The larger this value, smaller the detected lines. Making it + too large will lead to text being detected as lines. + iterations : int, optional (default: 0) + Number of times for erosion/dilation is applied. + + For more information, refer `OpenCV's dilate `_. Returns ------- + dmask : object + numpy.ndarray representing pixels where vertical/horizontal + lines lie. + lines : list + List of tuples representing vertical/horizontal lines with + coordinates relative to a left-top origin in + image coordinate space. """ lines = [] @@ -84,15 +116,21 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio def find_table_contours(vertical, horizontal): - """ + """Finds table boundaries using OpenCV's findContours. Parameters ---------- - vertical - horizontal + vertical : object + numpy.ndarray representing pixels where vertical lines lie. + horizontal : object + numpy.ndarray representing pixels where horizontal lines lie. Returns ------- + cont : list + List of tuples representing table boundaries. Each tuple is of + the form (x, y, w, h) where (x, y) -> left-top, w -> width and + h -> height in image coordinate space. """ mask = vertical + horizontal @@ -114,16 +152,26 @@ def find_table_contours(vertical, horizontal): def find_table_joints(contours, vertical, horizontal): - """ + """Finds joints/intersections present inside each table boundary. Parameters ---------- - contours - vertical - horizontal + contours : list + List of tuples representing table boundaries. Each tuple is of + the form (x, y, w, h) where (x, y) -> left-top, w -> width and + h -> height in image coordinate space. + vertical : object + numpy.ndarray representing pixels where vertical lines lie. + horizontal : object + numpy.ndarray representing pixels where horizontal lines lie. 
 Returns ------- + tables : dict + Dict with table boundaries as keys and list of intersections + in that boundary as their value. + Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb + and (x2, y2) -> rt in image coordinate space. """ joints = np.bitwise_and(vertical, horizontal) @@ -150,15 +198,24 @@ def find_table_joints(contours, vertical, horizontal): def remove_lines(threshold, line_size_scaling=15): - """ + """Removes lines from a thresholded image. Parameters ---------- - threshold - line_size_scaling + threshold : object + numpy.ndarray representing the thresholded image. + line_size_scaling : int, optional (default: 15) + Factor by which the page dimensions will be divided to get + smallest length of lines that should be detected. + + The larger this value, smaller the detected lines. Making it + too large will lead to text being detected as lines. Returns ------- + threshold : object + numpy.ndarray representing the thresholded image + with horizontal and vertical lines removed. """ size = threshold.shape[0] // line_size_scaling @@ -178,16 +235,23 @@ def remove_lines(threshold, line_size_scaling=15): def find_cuts(threshold, char_size_scaling=200): - """ + """Finds cuts made by text projections on y-axis. Parameters ---------- - threshold - char_size_scaling + threshold : object + numpy.ndarray representing the thresholded image. + char_size_scaling : int, optional (default: 200) + Factor by which the image height will be divided to get + the height of the structuring element (1 x size pixels). + + The larger this value, the smaller the structuring + element. Returns ------- - + y_cuts : list + List of cuts on y-axis. 
""" size = threshold.shape[0] // char_size_scaling char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) diff --git a/camelot/io.py b/camelot/io.py index 54f8ef5..33007d4 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -2,20 +2,93 @@ from .handlers import PDFHandler def read_pdf(filepath, pages='1', mesh=False, **kwargs): - """ + """Read PDF and return parsed data tables. + + Note: kwargs annotated with ^ can only be used with mesh=False + and kwargs annotated with * can only be used with mesh=True. Parameters ---------- - filepath - pages - mesh - kwargs + filepath : str + Path to pdf file. + pages : str + Comma-separated page numbers to parse. + Example: 1,3,4 or 1,4-end + mesh : bool (default: False) + Whether or not to use Lattice method of parsing. Stream + is used by default. + table_area : list, optional (default: None) + List of table areas to analyze as strings of the form + x1,y1,x2,y2 where (x1, y1) -> left-top and + (x2, y2) -> right-bottom in pdf coordinate space. + columns^ : list, optional (default: None) + List of column x-coordinates as strings where the coordinates + are comma-separated. + split_text : bool, optional (default: False) + Whether or not to split a text line if it spans across + multiple cells. + flag_size : bool, optional (default: False) + Whether or not to highlight a substring using + if its size is different from rest of the string, useful for + super and subscripts. + row_close_tol^ : int, optional (default: 2) + Rows will be formed by combining text vertically + within this tolerance. + col_close_tol^ : int, optional (default: 0) + Columns will be formed by combining text horizontally + within this tolerance. + process_background* : bool, optional (default: False) + Whether or not to process lines that are in background. + line_size_scaling* : int, optional (default: 15) + Factor by which the page dimensions will be divided to get + smallest length of lines that should be detected. 
+ + The larger this value, smaller the detected lines. Making it + too large will lead to text being detected as lines. + copy_text* : list, optional (default: None) + {'h', 'v'} + Select one or more strings from above and pass them as a list + to specify the direction in which text should be copied over + when a cell spans multiple rows or columns. + shift_text* : list, optional (default: ['l', 't']) + {'l', 'r', 't', 'b'} + Select one or more strings from above and pass them as a list + to specify where the text in a spanning cell should flow. + line_close_tol* : int, optional (default: 2) + Tolerance parameter used to merge vertical and horizontal + detected lines which lie close to each other. + joint_close_tol* : int, optional (default: 2) + Tolerance parameter used to decide whether the detected lines + and points lie close to each other. + threshold_blocksize* : int, optional (default: 15) + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + + For more information, refer `OpenCV's adaptiveThreshold `_. + threshold_constant* : int, optional (default: -2) + Constant subtracted from the mean or weighted mean. + Normally, it is positive but may be zero or negative as well. + + For more information, refer `OpenCV's adaptiveThreshold `_. + iterations* : int, optional (default: 0) + Number of times erosion/dilation is applied. + + For more information, refer `OpenCV's dilate `_. + margins : tuple, optional (default: (1.0, 0.5, 0.1)) + PDFMiner margins. (char_margin, line_margin, word_margin) + + For more information, refer `PDFMiner docs `_. + debug : bool, optional (default: False) + Whether or not to return all text objects on the page + which can be used to generate a matplotlib plot, to get + values for table_area(s) and debugging. Returns ------- + tables : camelot.core.TableList """ - # explicit type conversion + # validate kwargs? 
p = PDFHandler(filepath, pages) tables, __ = p.parse(mesh=mesh, **kwargs) return tables \ No newline at end of file diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index 79cb986..3ffe146 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -5,8 +5,7 @@ from ..utils import get_page_layout, get_text_objects class BaseParser(object): - """ - + """Defines a base parser. """ def _generate_layout(self, filename): self.filename = filename diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 15663dc..40a9040 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -11,7 +11,7 @@ from .base import BaseParser from ..core import Table from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, merge_close_lines, get_table_index, compute_accuracy, - count_empty_strings, encode_, setup_logging) + compute_whitespace, setup_logging, encode_) from ..image_processing import (adaptive_threshold, find_lines, find_table_contours, find_table_joints) @@ -20,14 +20,74 @@ logger = setup_logging(__name__) class Lattice(BaseParser): - """ + """Lattice method of parsing looks for lines between text + to form a table. + + Parameters + ---------- + table_area : list, optional (default: None) + List of table areas to analyze as strings of the form + x1,y1,x2,y2 where (x1, y1) -> left-top and + (x2, y2) -> right-bottom in pdf coordinate space. + process_background : bool, optional (default: False) + Whether or not to process lines that are in background. + line_size_scaling : int, optional (default: 15) + Factor by which the page dimensions will be divided to get + smallest length of lines that should be detected. + + The larger this value, smaller the detected lines. Making it + too large will lead to text being detected as lines. 
+ copy_text : list, optional (default: None) + {'h', 'v'} + Select one or more strings from above and pass them as a list + to specify the direction in which text should be copied over + when a cell spans multiple rows or columns. + shift_text : list, optional (default: ['l', 't']) + {'l', 'r', 't', 'b'} + Select one or more strings from above and pass them as a list + to specify where the text in a spanning cell should flow. + split_text : bool, optional (default: False) + Whether or not to split a text line if it spans across + multiple cells. + flag_size : bool, optional (default: False) + Whether or not to highlight a substring using + if its size is different from rest of the string, useful for + super and subscripts. + line_close_tol : int, optional (default: 2) + Tolerance parameter used to merge vertical and horizontal + detected lines which lie close to each other. + joint_close_tol : int, optional (default: 2) + Tolerance parameter used to decide whether the detected lines + and points lie close to each other. + threshold_blocksize : int, optional (default: 15) + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + + For more information, refer `OpenCV's adaptiveThreshold `_. + threshold_constant : int, optional (default: -2) + Constant subtracted from the mean or weighted mean. + Normally, it is positive but may be zero or negative as well. + + For more information, refer `OpenCV's adaptiveThreshold `_. + iterations : int, optional (default: 0) + Number of times erosion/dilation is applied. + + For more information, refer `OpenCV's dilate `_. + margins : tuple, optional (default: (1.0, 0.5, 0.1)) + PDFMiner margins. (char_margin, line_margin, word_margin) + + For more information, refer `PDFMiner docs `_. + debug : bool, optional (default: False) + Whether or not to return all text objects on the page + which can be used to generate a matplotlib plot, to get + values for table_area(s) and debugging. 
""" def __init__(self, table_area=None, process_background=False, line_size_scaling=15, copy_text=None, shift_text=['l', 't'], split_text=False, flag_size=False, line_close_tol=2, joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, - iterations=0, margins=(1.0, 0.5, 0.1), debug=None): + iterations=0, margins=(1.0, 0.5, 0.1), debug=False): self.table_area = table_area self.process_background = process_background self.line_size_scaling = line_size_scaling @@ -45,6 +105,27 @@ class Lattice(BaseParser): @staticmethod def _reduce_index(t, idx, shift_text): + """Reduces index of a text object if it lies within a spanning + cell. + + Parameters + ---------- + table : camelot.core.Table + idx : list + List of tuples of the form (r_idx, c_idx, text). + shift_text : list + {'l', 'r', 't', 'b'} + Select one or more strings from above and pass them as a + list to specify where the text in a spanning cell should + flow. + + Returns + ------- + indices : list + List of tuples of the form (r_idx, c_idx, text) where + r_idx and c_idx are new row and column indices for text. + + """ indices = [] for r_idx, c_idx, text in idx: for d in shift_text: @@ -69,6 +150,22 @@ class Lattice(BaseParser): @staticmethod def _copy_spanning_text(t, copy_text=None): + """Copies over text in empty spanning cells. + + Parameters + ---------- + t : camelot.core.Table + copy_text : list, optional (default: None) + {'h', 'v'} + Select one or more strings from above and pass them as a list + to specify the direction in which text should be copied over + when a cell spans multiple rows or columns. 
+ + Returns + ------- + t : camelot.core.Table + + """ for f in copy_text: if f == "h": for i in range(len(t.cells)): @@ -199,7 +296,7 @@ class Lattice(BaseParser): table.df = pd.DataFrame(data) table.shape = table.df.shape - whitespace, __, __ = count_empty_strings(data) + whitespace = compute_whitespace(data) table.accuracy = accuracy table.whitespace = whitespace table.order = table_idx + 1 @@ -208,16 +305,6 @@ class Lattice(BaseParser): return table def extract_tables(self, filename): - """ - - Parameters - ---------- - filename - - Returns - ------- - - """ logger.info('Processing {}'.format(os.path.basename(filename))) self._generate_layout(filename) @@ -237,7 +324,7 @@ class Lattice(BaseParser): table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) _tables.append(table) - if self.debug is not None: + if self.debug: text = [] text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index e1a4980..f547bf0 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -8,19 +8,54 @@ import pandas as pd from .base import BaseParser from ..core import Table from ..utils import (text_in_bbox, get_table_index, compute_accuracy, - count_empty_strings, encode_, setup_logging) + compute_whitespace, setup_logging, encode_) logger = setup_logging(__name__) class Stream(BaseParser): - """ + """Stream method of parsing looks for spaces between text + to form a table. + + If you want to specify columns when specifying multiple table + areas, make sure that the length of both lists are equal. + + Parameters + ---------- + table_area : list, optional (default: None) + List of table areas to analyze as strings of the form + x1,y1,x2,y2 where (x1, y1) -> left-top and + (x2, y2) -> right-bottom in pdf coordinate space. 
+ columns : list, optional (default: None) + List of column x-coordinates as strings where the coordinates + are comma-separated. + split_text : bool, optional (default: False) + Whether or not to split a text line if it spans across + multiple cells. + flag_size : bool, optional (default: False) + Whether or not to highlight a substring using + if its size is different from rest of the string, useful for + super and subscripts. + row_close_tol : int, optional (default: 2) + Rows will be formed by combining text vertically + within this tolerance. + col_close_tol : int, optional (default: 0) + Columns will be formed by combining text horizontally + within this tolerance. + margins : tuple, optional (default: (1.0, 0.5, 0.1)) + PDFMiner margins. (char_margin, line_margin, word_margin) + + For more information, refer `PDFMiner docs `_. + debug : bool, optional (default: False) + Whether or not to return all text objects on the page + which can be used to generate a matplotlib plot, to get + values for table_area(s), columns and debugging. """ def __init__(self, table_area=None, columns=None, split_text=False, flag_size=False, row_close_tol=2, col_close_tol=0, - margins=(1.0, 0.5, 0.1), debug=None): + margins=(1.0, 0.5, 0.1), debug=False): self.table_area = table_area self.columns = columns self._validate_columns() @@ -33,6 +68,20 @@ class Stream(BaseParser): @staticmethod def _text_bbox(t_bbox): + """Returns bounding box for the text present on a page. + + Parameters + ---------- + t_bbox : dict + Dict with two keys 'horizontal' and 'vertical' with lists of + LTTextLineHorizontals and LTTextLineVerticals respectively. + + Returns + ------- + text_bbox : tuple + Tuple (x0, y0, x1, y1) in pdf coordinate space. 
+ + """ xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]]) ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]]) xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]]) @@ -42,6 +91,21 @@ class Stream(BaseParser): @staticmethod def _group_rows(text, row_close_tol=2): + """Groups PDFMiner text objects into rows vertically + within a tolerance. + + Parameters + ---------- + text : list + List of PDFMiner text objects. + row_close_tol : int, optional (default: 2) + + Returns + ------- + rows : list + Two-dimensional list of text objects grouped into rows. + + """ row_y = 0 rows = [] temp = [] @@ -61,6 +125,21 @@ class Stream(BaseParser): @staticmethod def _merge_columns(l, col_close_tol=0): + """Merges column boundaries horizontally if they overlap + or lie within a tolerance. + + Parameters + ---------- + l : list + List of column x-coordinate tuples. + col_close_tol : int, optional (default: 0) + + Returns + ------- + merged : list + List of merged column x-coordinate tuples. + + """ merged = [] for higher in l: if not merged: @@ -89,6 +168,21 @@ class Stream(BaseParser): @staticmethod def _join_rows(rows_grouped, text_y_max, text_y_min): + """Makes row coordinates continuous. + + Parameters + ---------- + rows_grouped : list + Two-dimensional list of text objects grouped into rows. + text_y_max : int + text_y_min : int + + Returns + ------- + rows : list + List of continuous row y-coordinate tuples. + + """ row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0 for r in rows_grouped] rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] @@ -100,6 +194,23 @@ class Stream(BaseParser): @staticmethod def _add_columns(cols, text, row_close_tol): + """Adds columns to existing list by taking into account + the text that lies outside the current column x-coordinates. + + Parameters + ---------- + cols : list + List of column x-coordinate tuples. 
+ text : list + List of PDFMiner text objects. + row_close_tol : int + + Returns + ------- + cols : list + Updated list of column x-coordinate tuples. + + """ if text: text = Stream._group_rows(text, row_close_tol=row_close_tol) elements = [len(r) for r in text] @@ -110,6 +221,21 @@ class Stream(BaseParser): @staticmethod def _join_columns(cols, text_x_min, text_x_max): + """Makes column coordinates continuous. + + Parameters + ---------- + cols : list + List of column x-coordinate tuples. + text_x_min : int + text_x_max : int + + Returns + ------- + cols : list + Updated list of column x-coordinate tuples. + + """ cols = sorted(cols) cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] cols.insert(0, text_x_min) @@ -207,7 +333,7 @@ class Stream(BaseParser): table.df = pd.DataFrame(data) table.shape = table.df.shape - whitespace, __, __ = count_empty_strings(data) + whitespace = compute_whitespace(data) table.accuracy = accuracy table.whitespace = whitespace table.order = table_idx + 1 @@ -216,16 +342,6 @@ class Stream(BaseParser): return table def extract_tables(self, filename): - """ - - Parameters - ---------- - filename - - Returns - ------- - - """ logger.info('Processing {}'.format(os.path.basename(filename))) self._generate_layout(filename) @@ -244,7 +360,7 @@ class Stream(BaseParser): table = self._generate_table(table_idx, cols, rows) _tables.append(table) - if self.debug is not None: + if self.debug: text = [] text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) diff --git a/camelot/plotting.py b/camelot/plotting.py index 7d4e4d3..2d0bb3c 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -6,19 +6,101 @@ from .handlers import PDFHandler def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs): - """ + """Plot geometry found on pdf page based on type specified, + useful for debugging and playing with different parameters to
get + the best output. + + Note: kwargs annotated with ^ can only be used with mesh=False + and kwargs annotated with * can only be used with mesh=True. Parameters ---------- - filepath - pages - mesh - geometry_type - kwargs + filepath : str + Path to pdf file. + pages : str + Comma-separated page numbers to parse. + Example: 1,3,4 or 1,4-end + mesh : bool (default: False) + Whether or not to use Lattice method of parsing. Stream + is used by default. + geometry_type : str, optional (default: 'text') + 'text' : Plot text objects found on page, useful to get + table_area and columns coordinates. + 'table' : Plot parsed table. + 'contour'* : Plot detected rectangles. + 'joint'* : Plot detected line intersections. + 'line'* : Plot detected lines. + table_area : list, optional (default: None) + List of table areas to analyze as strings of the form + x1,y1,x2,y2 where (x1, y1) -> left-top and + (x2, y2) -> right-bottom in pdf coordinate space. + columns^ : list, optional (default: None) + List of column x-coordinates as strings where the coordinates + are comma-separated. + split_text : bool, optional (default: False) + Whether or not to split a text line if it spans across + multiple cells. + flag_size : bool, optional (default: False) + Whether or not to highlight a substring using + if its size is different from rest of the string, useful for + super and subscripts. + row_close_tol^ : int, optional (default: 2) + Rows will be formed by combining text vertically + within this tolerance. + col_close_tol^ : int, optional (default: 0) + Columns will be formed by combining text horizontally + within this tolerance. + process_background* : bool, optional (default: False) + Whether or not to process lines that are in background. + line_size_scaling* : int, optional (default: 15) + Factor by which the page dimensions will be divided to get + smallest length of lines that should be detected. + + The larger this value, smaller the detected lines. 
Making it + too large will lead to text being detected as lines. + copy_text* : list, optional (default: None) + {'h', 'v'} + Select one or more strings from above and pass them as a list + to specify the direction in which text should be copied over + when a cell spans multiple rows or columns. + shift_text* : list, optional (default: ['l', 't']) + {'l', 'r', 't', 'b'} + Select one or more strings from above and pass them as a list + to specify where the text in a spanning cell should flow. + line_close_tol* : int, optional (default: 2) + Tolerance parameter used to merge vertical and horizontal + detected lines which lie close to each other. + joint_close_tol* : int, optional (default: 2) + Tolerance parameter used to decide whether the detected lines + and points lie close to each other. + threshold_blocksize : int, optional (default: 15) + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + + For more information, refer `OpenCV's adaptiveThreshold `_. + threshold_constant : int, optional (default: -2) + Constant subtracted from the mean or weighted mean. + Normally, it is positive but may be zero or negative as well. + + For more information, refer `OpenCV's adaptiveThreshold `_. + iterations : int, optional (default: 0) + Number of times erosion/dilation is applied. + + For more information, refer `OpenCV's dilate `_. + margins : tuple + PDFMiner margins. (char_margin, line_margin, word_margin) + + For more information, refer `PDFMiner docs `_. + debug : bool, optional (default: False) + Whether or not to return all text objects on the page + which can be used to generate a matplotlib plot, to get + values for table_area(s) and debugging. + """ - # explicit type conversion + # validate kwargs?
p = PDFHandler(filepath, pages) - kwargs.update({'debug': geometry_type}) + debug = True if geometry_type else False + kwargs.update({'debug': debug}) __, geometry = p.parse(mesh=mesh, **kwargs) if geometry_type == 'text': diff --git a/camelot/utils.py b/camelot/utils.py index d132b5a..075c0df 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -19,14 +19,15 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, def setup_logging(name): - """ + """Sets up a logger with StreamHandler. Parameters ---------- - name + name : str Returns ------- + logger : logging.Logger """ logger = logging.getLogger(name) @@ -47,15 +48,16 @@ logger = setup_logging(__name__) def translate(x1, x2): - """ + """Translates x2 by x1. Parameters ---------- - x1 - x2 + x1 : float + x2 : float Returns ------- + x2 : float """ x2 += x1 @@ -63,15 +65,16 @@ def translate(x1, x2): def scale(x, s): - """ + """Scales x by scaling factor s. Parameters ---------- - x - s + x : float + s : float Returns ------- + x : float """ x *= s @@ -79,18 +82,21 @@ def scale(x, s): def rotate(x1, y1, x2, y2, angle): - """ + """Rotates point x2, y2 about point x1, y1 by angle. Parameters ---------- - x1 - y1 - x2 - y2 - angle + x1 : float + y1 : float + x2 : float + y2 : float + angle : float + Angle in radians. Returns ------- + xnew : float + ynew : float """ s = np.sin(angle) @@ -105,15 +111,26 @@ def rotate(x1, y1, x2, y2, angle): def scale_pdf(k, factors): - """ + """Translates and scales pdf coordinate space to image + coordinate space. Parameters ---------- - k - factors + k : tuple + Tuple (x1, y1, x2, y2) representing table bounding box where + (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate + space. + factors : tuple + Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the + first two elements are scaling factors and pdf_y is height of + pdf. 
Returns ------- + knew : tuple + Tuple (x1, y1, x2, y2) representing table bounding box where + (x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate + space. """ x1, y1, x2, y2 = k @@ -127,17 +144,28 @@ def scale_pdf(k, factors): def scale_image(tables, v_segments, h_segments, factors): - """ + """Translates and scales image coordinate space to pdf + coordinate space. Parameters ---------- - tables - v_segments - h_segments - factors + tables : dict + Dict with table boundaries as keys and list of intersections + in that boundary as value. + v_segments : list + List of vertical line segments. + h_segments : list + List of horizontal line segments. + factors : tuple + Tuple (scaling_factor_x, scaling_factor_y, img_y) where the + first two elements are scaling factors and img_y is height of + image. Returns ------- + tables_new : dict + v_segments_new : dict + h_segments_new : dict """ scaling_factor_x, scaling_factor_y, img_y = factors @@ -172,16 +200,23 @@ def scale_image(tables, v_segments, h_segments, factors): def get_rotation(lttextlh, lttextlv, ltchar): - """ + """Detects if text in table is rotated or not using the current + transformation matrix (CTM) and returns its orientation. Parameters ---------- - lttextlh - lttextlv - ltchar + lttextlh : list + List of PDFMiner LTTextLineHorizontal objects. + lttextlv : list + List of PDFMiner LTTextLineVertical objects. + ltchar : list + List of PDFMiner LTChar objects. Returns ------- + rotation : string + '' if text in table is upright, 'left' if rotated 90 degree + anticlockwise and 'right' if rotated 90 degree clockwise. 
""" rotation = '' @@ -190,21 +225,30 @@ def get_rotation(lttextlh, lttextlv, ltchar): if hlen < vlen: clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar) - rotation = 'left' if clockwise < anticlockwise else 'right' + rotation = 'clockwise' if clockwise < anticlockwise else 'anticlockwise' return rotation def segments_in_bbox(bbox, v_segments, h_segments): - """ + """Returns all line segments present inside a bounding box. Parameters ---------- - bbox - v_segments - h_segments + bbox : tuple + Tuple (x1, y1, x2, y2) representing a bounding box where + (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate + space. + v_segments : list + List of vertical line segments. + h_segments : list + List of horizontal line segments. Returns ------- + v_s : list + List of vertical line segments that lie inside table. + h_s : list + List of horizontal line segments that lie inside table. """ lb = (bbox[0], bbox[1]) @@ -217,35 +261,42 @@ def segments_in_bbox(bbox, v_segments, h_segments): def text_in_bbox(bbox, text): - """ + """Returns all text objects present inside a bounding box. Parameters ---------- - bbox - text + bbox : tuple + Tuple (x1, y1, x2, y2) representing a bounding box where + (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate + space. + text : List of PDFMiner text objects. Returns ------- + t_bbox : list + List of PDFMiner text objects that lie inside table. """ lb = (bbox[0], bbox[1]) rt = (bbox[2], bbox[3]) t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 - <= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 - <= rt[1] + 2] + <= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 + <= rt[1] + 2] return t_bbox def remove_close_lines(ar, line_close_tol=2): - """ + """Removes lines which are within a tolerance, based on their x or + y axis projections.
Parameters ---------- - ar - line_close_tol + ar : list + line_close_tol : int, optional (default: 2) Returns ------- + ret : list """ ret = [] @@ -262,15 +313,17 @@ def remove_close_lines(ar, line_close_tol=2): def merge_close_lines(ar, line_close_tol=2): - """ + """Merges lines which are within a tolerance by calculating a + moving mean, based on their x or y axis projections. Parameters ---------- - ar - line_close_tol + ar : list + line_close_tol : int, optional (default: 2) Returns ------- + ret : list """ ret = [] @@ -288,15 +341,19 @@ def merge_close_lines(ar, line_close_tol=2): def flag_font_size(textline, direction): - """ + """Flags super/subscripts in text by enclosing them with . + May give false positives. Parameters ---------- - textline - direction + textline : list + List of PDFMiner LTChar objects. + direction : string + Direction of the PDFMiner LTTextLine object. Returns ------- + fstring : string """ if direction == 'horizontal': @@ -324,18 +381,27 @@ def flag_font_size(textline, direction): return fstring -def split_textline(table, textline, direction, flag_size=True): - """ +def split_textline(table, textline, direction, flag_size=False): + """Splits PDFMiner LTTextLine into substrings if it spans across + multiple rows/columns. Parameters ---------- - table - textline - direction - flag_size + table : camelot.core.Table + textline : object + PDFMiner LTTextLine object. + direction : string + Direction of the PDFMiner LTTextLine object. + flag_size : bool, optional (default: False) + Whether or not to highlight a substring using + if its size is different from rest of the string, useful for + super and subscripts. Returns ------- + grouped_chars : list + List of tuples of the form (idx, text) where idx is the index + of row/column and text is an lttextline substring.
""" idx = 0 @@ -388,19 +454,38 @@ def split_textline(table, textline, direction, flag_size=True): return grouped_chars -def get_table_index(table, t, direction, split_text=False, flag_size=True): - """ +def get_table_index(table, t, direction, split_text=False, flag_size=False): + """Gets indices of the table cell where given text object lies by + comparing their y and x-coordinates. Parameters ---------- - table - t - direction - split_text - flag_size + table : camelot.core.Table + t : object + PDFMiner LTTextLine object. + direction : string + Direction of the PDFMiner LTTextLine object. + split_text : bool, optional (default: False) + Whether or not to split a text line if it spans across + multiple cells. + flag_size : bool, optional (default: False) + Whether or not to highlight a substring using + if its size is different from rest of the string, useful for + super and subscripts. Returns ------- + indices : list + List of tuples of the form (r_idx, c_idx, text) where r_idx + and c_idx are row and column indices. + error : float + Assignment error, percentage of text area that lies outside + a cell. + +-------+ + | | + | [Text bounding box] + | | + +-------+ """ r_idx, c_idx = [-1] * 2 @@ -450,14 +535,19 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True): def compute_accuracy(error_weights): - """ + """Calculates a score based on weights assigned to various + parameters and their error percentages. Parameters ---------- - error_weights + error_weights : list + Two-dimensional list of the form [[p1, e1], [p2, e2], ...] + where pn is the weight assigned to list of errors en. + Sum of pn should be equal to 100. Returns ------- + score : float """ SCORE_VAL = 100 @@ -474,50 +564,40 @@ def compute_accuracy(error_weights): return score -def count_empty_strings(d): - """ +def compute_whitespace(d): + """Calculates the percentage of empty strings in a + two-dimensional list. 
Parameters ---------- - d + d : list Returns ------- + whitespace : float + Percentage of empty cells. """ - empty_p = 0 + whitespace = 0 r_nempty_cells, c_nempty_cells = [], [] for i in d: for j in i: if j.strip() == '': - empty_p += 1 - empty_p = 100 * (empty_p / float(len(d) * len(d[0]))) - for row in d: - r_nempty_c = 0 - for r in row: - if r.strip() != '': - r_nempty_c += 1 - r_nempty_cells.append(r_nempty_c) - d = zip(*d) - d = [list(col) for col in d] - for col in d: - c_nempty_c = 0 - for c in col: - if c.strip() != '': - c_nempty_c += 1 - c_nempty_cells.append(c_nempty_c) - return empty_p, r_nempty_cells, c_nempty_cells + whitespace += 1 + whitespace = 100 * (whitespace / float(len(d) * len(d[0]))) + return whitespace -def remove_empty_strings(d): - """ +def remove_empty(d): + """Removes empty rows and columns from a two-dimensional list. Parameters ---------- - d + d : list Returns ------- + d : list """ for i, row in enumerate(d): @@ -530,70 +610,46 @@ def remove_empty_strings(d): def encode_(ar): - """ + """Encodes two-dimensional list into unicode. Parameters ---------- - ar + ar : list Returns ------- + ar : list """ ar = [[r.encode('utf-8') for r in row] for row in ar] return ar -def get_text_objects(layout, ltype="char", t=None): - """ - - Parameters - ---------- - layout - ltype - t - - Returns - ------- - - """ - if ltype == "char": - LTObject = LTChar - elif ltype == "lh": - LTObject = LTTextLineHorizontal - elif ltype == "lv": - LTObject = LTTextLineVertical - if t is None: - t = [] - try: - for obj in layout._objs: - if isinstance(obj, LTObject): - t.append(obj) - else: - t += get_text_objects(obj, ltype=ltype) - except AttributeError: - pass - return t - - -def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1, +def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1, detect_vertical=True, all_texts=True): - """ + """Returns a PDFMiner LTPage object and page dimension of a single + page pdf. 
See https://euske.github.io/pdfminer/ to get definitions + of kwargs. Parameters ---------- - pname - char_margin - line_margin - word_margin - detect_vertical - all_texts + filename : string + Path to pdf file. + char_margin : float + line_margin : float + word_margin : float + detect_vertical : bool + all_texts : bool Returns ------- + layout : object + PDFMiner LTPage object. + dim : tuple + Dimension of pdf page in the form (width, height). """ - with open(pname, 'r') as f: + with open(filename, 'r') as f: parser = PDFParser(f) document = PDFDocument(parser) if not document.is_extractable: @@ -615,12 +671,56 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1, return layout, dim -def merge_tuples(tuples): - """ +def get_text_objects(layout, ltype="char", t=None): + """Recursively parses pdf layout to get a list of + PDFMiner text objects. Parameters ---------- - tuples + layout : object + PDFMiner LTPage object. + ltype : string + Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal, + and LTTextLineVertical objects respectively. + t : list + + Returns + ------- + t : list + List of PDFMiner text objects. + + """ + if ltype == "char": + LTObject = LTChar + elif ltype == "lh": + LTObject = LTTextLineHorizontal + elif ltype == "lv": + LTObject = LTTextLineVertical + if t is None: + t = [] + try: + for obj in layout._objs: + if isinstance(obj, LTObject): + t.append(obj) + else: + t += get_text_objects(obj, ltype=ltype) + except AttributeError: + pass + return t + + +def merge_tuples(tuples): + """Merges a list of overlapping tuples. + + Parameters + ---------- + tuples : list + List of tuples where a tuple is a single axis coordinate pair. + + Yields + ------ + tuple + """ merged = list(tuples[0]) for s, e in tuples: diff --git a/docs/api.rst b/docs/api.rst index 99a9e7f..3bd0f3d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -4,17 +4,37 @@ API Reference ============= -Pdf -=== -.. 
automodule:: camelot.pdf +camelot.read_pdf +================ +.. automodule:: camelot.read_pdf :members: -Lattice -======= -.. automodule:: camelot.lattice +camelot.handlers.PDFHandler +=========================== +.. automodule:: camelot.handlers.PDFHandler :members: -Stream -====== -.. automodule:: camelot.stream +camelot.parsers.Stream +====================== +.. automodule:: camelot.parsers.Stream + :members: + +camelot.parsers.Lattice +======================= +.. automodule:: camelot.parsers.Lattice + :members: + +camelot.core.Cell +================= +.. automodule:: camelot.core.Cell + :members: + +camelot.core.Table +================== +.. automodule:: camelot.core.Table + :members: + +camelot.core.TableList +====================== +.. automodule:: camelot.core.TableList :members: \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index b186ad7..4b91c69 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,11 +3,11 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -================================== -Camelot: pdf parsing made simpler! -================================== +===================================== +Camelot: PDF Table Parsing for Humans +===================================== -Camelot is a Python 2.7 library and command-line tool for getting tables out of pdf files. +Camelot is a Python 2.7 library and command-line tool for extracting tabular data from PDF files. Why another pdf table parsing library? 
====================================== @@ -32,12 +32,22 @@ Usage :: - >>> from camelot.pdf import Pdf - >>> from camelot.lattice import Lattice - - >>> manager = Pdf(Lattice(), 'us-030.pdf') - >>> tables = manager.extract() - >>> print tables['page-1']['table-1']['data'] + >>> import camelot + >>> tables = camelot.read_pdf("foo.pdf") + >>> tables + + >>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html + >>> tables[0] + + >>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html + >>> tables[0].parsing_report + { + "accuracy": 96, + "whitespace": 80, + "order": 1, + "page": 1 + } + >>> df = tables[0].df .. csv-table:: :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","","" @@ -49,45 +59,6 @@ Usage "2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%" "4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%" -Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF. - -:: - - Camelot: PDF parsing made simpler! - - usage: - camelot [options] [...] - - options: - -h, --help Show this screen. - -v, --version Show version. - -V, --verbose Verbose. - -p, --pages Comma-separated list of page numbers. - Example: -p 1,3-6,10 [default: 1] - -P, --parallel Parallelize the parsing process. - -f, --format Output format. (csv,tsv,html,json,xlsx) [default: csv] - -l, --log Log to file. - -o, --output Output directory. - -M, --cmargin Char margin. Chars closer than cmargin are - grouped together to form a word. [default: 1.0] - -L, --lmargin Line margin. Lines closer than lmargin are - grouped together to form a textbox. [default: 0.5] - -W, --wmargin Word margin. Insert blank spaces between chars - if distance between words is greater than word - margin. [default: 0.1] - -J, --split_text Split text lines if they span across multiple cells. - -K, --flag_size Flag substring if its size differs from the whole string. 
- Useful for super and subscripts. - -X, --print-stats List stats on the parsing process. - -Y, --save-stats Save stats to a file. - -Z, --plot Plot distributions. (page,all,rc) - - camelot methods: - lattice Looks for lines between data. - stream Looks for spaces between data. - - See 'camelot -h' for more information on a specific method. - Installation ============ @@ -95,42 +66,41 @@ Make sure you have the most updated versions for `pip` and `setuptools`. You can pip install -U pip setuptools -The required dependencies include `numpy`_, `OpenCV`_ and `ImageMagick`_. +The dependencies include `tk`_ and `ghostscript`_. -.. _numpy: http://www.numpy.org/ -.. _OpenCV: http://opencv.org/ -.. _ImageMagick: http://www.imagemagick.org/script/index.php +.. _tk: https://wiki.tcl.tk/3743 +.. _ghostscript: https://www.ghostscript.com/ Installing dependencies ----------------------- -numpy can be install using `pip`. OpenCV and imagemagick can be installed using your system's default package manager. +tk and ghostscript can be installed using your system's default package manager. Linux ^^^^^ -* Arch Linux - -:: - - sudo pacman -S opencv imagemagick - * Ubuntu :: - sudo apt-get install libopencv-dev python-opencv imagemagick + sudo apt-get install python-opencv python-tk ghostscript + +* Arch Linux + +:: + + sudo pacman -S opencv tk ghostscript OS X ^^^^ :: - brew install homebrew/science/opencv imagemagick + brew install homebrew/science/opencv ghostscript Finally, `cd` into the project directory and install by:: - make install + python setup.py install API Reference ============= @@ -150,14 +120,14 @@ You can check the latest sources with the command:: Contributing ------------ -See :doc:`Contributing doc `. +See :doc:`Contributing guidelines `. 
Testing ------- :: - make test + python setup.py test License ======= diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..d907a0b --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,11 @@ +click==6.7 +matplotlib==2.2.3 +numpy==1.13.3 +opencv-python==3.4.2.17 +pandas==0.23.4 +pdfminer==20140328 +Pillow==5.2.0 +PyPDF2==1.26.0 +pytest==3.8.0 +pytest-runner==4.2 +Sphinx==1.8.0b1 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 0535f0c..d1a33b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -docopt==0.6.2 +click==6.7 matplotlib==2.2.3 -nose==1.3.7 +numpy==1.13.3 +opencv-python==3.4.2.17 +pandas==0.23.4 pdfminer==20140328 -pyexcel-xlsx==0.5.6 Pillow==5.2.0 -PyPDF2==1.26.0 -Sphinx==1.8.0b1 \ No newline at end of file +PyPDF2==1.26.0 \ No newline at end of file diff --git a/setup.py b/setup.py index b496b51..14c0516 100644 --- a/setup.py +++ b/setup.py @@ -4,12 +4,12 @@ import camelot NAME = 'camelot' VERSION = camelot.__version__ -DESCRIPTION = 'camelot parses tables from PDFs!' 
+DESCRIPTION = 'PDF Table Parsing for Humans' with open('README.md') as f: LONG_DESCRIPTION = f.read() URL = 'https://github.com/socialcopsdev/camelot' AUTHOR = 'Vinayak Mehta' -AUTHOR_EMAIL = 'vinayak@socialcops.com' +AUTHOR_EMAIL = 'vmehta94@gmail.com' LICENSE = 'BSD License' opencv_min_version = '2.4.8' @@ -58,18 +58,14 @@ def setup_package(): opencv_status = get_opencv_status() opencv_req_str = "camelot requires OpenCV >= {0}.\n".format(opencv_min_version) - instructions = ("Installation instructions are available in the README at " - "https://github.com/socialcopsdev/camelot") if opencv_status['up_to_date'] is False: if opencv_status['version']: - raise ImportError("Your installation of OpenCV " - "{0} is out-of-date.\n{1}{2}" - .format(opencv_status['version'], - opencv_req_str, instructions)) + raise ImportError("Your installation of OpenCV {} is out-of-date.\n{}" + .format(opencv_status['version'], opencv_req_str)) else: - raise ImportError("OpenCV is not installed.\n{0}{1}" - .format(opencv_req_str, instructions)) + raise ImportError("OpenCV is not installed.\n{}" + .format(opencv_req_str)) setup(**metadata)