diff --git a/HISTORY.md b/HISTORY.md index 9b06b01..6a0a2e7 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,12 @@ Release History master ------ +**Improvements** + +* [#170](https://github.com/socialcopsdev/camelot/issues/170) Add option to pass pdfminer layout kwargs. [#232](https://github.com/socialcopsdev/camelot/pull/232) by Vinayak Mehta. + * Keyword arguments for [pdfminer.layout.LAParams](https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33) can now be passed using `layout_kwargs` in `read_pdf()`. + * The `margins` keyword argument in `read_pdf()` is now deprecated. + 0.5.0 (2018-12-13) ------------------ diff --git a/camelot/cli.py b/camelot/cli.py index e978a3c..a1a571e 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -43,6 +43,8 @@ pass_config = click.make_pass_decorator(Config) help='Split text that spans across multiple cells.') @click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on' ' font size. Useful to detect super/subscripts.') +@click.option('-strip', '--strip_text', help='Characters that should be stripped from a string before' + ' assigning it to a cell.') @click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1), help='PDFMiner char_margin, line_margin and word_margin.') @click.pass_context @@ -68,10 +70,10 @@ def cli(ctx, *args, **kwargs): @click.option('-shift', '--shift_text', default=['l', 't'], type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True, help='Direction in which text in a spanning cell will flow.') -@click.option('-l', '--line_close_tol', default=2, +@click.option('-l', '--line_tol', default=2, help='Tolerance parameter used to merge close vertical' ' and horizontal lines.') -@click.option('-j', '--joint_close_tol', default=2, +@click.option('-j', '--joint_tol', default=2, help='Tolerance parameter used to decide whether' ' the detected lines and points lie close to each other.') @click.option('-block', '--threshold_blocksize', default=15, @@ -84,6 +86,8 @@ def cli(ctx, 
*args, **kwargs): ' may be zero or negative as well.') @click.option('-I', '--iterations', default=0, help='Number of times for erosion/dilation will be applied.') +@click.option('-res', '--resolution', default=300, + help='Resolution used for PDF to PNG conversion.') @click.option('-plot', '--plot_type', type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']), help='Plot elements found on PDF page for visual debugging.') @@ -133,9 +137,11 @@ def lattice(c, *args, **kwargs): ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-C', '--columns', default=[], multiple=True, help='X coordinates of column separators.') -@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter' +@click.option('-e', '--edge_tol', default=50, help='Tolerance parameter' + ' for extending textedges vertically.') +@click.option('-r', '--row_tol', default=2, help='Tolerance parameter' ' used to combine text vertically, to generate rows.') -@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter' +@click.option('-c', '--column_tol', default=0, help='Tolerance parameter' ' used to combine text horizontally, to generate columns.') @click.option('-plot', '--plot_type', type=click.Choice(['text', 'grid', 'contour', 'textedge']), diff --git a/camelot/core.py b/camelot/core.py index ac63e54..4e5869a 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -13,8 +13,6 @@ import pandas as pd # minimum number of vertical textline intersections for a textedge # to be considered valid TEXTEDGE_REQUIRED_ELEMENTS = 4 -# y coordinate tolerance for extending textedge -TEXTEDGE_EXTEND_TOLERANCE = 50 # padding added to table area on the left, right and bottom TABLE_AREA_PADDING = 10 @@ -55,11 +53,11 @@ class TextEdge(object): return ''.format( round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid) - def update_coords(self, x, y0): + def update_coords(self, x, y0, edge_tol=50): """Updates the text edge's x and bottom y 
coordinates and sets the is_valid attribute. """ - if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE): + if np.isclose(self.y0, y0, atol=edge_tol): self.x = (self.intersections * self.x + x) / float(self.intersections + 1) self.y0 = y0 self.intersections += 1 @@ -74,7 +72,8 @@ class TextEdges(object): the PDF page. The dict has three keys based on the alignments, and each key's value is a list of camelot.core.TextEdge objects. """ - def __init__(self): + def __init__(self, edge_tol=50): + self.edge_tol = edge_tol self._textedges = {'left': [], 'right': [], 'middle': []} @staticmethod @@ -115,7 +114,8 @@ class TextEdges(object): if idx is None: self.add(textline, align) else: - self._textedges[align][idx].update_coords(x_coord, textline.y0) + self._textedges[align][idx].update_coords( + x_coord, textline.y0, edge_tol=self.edge_tol) def generate(self, textlines): """Generates the text edges dict based on horizontal text @@ -359,7 +359,7 @@ class Table(object): cell.left = cell.right = cell.top = cell.bottom = True return self - def set_edges(self, vertical, horizontal, joint_close_tol=2): + def set_edges(self, vertical, horizontal, joint_tol=2): """Sets a cell's edges to True depending on whether the cell's coordinates overlap with the line's coordinates within a tolerance. 
@@ -376,11 +376,11 @@ class Table(object): # find closest x coord # iterate over y coords and find closest start and end points i = [i for i, t in enumerate(self.cols) - if np.isclose(v[0], t[0], atol=joint_close_tol)] + if np.isclose(v[0], t[0], atol=joint_tol)] j = [j for j, t in enumerate(self.rows) - if np.isclose(v[3], t[0], atol=joint_close_tol)] + if np.isclose(v[3], t[0], atol=joint_tol)] k = [k for k, t in enumerate(self.rows) - if np.isclose(v[1], t[0], atol=joint_close_tol)] + if np.isclose(v[1], t[0], atol=joint_tol)] if not j: continue J = j[0] @@ -427,11 +427,11 @@ class Table(object): # find closest y coord # iterate over x coords and find closest start and end points i = [i for i, t in enumerate(self.rows) - if np.isclose(h[1], t[0], atol=joint_close_tol)] + if np.isclose(h[1], t[0], atol=joint_tol)] j = [j for j, t in enumerate(self.cols) - if np.isclose(h[0], t[0], atol=joint_close_tol)] + if np.isclose(h[0], t[0], atol=joint_tol)] k = [k for k, t in enumerate(self.cols) - if np.isclose(h[2], t[0], atol=joint_close_tol)] + if np.isclose(h[2], t[0], atol=joint_tol)] if not j: continue J = j[0] diff --git a/camelot/handlers.py b/camelot/handlers.py index a312131..35708ee 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -125,7 +125,7 @@ class PDFHandler(object): with open(fpath, 'wb') as f: outfile.write(f) - def parse(self, flavor='lattice', suppress_stdout=False, **kwargs): + def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -136,6 +136,8 @@ class PDFHandler(object): Lattice is used by default. suppress_stdout : str (default: False) Suppress logs and warnings. + layout_kwargs : dict, optional (default: {}) + A dict of `pdfminer.layout.LAParams `_ kwargs. kwargs : dict See camelot.read_pdf kwargs. 
@@ -153,6 +155,7 @@ class PDFHandler(object): for p in self.pages] parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs) for p in pages: - t = parser.extract_tables(p, suppress_stdout=suppress_stdout) + t = parser.extract_tables(p, suppress_stdout=suppress_stdout, + layout_kwargs=layout_kwargs) tables.extend(t) return TableList(tables) diff --git a/camelot/io.py b/camelot/io.py index 4b436ff..96ffa27 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -6,7 +6,7 @@ from .utils import validate_input, remove_extra def read_pdf(filepath, pages='1', password=None, flavor='lattice', - suppress_stdout=False, **kwargs): + suppress_stdout=False, layout_kwargs={}, **kwargs): """Read PDF and return extracted tables. Note: kwargs annotated with ^ can only be used with flavor='stream' @@ -26,6 +26,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', Lattice is used by default. suppress_stdout : bool, optional (default: True) Print all logs and warnings. + layout_kwargs : dict, optional (default: {}) + A dict of `pdfminer.layout.LAParams `_ kwargs. table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom @@ -38,10 +40,13 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', flag_size : bool, optional (default: False) Flag text based on font size. Useful to detect super/subscripts. Adds around flagged text. - row_close_tol^ : int, optional (default: 2) + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. + row_tol^ : int, optional (default: 2) Tolerance parameter used to combine text vertically, to generate rows. - col_close_tol^ : int, optional (default: 0) + column_tol^ : int, optional (default: 0) Tolerance parameter used to combine text horizontally, to generate columns. 
process_background* : bool, optional (default: False) @@ -57,10 +62,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', shift_text* : list, optional (default: ['l', 't']) {'l', 'r', 't', 'b'} Direction in which text in a spanning cell will flow. - line_close_tol* : int, optional (default: 2) + line_tol* : int, optional (default: 2) Tolerance parameter used to merge close vertical and horizontal lines. - joint_close_tol* : int, optional (default: 2) + joint_tol* : int, optional (default: 2) Tolerance parameter used to decide whether the detected lines and points lie close to each other. threshold_blocksize* : int, optional (default: 15) @@ -77,10 +82,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', Number of times for erosion/dilation is applied. For more information, refer `OpenCV's dilate `_. - margins : tuple - PDFMiner char_margin, line_margin and word_margin. - - For more information, refer `PDFMiner docs `_. + resolution* : int, optional (default: 300) + Resolution used for PDF to PNG conversion. Returns ------- @@ -98,5 +101,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', validate_input(kwargs, flavor=flavor) p = PDFHandler(filepath, pages=pages, password=password) kwargs = remove_extra(kwargs, flavor=flavor) - tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, **kwargs) + tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, + layout_kwargs=layout_kwargs, **kwargs) return tables diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index ebc4564..a3280de 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -8,13 +8,11 @@ from ..utils import get_page_layout, get_text_objects class BaseParser(object): """Defines a base parser. 
""" - def _generate_layout(self, filename): + def _generate_layout(self, filename, layout_kwargs): self.filename = filename + self.layout_kwargs = layout_kwargs self.layout, self.dimensions = get_page_layout( - self.filename, - char_margin=self.char_margin, - line_margin=self.line_margin, - word_margin=self.word_margin) + filename, **layout_kwargs) self.horizontal_text = get_text_objects(self.layout, ltype="lh") self.vertical_text = get_text_objects(self.layout, ltype="lv") self.pdf_width, self.pdf_height = self.dimensions diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index c1b8e0d..1924d84 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -53,10 +53,13 @@ class Lattice(BaseParser): flag_size : bool, optional (default: False) Flag text based on font size. Useful to detect super/subscripts. Adds around flagged text. - line_close_tol : int, optional (default: 2) + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. + line_tol : int, optional (default: 2) Tolerance parameter used to merge close vertical and horizontal lines. - joint_close_tol : int, optional (default: 2) + joint_tol : int, optional (default: 2) Tolerance parameter used to decide whether the detected lines and points lie close to each other. threshold_blocksize : int, optional (default: 15) @@ -73,17 +76,15 @@ class Lattice(BaseParser): Number of times for erosion/dilation is applied. For more information, refer `OpenCV's dilate `_. - margins : tuple - PDFMiner char_margin, line_margin and word_margin. - - For more information, refer `PDFMiner docs `_. + resolution : int, optional (default: 300) + Resolution used for PDF to PNG conversion. 
""" def __init__(self, table_areas=None, process_background=False, line_size_scaling=15, copy_text=None, shift_text=['l', 't'], - split_text=False, flag_size=False, line_close_tol=2, - joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, - iterations=0, margins=(1.0, 0.5, 0.1), **kwargs): + split_text=False, flag_size=False, strip_text='', line_tol=2, + joint_tol=2, threshold_blocksize=15, threshold_constant=-2, + iterations=0, resolution=300, **kwargs): self.table_areas = table_areas self.process_background = process_background self.line_size_scaling = line_size_scaling @@ -91,12 +92,13 @@ class Lattice(BaseParser): self.shift_text = shift_text self.split_text = split_text self.flag_size = flag_size - self.line_close_tol = line_close_tol - self.joint_close_tol = joint_close_tol + self.strip_text = strip_text + self.line_tol = line_tol + self.joint_tol = joint_tol self.threshold_blocksize = threshold_blocksize self.threshold_constant = threshold_constant self.iterations = iterations - self.char_margin, self.line_margin, self.word_margin = margins + self.resolution = resolution @staticmethod def _reduce_index(t, idx, shift_text): @@ -245,9 +247,9 @@ class Lattice(BaseParser): rows.extend([tk[1], tk[3]]) # sort horizontal and vertical segments cols = merge_close_lines( - sorted(cols), line_close_tol=self.line_close_tol) + sorted(cols), line_tol=self.line_tol) rows = merge_close_lines( - sorted(rows, reverse=True), line_close_tol=self.line_close_tol) + sorted(rows, reverse=True), line_tol=self.line_tol) # make grid using x and y coord of shortlisted rows and cols cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] @@ -264,7 +266,7 @@ class Lattice(BaseParser): table = Table(cols, rows) # set table edges to True using ver+hor lines - table = table.set_edges(v_s, h_s, joint_close_tol=self.joint_close_tol) + table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol) # set table border edges to True table = table.set_border() # set spanning 
cells to True @@ -277,7 +279,7 @@ class Lattice(BaseParser): for t in self.t_bbox[direction]: indices, error = get_table_index( table, t, direction, split_text=self.split_text, - flag_size=self.flag_size) + flag_size=self.flag_size, strip_text=self.strip_text) if indices[:2] != (-1, -1): pos_errors.append(error) indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text) @@ -310,8 +312,8 @@ class Lattice(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False): - self._generate_layout(filename) + def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): + self._generate_layout(filename, layout_kwargs) if not suppress_stdout: logger.info('Processing {}'.format(os.path.basename(self.rootname))) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 4bf482d..049bc9f 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -38,29 +38,31 @@ class Stream(BaseParser): flag_size : bool, optional (default: False) Flag text based on font size. Useful to detect super/subscripts. Adds around flagged text. - row_close_tol : int, optional (default: 2) + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. + edge_tol : int, optional (default: 50) + Tolerance parameter for extending textedges vertically. + row_tol : int, optional (default: 2) Tolerance parameter used to combine text vertically, to generate rows. - col_close_tol : int, optional (default: 0) + column_tol : int, optional (default: 0) Tolerance parameter used to combine text horizontally, to generate columns. - margins : tuple, optional (default: (1.0, 0.5, 0.1)) - PDFMiner char_margin, line_margin and word_margin. - - For more information, refer `PDFMiner docs `_. 
""" def __init__(self, table_areas=None, columns=None, split_text=False, - flag_size=False, row_close_tol=2, col_close_tol=0, - margins=(1.0, 0.5, 0.1), **kwargs): + flag_size=False, strip_text='', edge_tol=50, row_tol=2, + column_tol=0, **kwargs): self.table_areas = table_areas self.columns = columns self._validate_columns() self.split_text = split_text self.flag_size = flag_size - self.row_close_tol = row_close_tol - self.col_close_tol = col_close_tol - self.char_margin, self.line_margin, self.word_margin = margins + self.strip_text = strip_text + self.edge_tol = edge_tol + self.row_tol = row_tol + self.column_tol = column_tol @staticmethod def _text_bbox(t_bbox): @@ -86,7 +88,7 @@ class Stream(BaseParser): return text_bbox @staticmethod - def _group_rows(text, row_close_tol=2): + def _group_rows(text, row_tol=2): """Groups PDFMiner text objects into rows vertically within a tolerance. @@ -94,7 +96,7 @@ class Stream(BaseParser): ---------- text : list List of PDFMiner text objects. - row_close_tol : int, optional (default: 2) + row_tol : int, optional (default: 2) Returns ------- @@ -110,7 +112,7 @@ class Stream(BaseParser): # if t.get_text().strip() and all([obj.upright for obj in t._objs if # type(obj) is LTChar]): if t.get_text().strip(): - if not np.isclose(row_y, t.y0, atol=row_close_tol): + if not np.isclose(row_y, t.y0, atol=row_tol): rows.append(sorted(temp, key=lambda t: t.x0)) temp = [] row_y = t.y0 @@ -120,7 +122,7 @@ class Stream(BaseParser): return rows @staticmethod - def _merge_columns(l, col_close_tol=0): + def _merge_columns(l, column_tol=0): """Merges column boundaries horizontally if they overlap or lie within a tolerance. @@ -128,7 +130,7 @@ class Stream(BaseParser): ---------- l : list List of column x-coordinate tuples. 
- col_close_tol : int, optional (default: 0) + column_tol : int, optional (default: 0) Returns ------- @@ -142,17 +144,17 @@ class Stream(BaseParser): merged.append(higher) else: lower = merged[-1] - if col_close_tol >= 0: + if column_tol >= 0: if (higher[0] <= lower[1] or - np.isclose(higher[0], lower[1], atol=col_close_tol)): + np.isclose(higher[0], lower[1], atol=column_tol)): upper_bound = max(lower[1], higher[1]) lower_bound = min(lower[0], higher[0]) merged[-1] = (lower_bound, upper_bound) else: merged.append(higher) - elif col_close_tol < 0: + elif column_tol < 0: if higher[0] <= lower[1]: - if np.isclose(higher[0], lower[1], atol=abs(col_close_tol)): + if np.isclose(higher[0], lower[1], atol=abs(column_tol)): merged.append(higher) else: upper_bound = max(lower[1], higher[1]) @@ -189,7 +191,7 @@ class Stream(BaseParser): return rows @staticmethod - def _add_columns(cols, text, row_close_tol): + def _add_columns(cols, text, row_tol): """Adds columns to existing list by taking into account the text that lies outside the current column x-coordinates. @@ -208,7 +210,7 @@ class Stream(BaseParser): """ if text: - text = Stream._group_rows(text, row_close_tol=row_close_tol) + text = Stream._group_rows(text, row_tol=row_tol) elements = [len(r) for r in text] new_cols = [(t.x0, t.x1) for r in text if len(r) == max(elements) for t in r] @@ -254,11 +256,10 @@ class Stream(BaseParser): Assumes that tables are situated relatively far apart vertically. 
""" - # TODO: add support for arabic text #141 # sort textlines in reading order textlines.sort(key=lambda x: (-x.y0, x.x0)) - textedges = TextEdges() + textedges = TextEdges(edge_tol=self.edge_tol) # generate left, middle and right textedges textedges.generate(textlines) # select relevant edges @@ -300,7 +301,7 @@ class Stream(BaseParser): self.t_bbox = t_bbox text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) - rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_close_tol=self.row_close_tol) + rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_tol=self.row_tol) rows = self._join_rows(rows_grouped, text_y_max, text_y_min) elements = [len(r) for r in rows_grouped] @@ -331,7 +332,7 @@ class Stream(BaseParser): warnings.warn("No tables found in table area {}".format( table_idx + 1)) cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] - cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol) + cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) inner_text = [] for i in range(1, len(cols)): left = cols[i - 1][1] @@ -343,7 +344,7 @@ class Stream(BaseParser): for t in self.t_bbox[direction] if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] inner_text.extend(outer_text) - cols = self._add_columns(cols, inner_text, self.row_close_tol) + cols = self._add_columns(cols, inner_text, self.row_tol) cols = self._join_columns(cols, text_x_min, text_x_max) return cols, rows @@ -359,7 +360,7 @@ class Stream(BaseParser): for t in self.t_bbox[direction]: indices, error = get_table_index( table, t, direction, split_text=self.split_text, - flag_size=self.flag_size) + flag_size=self.flag_size, strip_text=self.strip_text) if indices[:2] != (-1, -1): pos_errors.append(error) for r_idx, c_idx, text in indices: @@ -388,8 +389,8 @@ class Stream(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False): - self._generate_layout(filename) + def extract_tables(self, 
filename, suppress_stdout=False, layout_kwargs={}): + self._generate_layout(filename, layout_kwargs) if not suppress_stdout: logger.info('Processing {}'.format(os.path.basename(self.rootname))) diff --git a/camelot/utils.py b/camelot/utils.py index cd55e4e..88564f7 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -20,16 +20,16 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, stream_kwargs = [ 'columns', - 'row_close_tol', - 'col_close_tol' + 'row_tol', + 'column_tol' ] lattice_kwargs = [ 'process_background', 'line_size_scaling', 'copy_text', 'shift_text', - 'line_close_tol', - 'joint_close_tol', + 'line_tol', + 'joint_tol', 'threshold_blocksize', 'threshold_constant', 'iterations' @@ -281,14 +281,14 @@ def text_in_bbox(bbox, text): return t_bbox -def merge_close_lines(ar, line_close_tol=2): +def merge_close_lines(ar, line_tol=2): """Merges lines which are within a tolerance by calculating a moving mean, based on their x or y axis projections. Parameters ---------- ar : list - line_close_tol : int, optional (default: 2) + line_tol : int, optional (default: 2) Returns ------- @@ -301,7 +301,7 @@ def merge_close_lines(ar, line_close_tol=2): ret.append(a) else: temp = ret[-1] - if np.isclose(temp, a, atol=line_close_tol): + if np.isclose(temp, a, atol=line_tol): temp = (temp + a) / 2.0 ret[-1] = temp else: @@ -309,7 +309,12 @@ def merge_close_lines(ar, line_close_tol=2): return ret -def flag_font_size(textline, direction): +# TODO: combine the following functions into a TextProcessor class which +# applies corresponding transformations sequentially +# (inspired from sklearn.pipeline.Pipeline) + + +def flag_font_size(textline, direction, strip_text=''): """Flags super/subscripts in text by enclosing them with . May give false positives. @@ -319,6 +324,9 @@ def flag_font_size(textline, direction): List of PDFMiner LTChar objects. direction : string Direction of the PDFMiner LTTextLine object. 
+ strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. Returns ------- @@ -344,13 +352,13 @@ def flag_font_size(textline, direction): fchars = [t[0] for t in chars] if ''.join(fchars).strip(): flist.append(''.join(fchars)) - fstring = ''.join(flist) + fstring = ''.join(flist).strip(strip_text) else: - fstring = ''.join([t.get_text() for t in textline]) + fstring = ''.join([t.get_text() for t in textline]).strip(strip_text) return fstring -def split_textline(table, textline, direction, flag_size=False): +def split_textline(table, textline, direction, flag_size=False, strip_text=''): """Splits PDFMiner LTTextLine into substrings if it spans across multiple rows/columns. @@ -365,6 +373,9 @@ def split_textline(table, textline, direction, flag_size=False): Whether or not to highlight a substring using if its size is different from rest of the string. (Useful for super and subscripts.) + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. Returns ------- @@ -416,14 +427,15 @@ def split_textline(table, textline, direction, flag_size=False): grouped_chars = [] for key, chars in groupby(cut_text, itemgetter(0, 1)): if flag_size: - grouped_chars.append((key[0], key[1], flag_font_size([t[2] for t in chars], direction))) + grouped_chars.append((key[0], key[1], + flag_font_size([t[2] for t in chars], direction, strip_text=strip_text))) else: gchars = [t[2].get_text() for t in chars] - grouped_chars.append((key[0], key[1], ''.join(gchars))) + grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text))) return grouped_chars -def get_table_index(table, t, direction, split_text=False, flag_size=False): +def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',): """Gets indices of the table cell where given text object lies by comparing their y and x-coordinates. 
@@ -441,6 +453,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False): Whether or not to highlight a substring using if its size is different from rest of the string. (Useful for super and subscripts) + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. Returns ------- @@ -495,12 +510,12 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False): error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea if split_text: - return split_textline(table, t, direction, flag_size=flag_size), error + return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error else: if flag_size: - return [(r_idx, c_idx, flag_font_size(t._objs, direction))], error + return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error else: - return [(r_idx, c_idx, t.get_text())], error + return [(r_idx, c_idx, t.get_text().strip(strip_text))], error def compute_accuracy(error_weights): @@ -558,7 +573,7 @@ def compute_whitespace(d): def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1, - detect_vertical=True, all_texts=True): + detect_vertical=True, all_texts=True): """Returns a PDFMiner LTPage object and page dimension of a single page pdf. See https://euske.github.io/pdfminer/ to get definitions of kwargs. 
diff --git a/docs/_static/png/edge_tol_1.png b/docs/_static/png/edge_tol_1.png new file mode 100644 index 0000000..f7f7a67 Binary files /dev/null and b/docs/_static/png/edge_tol_1.png differ diff --git a/docs/_static/png/edge_tol_2.png b/docs/_static/png/edge_tol_2.png new file mode 100644 index 0000000..a5ec743 Binary files /dev/null and b/docs/_static/png/edge_tol_2.png differ diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index 37e8d01..ca40bb8 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -316,10 +316,87 @@ You can solve this by passing ``flag_size=True``, which will enclose the supersc "Madhya Pradesh","27.13","23.57","-","-","3.56","0.38","-","1.86","-","1.28" "...","...","...","...","...","...","...","...","...","...","..." -Control how text is grouped into rows -------------------------------------- +Strip characters from text +-------------------------- -You can pass ``row_close_tol=<+int>`` to group the rows closer together, as shown below. +You can strip unwanted characters like spaces, dots and newlines from a string using the ``strip_text`` keyword argument. Take a look at `this PDF `_ as an example, the text at the start of each row contains a lot of unwanted spaces, dots and newlines. + +:: + + >>> tables = camelot.read_pdf('12s0324.pdf', flavor='stream', strip_text=' .\n') + >>> tables[0].df + +.. tip:: + Here's how you can do the same with the :ref:`command-line interface `. + :: + + $ camelot -strip ' .\n' stream 12s0324.pdf + +.. csv-table:: + + "...","...","...","...","...","...","...","...","...","..." 
+ "Forcible rape","17.5","2.6","14.9","17.2","2.5","14.7","–","–","–" + "Robbery","102.1","25.5","76.6","90.0","22.9","67.1","12.1","2.5","9.5" + "Aggravated assault","338.4","40.1","298.3","264.0","30.2","233.8","74.4","9.9","64.5" + "Property crime","1,396 .4","338 .7","1,057 .7","875 .9","210 .8","665 .1","608 .2","127 .9","392 .6" + "Burglary","240.9","60.3","180.6","205.0","53.4","151.7","35.9","6.9","29.0" + "...","...","...","...","...","...","...","...","...","..." +Improve guessed table areas +--------------------------- + +While using :ref:`Stream `, automatic table detection can fail for PDFs like `this one `_. That's because the text is relatively far apart vertically, which can lead to shorter textedges being calculated. + +.. note:: To know more about how textedges are calculated to guess table areas, you can see pages 20, 35 and 40 of `Anssi Nurminen's master's thesis `_. + +Let's see the table area that is detected by default. + +:: + + >>> tables = camelot.read_pdf('edge_tol.pdf', flavor='stream') + >>> camelot.plot(tables[0], kind='contour') + >>> plt.show() + +.. tip:: + Here's how you can do the same with the :ref:`command-line interface `. + :: + + $ camelot stream -plot contour edge_tol.pdf + +.. figure:: ../_static/png/edge_tol_1.png + :height: 674 + :width: 1366 + :scale: 50% + :alt: Table area with default edge_tol + :align: left + +To improve the detected area, you can increase the ``edge_tol`` (default: 50) value to counter the effect of text being placed relatively far apart vertically. Larger ``edge_tol`` will lead to longer textedges being detected, leading to an improved guess of the table area. Let's use a value of 500. + +:: + + >>> tables = camelot.read_pdf('edge_tol.pdf', flavor='stream', edge_tol=500) + >>> camelot.plot(tables[0], kind='contour') + >>> plt.show() + +.. tip:: + Here's how you can do the same with the :ref:`command-line interface `. + :: + + $ camelot stream -e 500 -plot contour edge_tol.pdf + +.. 
figure:: ../_static/png/edge_tol_2.png + :height: 674 + :width: 1366 + :scale: 50% + :alt: Table area with edge_tol=500 + :align: left + +As you can see, the guessed table area has improved! + +Improve guessed table rows +-------------------------- + +You can pass ``row_tol=<+int>`` to group the rows closer together, as shown below. :: @@ -337,7 +414,7 @@ You can pass ``row_close_tol=<+int>`` to group the rows closer together, as show :: - >>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_close_tol=10) + >>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_tol=10) >>> tables[0].df .. tip:: @@ -524,3 +601,14 @@ We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in "4","West Bengal","West Medinipur","iv. Acute Diarrhoeal Disease","145","0","04/01/14","05/01/14","Under control","..." "4","West Bengal","Birbhum","v. Food Poisoning","199","0","31/12/13","31/12/13","Under control","..." "4","West Bengal","Howrah","vi. Viral Hepatitis A &E","85","0","26/12/13","27/12/13","Under surveillance","..." + +Tweak layout generation +----------------------- + +Camelot is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences. In some cases (such as `#170 `_ and `#215 `_), PDFMiner can group characters that should belong to the same sentence into separate sentences. + +To deal with such cases, you can tweak PDFMiner's `LAParams kwargs `_ to improve layout generation, by passing the keyword arguments as a dict using ``layout_kwargs`` in :meth:`read_pdf() `. To know more about the parameters you can tweak, you can check out `PDFMiner docs `_. 
+ +:: + + >>> tables = camelot.read_pdf('foo.pdf', layout_kwargs={'detect_vertical': False}) diff --git a/tests/data.py b/tests/data.py index 677c58b..c223227 100755 --- a/tests/data.py +++ b/tests/data.py @@ -312,6 +312,63 @@ data_stream_flag_size = [ ["ALL STATES", "513.38", "436.02", "-", "25.57", "51.06", "14.18", "-", "8.21", "11.83", "11.08"] ] +data_stream_strip_text = [ + ["V i n s a u Ve r r e", ""], + ["Les Blancs", "12.5CL"], + ["A.O.P Côtes du Rhône", ""], + ["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"], + ["A.O.P Vacqueyras", ""], + ["Domaine de Montvac « Melodine » 2016", "10 €"], + ["A.O.P Châteauneuf du Pape", ""], + ["Domaine de Beaurenard 2017", "13 €"], + ["A.O.P Côteaux du Languedoc", ""], + ["Villa Tempora « Un temps pour elle » 2014", "9 €"], + ["A.O.P Côtes de Provence", ""], + ["Château Grand Boise 2017", "9 €"], + ["Les Rosés", "12,5 CL"], + ["A.O.P Côtes du Rhône", ""], + ["Domaine de la Florane « A fleur de Pampre » 2016", "8 €"], + ["Famille Coulon (Domaine Beaurenard) Biotifulfox 2017", "8 €"], + ["A.O.P Vacqueyras", ""], + ["Domaine de Montvac 2017", "9 €"], + ["A.O.P Languedoc", ""], + ["Domaine de Joncas « Nébla » 2015", "8 €"], + ["Villa Tempora « L’arroseur arrosé » 2015", "9 €"], + ["A.O.P Côtes de Provence", ""], + ["Château Grand Boise « Sainte Victoire » 2017", "9 €"], + ["Château Léoube 2016", "10 €"] +] + +data_stream_edge_tol = [ + ["Key figures", ""], + ["", "2016"], + ["(all amounts in EUR)", ""], + ["C\nlass A", ""], + ["N\net Asset Value at 31 December", "5,111,372"], + ["N\number of outstanding units at 31 December", "49,136"], + ["N\net Asset Value per unit at 31 December", "104.03"], + ["C\nlass B", ""], + ["N\net Asset Value at 31 December", "49,144,825"], + ["N\number of outstanding units at 31 December", "471,555"], + ["N\net Asset Value per unit at 31 December", "104.22"], + ["T\notal for the Fund", ""], + ["N\net Asset Value at 31 December", "54,256,197"], + ["N\number of outstanding units at 
31 December", "520,691"], + ["I\nnvestment result", ""], + ["Direct result", "-"], + ["Revaluation", "2,076,667"], + ["Costs", "(106,870)"], + ["T\notal investment result for the period1", "1,969,797"], + ["I\nnvestment result per unit2", ""], + ["Direct result", "-"], + ["Revaluation", "3.99"], + ["Costs", "(0.21)"], + ["T\notal investment result per unit", "3.78"], + ["1 The results cover the period from inception of the Fund at 8 April 2016 through 31 December 2016.", ""], + ["2 The result per unit is calculated using the total number of outstanding unit as per the end of the", ""], + ["period.", ""] +] + data_lattice = [ ["Cycle \nName", "KI \n(1/km)", "Distance \n(mi)", "Percent Fuel Savings", "", "", ""], ["", "", "", "Improved \nSpeed", "Decreased \nAccel", "Eliminate \nStops", "Decreased \nIdle"], @@ -485,9 +542,49 @@ data_lattice_shift_text_right_bottom = [ ] data_arabic = [ - ['ً\n\xa0\nﺎﺒﺣﺮﻣ', 'ﻥﺎﻄﻠﺳ\xa0ﻲﻤﺳﺍ'], - ['ﻝﺎﻤﺸﻟﺍ\xa0ﺎﻨﻴﻟﻭﺭﺎﻛ\xa0ﺔﻳﻻﻭ\xa0ﻦﻣ\xa0ﺎﻧﺍ', '؟ﺖﻧﺍ\xa0ﻦﻳﺍ\xa0ﻦﻣ'], - ['1234', 'ﻂﻄﻗ\xa047\xa0ﻱﺪﻨﻋ'], - ['؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ', 'ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ'], - ['Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic', ''] + ["ً\n\xa0\nﺎﺒﺣﺮﻣ", "ﻥﺎﻄﻠﺳ\xa0ﻲﻤﺳﺍ"], + ["ﻝﺎﻤﺸﻟﺍ\xa0ﺎﻨﻴﻟﻭﺭﺎﻛ\xa0ﺔﻳﻻﻭ\xa0ﻦﻣ\xa0ﺎﻧﺍ", "؟ﺖﻧﺍ\xa0ﻦﻳﺍ\xa0ﻦﻣ"], + ["1234", "ﻂﻄﻗ\xa047\xa0ﻱﺪﻨﻋ"], + ["؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ", "ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ"], + ["Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic", ""] +] + +data_stream_layout_kwargs = [ + ["V i n s a u Ve r r e", ""], + ["Les Blancs", "12.5CL"], + ["A.O.P Côtes du Rhône", ""], + ["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"], + ["A.O.P Vacqueyras", ""], + ["Domaine de Montvac « Melodine » 2016", "10 €"], + ["A.O.P Châteauneuf du Pape", ""], + ["Domaine de Beaurenard 2017", "13 €"], + ["A.O.P Côteaux du Languedoc", ""], + ["Villa Tempora « Un temps pour elle » 2014", "9 €"], + ["A.O.P Côtes de Provence", ""], + ["Château Grand Boise 2017", "9 €"], + ["Les Rosés", "12,5 CL"], + ["A.O.P Côtes du Rhône", ""], + ["Domaine de la 
Florane « A fleur de Pampre » 2016", "8 €"], + ["Famille Coulon (Domaine Beaurenard) Biotifulfox 2017", "8 €"], + ["A.O.P Vacqueyras", ""], + ["Domaine de Montvac 2017", "9 €"], + ["A.O.P Languedoc", ""], + ["Domaine de Joncas « Nébla » 2015", "8 €"], + ["Villa Tempora « L’arroseur arrosé » 2015", "9 €"], + ["A.O.P Côtes de Provence", ""], + ["Château Grand Boise « Sainte Victoire » 2017", "9 €"], + ["Château Léoube 2016", "10 €"], + ["Les Rouges", "12,CL"], + ["A.O.P Côtes du Rhône", ""], + ["Domaine de Dionysos « La Cigalette »", "8 €"], + ["Château Saint Estève d’Uchaux « Grande Réserve » 2014", "9 €"], + ["Domaine de la Guicharde « Cuvée Massillan » 2016", "9 €"], + ["Domaine de la Florane « Terre Pourpre » 2014", "10 €"], + ["L’Oratoire St Martin « Réserve des Seigneurs » 2015", "11 €"], + ["A.O.P Saint Joseph", ""], + ["Domaine Monier Perréol « Châtelet » 2015", "13 €"], + ["A.O.P Châteauneuf du Pape", ""], + ["Domaine de Beaurenard 2011", "15 €"], + ["A.O.P Cornas", ""], + ["Domaine Lionnet « Terre Brûlée » 2012", "15 €"] ] diff --git a/tests/files/baseline_plots/test_grid_plot.png b/tests/files/baseline_plots/test_grid_plot.png index 4487eb3..3b835f5 100644 Binary files a/tests/files/baseline_plots/test_grid_plot.png and b/tests/files/baseline_plots/test_grid_plot.png differ diff --git a/tests/files/baseline_plots/test_joint_plot.png b/tests/files/baseline_plots/test_joint_plot.png index 934aa74..e9e40ec 100644 Binary files a/tests/files/baseline_plots/test_joint_plot.png and b/tests/files/baseline_plots/test_joint_plot.png differ diff --git a/tests/files/baseline_plots/test_lattice_contour_plot.png b/tests/files/baseline_plots/test_lattice_contour_plot.png index 57b3962..a8d3326 100644 Binary files a/tests/files/baseline_plots/test_lattice_contour_plot.png and b/tests/files/baseline_plots/test_lattice_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_line_plot.png b/tests/files/baseline_plots/test_line_plot.png index a7ac276..e8099ce 
100644 Binary files a/tests/files/baseline_plots/test_line_plot.png and b/tests/files/baseline_plots/test_line_plot.png differ diff --git a/tests/files/detect_vertical_false.pdf b/tests/files/detect_vertical_false.pdf new file mode 100644 index 0000000..17d8a0d Binary files /dev/null and b/tests/files/detect_vertical_false.pdf differ diff --git a/tests/files/edge_tol.pdf b/tests/files/edge_tol.pdf new file mode 100755 index 0000000..c4f43cd Binary files /dev/null and b/tests/files/edge_tol.pdf differ diff --git a/tests/test_common.py b/tests/test_common.py index 5f8c81c..83c436b 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -81,7 +81,7 @@ def test_stream_columns(): filename = os.path.join(testdir, "mexican_towns.pdf") tables = camelot.read_pdf( - filename, flavor="stream", columns=["67,180,230,425,475"], row_close_tol=10) + filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10) assert df.equals(tables[0].df) @@ -102,6 +102,31 @@ def test_stream_flag_size(): assert df.equals(tables[0].df) +def test_stream_strip_text(): + df = pd.DataFrame(data_stream_strip_text) + + filename = os.path.join(testdir, "detect_vertical_false.pdf") + tables = camelot.read_pdf(filename, flavor="stream", strip_text="\n") + assert df.equals(tables[0].df) + + +def test_stream_edge_tol(): + df = pd.DataFrame(data_stream_edge_tol) + + filename = os.path.join(testdir, "edge_tol.pdf") + tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500) + assert df.equals(tables[0].df) + + +def test_stream_layout_kwargs(): + df = pd.DataFrame(data_stream_layout_kwargs) + + filename = os.path.join(testdir, "detect_vertical_false.pdf") + tables = camelot.read_pdf( + filename, flavor="stream", layout_kwargs={"detect_vertical": False}) + assert df.equals(tables[0].df) + + def test_lattice(): df = pd.DataFrame(data_lattice) @@ -179,7 +204,7 @@ def test_repr(): tables = camelot.read_pdf(filename) assert repr(tables) == "" assert repr(tables[0]) == "" - assert 
repr(tables[0].cells[0][0]) == "" + assert repr(tables[0].cells[0][0]) == "" def test_arabic():