diff --git a/camelot/core.py b/camelot/core.py index 5712e65..cef90c4 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -431,6 +431,12 @@ class Table(object): self.whitespace = compute_whitespace(data) self.pdf_size = (parser.pdf_width, parser.pdf_height) + _text = [] + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.horizontal_text]) + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.vertical_text]) + self._text = _text + + def get_pdf_image(self): """Compute pdf image and cache it """ diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 66dd98c..e47e1eb 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -12,7 +12,7 @@ from ..utils import ( scale_image, scale_pdf, segments_in_bbox, - text_in_bbox, + text_in_bbox_per_axis, merge_close_lines, ) from ..image_processing import ( @@ -252,19 +252,17 @@ class Lattice(BaseParser): table_bbox, vertical_segments, horizontal_segments, pdf_scalers ) - def _generate_columns_and_rows(self, table_idx, tk): + + def _generate_columns_and_rows(self, tk): # select elements which lie within table_bbox - t_bbox = {} v_s, h_s = segments_in_bbox( tk, self.vertical_segments, self.horizontal_segments ) - t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text) - t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text) - - t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0)) - t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0)) - - self.t_bbox = t_bbox + self.t_bbox = text_in_bbox_per_axis( + tk, + self.horizontal_text, + self.vertical_text + ) cols, rows = zip(*self.table_bbox[tk]) cols, rows = list(cols), list(rows) @@ -299,10 +297,6 @@ class Lattice(BaseParser): table.record_parse_metadata(self) # for plotting - _text = [] - _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) - _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) - table._text = _text table._image = self.pdf_image # Reuse the image used for calc table._bbox_unscaled = self.table_bbox_unscaled table._segments = (self.vertical_segments, self.horizontal_segments) @@ -321,8 +315,7 @@ class Lattice(BaseParser): for table_idx, tk in enumerate( sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) ): - cols, rows, v_s, h_s = self._generate_columns_and_rows( - table_idx, tk) + cols, rows, v_s, h_s = self._generate_columns_and_rows(tk) table = self._generate_table( table_idx, cols, rows, v_s=v_s, h_s=h_s) table._bbox = tk diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 2badb39..087f5ce 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -7,7 +7,7 @@ import numpy as np from .base import BaseParser from ..core import TextEdges -from ..utils import (text_in_bbox) +from ..utils import (text_in_bbox, text_in_bbox_per_axis) class Stream(BaseParser): @@ -331,14 +331,11 @@ class Stream(BaseParser): def _generate_columns_and_rows(self, table_idx, tk): # select elements which lie within table_bbox - t_bbox = {} - t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text) - t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text) - - t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0)) - t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0)) - - self.t_bbox = t_bbox + self.t_bbox = text_in_bbox_per_axis( + tk, + self.horizontal_text, + self.vertical_text + ) text_x_min, text_y_min, text_x_max, text_y_max = \ self._text_bbox(self.t_bbox) @@ -415,10 +412,6 @@ class Stream(BaseParser): table.record_parse_metadata(self) # for plotting - _text = [] - _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) - _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) - table._text = _text table._bbox = self.table_bbox table._segments = None table._textedges = self.textedges @@ -435,12 +428,12 @@ class Stream(BaseParser): _tables = [] # sort tables based on y-coord - for table_idx, tk in enumerate( + for table_idx, bbox in enumerate( sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) ): - cols, rows = self._generate_columns_and_rows(table_idx, tk) + cols, rows = self._generate_columns_and_rows(table_idx, bbox) table = self._generate_table(table_idx, cols, rows) - table._bbox = tk + table._bbox = bbox _tables.append(table) return _tables diff --git a/camelot/utils.py b/camelot/utils.py index e6f8e50..7e789b2 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -417,6 +417,35 @@ def text_in_bbox(bbox, text): return t_bbox +def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text): + """Returns all text objects present inside a bounding box, split between + horizontal and vertical text. + + Parameters + ---------- + bbox : tuple + Tuple (x1, y1, x2, y2) representing a bounding box where + (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate + space. + horizontal_text : List of PDFMiner text objects. + vertical_text : List of PDFMiner text objects. + + Returns + ------- + t_bbox : dict + Dict of lists of PDFMiner text objects that lie inside table, with one + key each for "horizontal" and "vertical" + + """ + t_bbox = {} + t_bbox["horizontal"] = text_in_bbox(bbox, horizontal_text) + t_bbox["vertical"] = text_in_bbox(bbox, vertical_text) + + t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0)) + t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0)) + return t_bbox + + def bbox_from_text(textlines): """Returns the smallest bbox containing all the text objects passed as a parameters.