diff --git a/camelot/core.py b/camelot/core.py index 0cd7fa6..ee9adcd 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -171,11 +171,10 @@ class TextAlignments(): idx_insert = None if idx_closest is None: idx_insert = 0 - elif np.isclose( - alignment_array[idx_closest].coord, - coord, - atol=0.5 - ): + # Note: np.isclose is slow! + elif coord - 0.5 < \ + alignment_array[idx_closest].coord < \ + coord + 0.5: self._update_alignment( alignment_array[idx_closest], coord, @@ -460,7 +459,8 @@ class Table(): self._image = None self._image_path = None # Temporary file to hold an image of the pdf - self._text = [] # List of text box coordinates + self._text = [] # List of text box coordinates + self.textlines = [] # List of actual textlines on the page def __repr__(self): return "<{} shape={}>".format(self.__class__.__name__, self.shape) diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index 4c18d77..0ad4c42 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -8,13 +8,11 @@ import pandas as pd from ..utils import ( bbox_from_str, - bbox_from_textlines, compute_accuracy, compute_whitespace, get_text_objects, get_table_index, text_in_bbox, - text_in_bbox_per_axis, ) from ..core import Table @@ -243,6 +241,7 @@ class BaseParser(): [(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) table._text = _text + table.textlines = self.horizontal_text + self.vertical_text class TextBaseParser(BaseParser): @@ -454,84 +453,6 @@ class TextBaseParser(BaseParser): raise ValueError("Length of table_areas and columns" " should be equal") - def _generate_columns_and_rows(self, bbox, table_idx): - # select elements which lie within table_bbox - self.t_bbox = text_in_bbox_per_axis( - bbox, - self.horizontal_text, - self.vertical_text - ) - - text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines( - self.t_bbox["horizontal"] + self.t_bbox["vertical"] - ) - rows_grouped = self._group_rows( - self.t_bbox["horizontal"], row_tol=self.row_tol) - rows = self._join_rows(rows_grouped, text_y_max, text_y_min) - elements = [len(r) for r in rows_grouped] - - if self.columns is not None and self.columns[table_idx] != "": - # user has to input boundary columns too - # take (0, pdf_width) by default - # similar to else condition - # len can't be 1 - cols = self.columns[table_idx].split(",") - cols = [float(c) for c in cols] - cols.insert(0, text_x_min) - cols.append(text_x_max) - cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] - else: - # calculate mode of the list of number of elements in - # each row to guess the number of columns - ncols = max(set(elements), key=elements.count) - if ncols == 1: - # if mode is 1, the page usually contains not tables - # but there can be cases where the list can be skewed, - # try to remove all 1s from list in this case and - # see if the list contains elements, if yes, then use - # the mode after removing 1s - elements = list(filter(lambda x: x != 1, elements)) - if elements: - ncols = max(set(elements), key=elements.count) - else: - warnings.warn( - "No tables found in table area {}" - .format(table_idx + 1) - ) - cols = [ - (t.x0, t.x1) - for r in rows_grouped - if len(r) == ncols - for t in r - ] - cols = self._merge_columns( - sorted(cols), - column_tol=self.column_tol - ) - inner_text = [] - for i in range(1, len(cols)): - left = cols[i - 1][1] - right = cols[i][0] - inner_text.extend( - [ - t - for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > left and t.x1 < right - ] - ) - outer_text = [ - t - for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > cols[-1][1] or t.x1 < cols[0][0] - ] - inner_text.extend(outer_text) - cols = self._add_columns(cols, inner_text, self.row_tol) - cols = self._join_columns(cols, text_x_min, text_x_max) - - return cols, rows, None, None - def record_parse_metadata(self, table): """Record data about the origin of the table """ diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index bff0d58..b686df0 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -6,6 +6,7 @@ from __future__ import division import copy import math import numpy as np +import warnings from .base import TextBaseParser from ..core import ( @@ -20,7 +21,8 @@ from ..utils import ( text_in_bbox, bbox_from_textlines, distance_tl_to_bbox, - find_columns_coordinates + find_columns_coordinates, + text_in_bbox_per_axis, ) # maximum number of columns over which a header can spread @@ -574,3 +576,91 @@ class Hybrid(TextBaseParser): lambda tl: tl not in textlines_processed, textlines )) + + def _generate_columns_and_rows(self, bbox, table_idx): + # select elements which lie within table_bbox + self.t_bbox = text_in_bbox_per_axis( + bbox, + self.horizontal_text, + self.vertical_text + ) + + all_tls = list( + filter( + lambda tl: len(tl.get_text().strip()) > 0, + self.t_bbox["horizontal"] # + self.t_bbox["vertical"] + ) + ) + text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines( + all_tls + ) + # FRHTODO: + # This algorithm takes the horizontal textlines in the bbox, and groups + # them into rows based on their bottom y0. + # That's wrong: it misses the vertical items, and misses out on all + # the alignment identification work we've done earlier. + rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol) + rows = self._join_rows(rows_grouped, text_y_max, text_y_min) + elements = [len(r) for r in rows_grouped] + + if self.columns is not None and self.columns[table_idx] != "": + # user has to input boundary columns too + # take (0, pdf_width) by default + # similar to else condition + # len can't be 1 + cols = self.columns[table_idx].split(",") + cols = [float(c) for c in cols] + cols.insert(0, text_x_min) + cols.append(text_x_max) + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + else: + # calculate mode of the list of number of elements in + # each row to guess the number of columns + ncols = max(set(elements), key=elements.count) + if ncols == 1: + # if mode is 1, the page usually contains not tables + # but there can be cases where the list can be skewed, + # try to remove all 1s from list in this case and + # see if the list contains elements, if yes, then use + # the mode after removing 1s + elements = list(filter(lambda x: x != 1, elements)) + if elements: + ncols = max(set(elements), key=elements.count) + else: + warnings.warn( + "No tables found in table area {}" + .format(table_idx + 1) + ) + cols = [ + (t.x0, t.x1) + for r in rows_grouped + if len(r) == ncols + for t in r + ] + cols = self._merge_columns( + sorted(cols), + column_tol=self.column_tol + ) + inner_text = [] + for i in range(1, len(cols)): + left = cols[i - 1][1] + right = cols[i][0] + inner_text.extend( + [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > left and t.x1 < right + ] + ) + outer_text = [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > cols[-1][1] or t.x1 < cols[0][0] + ] + inner_text.extend(outer_text) + cols = self._add_columns(cols, inner_text, self.row_tol) + cols = self._join_columns(cols, text_x_min, text_x_max) + + return cols, rows, None, None diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 988490f..6a1da23 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -2,11 +2,15 @@ from __future__ import division +import warnings + from .base import TextBaseParser from ..core import TextEdges from ..utils import ( bbox_from_str, - text_in_bbox + bbox_from_textlines, + text_in_bbox, + text_in_bbox_per_axis, ) @@ -124,3 +128,86 @@ class Stream(TextBaseParser): for area_str in self.table_areas: table_bbox[bbox_from_str(area_str)] = None self.table_bbox = table_bbox + + def _generate_columns_and_rows(self, bbox, table_idx): + # select elements which lie within table_bbox + self.t_bbox = text_in_bbox_per_axis( + bbox, + self.horizontal_text, + self.vertical_text + ) + + text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines( + self.t_bbox["horizontal"] + self.t_bbox["vertical"] + ) + # FRHTODO: + # This algorithm takes the horizontal textlines in the bbox, and groups + # them into rows based on their bottom y0. + # That's wrong: it misses the vertical items, and misses out on all + # the alignment identification work we've done earlier. + rows_grouped = self._group_rows( + self.t_bbox["horizontal"], row_tol=self.row_tol) + rows = self._join_rows(rows_grouped, text_y_max, text_y_min) + elements = [len(r) for r in rows_grouped] + + if self.columns is not None and self.columns[table_idx] != "": + # user has to input boundary columns too + # take (0, pdf_width) by default + # similar to else condition + # len can't be 1 + cols = self.columns[table_idx].split(",") + cols = [float(c) for c in cols] + cols.insert(0, text_x_min) + cols.append(text_x_max) + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + else: + # calculate mode of the list of number of elements in + # each row to guess the number of columns + ncols = max(set(elements), key=elements.count) + if ncols == 1: + # if mode is 1, the page usually contains not tables + # but there can be cases where the list can be skewed, + # try to remove all 1s from list in this case and + # see if the list contains elements, if yes, then use + # the mode after removing 1s + elements = list(filter(lambda x: x != 1, elements)) + if elements: + ncols = max(set(elements), key=elements.count) + else: + warnings.warn( + "No tables found in table area {}" + .format(table_idx + 1) + ) + cols = [ + (t.x0, t.x1) + for r in rows_grouped + if len(r) == ncols + for t in r + ] + cols = self._merge_columns( + sorted(cols), + column_tol=self.column_tol + ) + inner_text = [] + for i in range(1, len(cols)): + left = cols[i - 1][1] + right = cols[i][0] + inner_text.extend( + [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > left and t.x1 < right + ] + ) + outer_text = [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > cols[-1][1] or t.x1 < cols[0][0] + ] + inner_text.extend(outer_text) + cols = self._add_columns(cols, inner_text, self.row_tol) + cols = self._join_columns(cols, text_x_min, text_x_max) + + return cols, rows, None, None diff --git a/camelot/plotting.py b/camelot/plotting.py index d3d7064..6bf6102 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -8,7 +8,20 @@ except ImportError: else: _HAS_MPL = True -from .utils import (bbox_from_str, get_textline_coords) +from .utils import (bbox_from_str, bbox_from_textlines, get_textline_coords) + +from pdfminer.layout import ( + LTTextLineVertical, +) + + +def extend_axe_lim(ax, bbox, margin=10): + """Ensure the ax limits include the input bbox + """ + x0, x1 = ax.get_xlim() + y0, y1 = ax.get_ylim() + ax.set_xlim(min(x0, bbox[0] - margin), max(x1, bbox[2] + margin)) + ax.set_ylim(min(y0, bbox[1] - margin), max(y1, bbox[3] + margin)) def draw_labeled_bbox( @@ -17,6 +30,8 @@ def draw_labeled_bbox( linestyle="solid", label_pos="top,left" ): + """Utility drawing function to draw a box with an associated text label + """ ax.add_patch( patches.Rectangle( (bbox[0], bbox[1]), @@ -80,6 +95,37 @@ def draw_pdf(table, ax, to_pdf_scale=True): def draw_parse_constraints(table, ax): """Draw any user provided constraints (area, region, columns, etc) + Parameters + ---------- + table : camelot.core.Table + + ax : matplotlib.axes.Axes (optional) + + """ + if table.parse_details: + zone_constraints = { + "region": "table_regions", + "area": "table_areas", + } + for zone_name, zone_id in zone_constraints.items(): + # Display a bbox per region / area + for zone_str in table.parse_details[zone_id] or []: + draw_labeled_bbox( + ax, bbox_from_str(zone_str), + "{zone_name}: ({zone_str})".format( + zone_name=zone_name, + zone_str=zone_str + ), + color="purple", + linestyle="dotted", + linewidth=1, + label_pos="bottom,right" + ) + + +def draw_text(table, ax): + """Draw text, horizontal in blue, vertical in red + Parameters ---------- table : camelot.core.Table @@ -88,27 +134,19 @@ def draw_parse_constraints(table, ax): ax : matplotlib.axes.Axes """ - if table.parse_details: - # Display a bbox per region - for region_str in table.parse_details["table_regions"] or []: - draw_labeled_bbox( - ax, bbox_from_str(region_str), - "region: ({region_str})".format(region_str=region_str), - color="purple", - linestyle="dotted", - linewidth=1, - label_pos="bottom,right" - ) - # Display a bbox per area - for area_str in table.parse_details["table_areas"] or []: - draw_labeled_bbox( - ax, bbox_from_str(area_str), - "area: ({area_str})".format(area_str=area_str), - color="pink", - linestyle="dotted", - linewidth=1, - label_pos="bottom,right" + bbox = bbox_from_textlines(table.textlines) + for t in table.textlines: + color = "red" if isinstance(t, LTTextLineVertical) else "blue" + ax.add_patch( + patches.Rectangle( + (t.x0, t.y0), + t.x1 - t.x0, + t.y1 - t.y0, + color=color, + alpha=0.2 + ) ) + extend_axe_lim(ax, bbox) def prepare_plot(table, ax=None, to_pdf_scale=True): @@ -188,20 +226,7 @@ class PlotMethods(): """ ax = prepare_plot(table, ax) - xs, ys = [], [] - for t in table._text: - xs.extend([t[0], t[2]]) - ys.extend([t[1], t[3]]) - ax.add_patch( - patches.Rectangle( - (t[0], t[1]), - t[2] - t[0], - t[3] - t[1], - alpha=0.5 - ) - ) - ax.set_xlim(min(xs) - 10, max(xs) + 10) - ax.set_ylim(min(ys) - 10, max(ys) + 10) + draw_text(table, ax) return ax.get_figure() @staticmethod @@ -255,18 +280,8 @@ class PlotMethods(): else: table_bbox = {table._bbox: None} - xs, ys = [], [] if not _FOR_LATTICE: - for t in table._text: - xs.extend([t[0], t[2]]) - ys.extend([t[1], t[3]]) - ax.add_patch( - patches.Rectangle( - (t[0], t[1]), t[2] - t[0], t[3] - t[1], - color="blue", - alpha=0.5 - ) - ) + draw_text(table, ax) for t in table_bbox.keys(): ax.add_patch( @@ -276,10 +291,8 @@ class PlotMethods(): ) ) if not _FOR_LATTICE: - xs.extend([t[0], t[2]]) - ys.extend([t[1], t[3]]) - ax.set_xlim(min(xs) - 10, max(xs) + 10) - ax.set_ylim(min(ys) - 10, max(ys) + 10) + extend_axe_lim(ax, t) + return ax.get_figure() @staticmethod @@ -297,19 +310,7 @@ class PlotMethods(): """ ax = prepare_plot(table, ax) - xs, ys = [], [] - for t in table._text: - xs.extend([t[0], t[2]]) - ys.extend([t[1], t[3]]) - ax.add_patch( - patches.Rectangle( - (t[0], t[1]), t[2] - t[0], t[3] - t[1], - color="blue", - alpha=0.2 - ) - ) - ax.set_xlim(min(xs) - 10, max(xs) + 10) - ax.set_ylim(min(ys) - 10, max(ys) + 10) + draw_text(table, ax) if table.flavor == "hybrid": for network in table.parse_details["network_searches"]: diff --git a/tests/files/baseline_plots/test_hybrid_contour_plot.png b/tests/files/baseline_plots/test_hybrid_contour_plot.png index 26d2b57..10b3b1d 100644 Binary files a/tests/files/baseline_plots/test_hybrid_contour_plot.png and b/tests/files/baseline_plots/test_hybrid_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_hybrid_table_areas_text_plot.png b/tests/files/baseline_plots/test_hybrid_table_areas_text_plot.png index 5e67f83..032b3e3 100644 Binary files a/tests/files/baseline_plots/test_hybrid_table_areas_text_plot.png and b/tests/files/baseline_plots/test_hybrid_table_areas_text_plot.png differ diff --git a/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png b/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png index 1fa67b2..17d81b5 100644 Binary files a/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png and b/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png differ diff --git a/tests/files/baseline_plots/test_hybrid_textedge_plot.png b/tests/files/baseline_plots/test_hybrid_textedge_plot.png index 080d4c6..ad209d9 100644 Binary files a/tests/files/baseline_plots/test_hybrid_textedge_plot.png and b/tests/files/baseline_plots/test_hybrid_textedge_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_contour_plot.png b/tests/files/baseline_plots/test_stream_contour_plot.png index 9cc7f36..452c388 100644 Binary files a/tests/files/baseline_plots/test_stream_contour_plot.png and b/tests/files/baseline_plots/test_stream_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_textedge_plot.png b/tests/files/baseline_plots/test_stream_textedge_plot.png index 8e6a10b..08f3fd6 100644 Binary files a/tests/files/baseline_plots/test_stream_textedge_plot.png and b/tests/files/baseline_plots/test_stream_textedge_plot.png differ diff --git a/tests/files/baseline_plots/test_text_plot.png b/tests/files/baseline_plots/test_text_plot.png index 497af37..3bf7a8c 100644 Binary files a/tests/files/baseline_plots/test_text_plot.png and b/tests/files/baseline_plots/test_text_plot.png differ