diff --git a/camelot/core.py b/camelot/core.py index 9921b95..f729e46 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -15,8 +15,6 @@ from .utils import ( get_index_closest_point, get_textline_coords, build_file_path_in_temp_dir, - compute_accuracy, - compute_whitespace, export_pdf_as_png ) @@ -141,9 +139,9 @@ class TextAlignments(object): def __init__(self, alignment_names): # For each possible alignment, list of tuples coordinate/textlines - self._textedges = {} + self._text_alignments = {} for alignment_name in alignment_names: - self._textedges[alignment_name] = [] + self._text_alignments[alignment_name] = [] @staticmethod def _create_new_text_alignment(coord, textline, align): @@ -156,12 +154,12 @@ class TextAlignments(object): """Updates an existing text edge in the current dict. """ coords = get_textline_coords(textline) - for alignment, edge_array in self._textedges.items(): - coord = coords[alignment] + for alignment_id, alignment_array in self._text_alignments.items(): + coord = coords[alignment_id] # Find the index of the closest existing element (or 0 if none) idx_closest = get_index_closest_point( - coord, edge_array, fn=lambda x: x.coord + coord, alignment_array, fn=lambda x: x.coord ) # Check if the edges before/after are close enough @@ -169,17 +167,25 @@ class TextAlignments(object): idx_insert = None if idx_closest is None: idx_insert = 0 - elif np.isclose(edge_array[idx_closest].coord, coord, atol=0.5): - self._update_edge(edge_array[idx_closest], coord, textline) - elif edge_array[idx_closest].coord < coord: + elif np.isclose( + alignment_array[idx_closest].coord, + coord, + atol=0.5 + ): + self._update_edge( + alignment_array[idx_closest], + coord, + textline + ) + elif alignment_array[idx_closest].coord < coord: idx_insert = idx_closest + 1 else: idx_insert = idx_closest if idx_insert is not None: - new_edge = self._create_new_text_alignment( - coord, textline, alignment + new_alignment = self._create_new_text_alignment( + coord, textline, alignment_id ) - edge_array.insert(idx_insert, new_edge) + alignment_array.insert(idx_insert, new_alignment) class TextEdges(TextAlignments): @@ -201,7 +207,7 @@ class TextEdges(TextAlignments): """Adds a new text edge to the current dict. 
""" te = self._create_new_text_alignment(coord, textline, align) - self._textedges[align].append(te) + self._text_alignments[align].append(te) def _update_edge(self, edge, coord, textline): edge.update_coords(coord, textline, self.edge_tol) @@ -221,15 +227,15 @@ class TextEdges(TextAlignments): """ intersections_sum = { "left": sum( - len(te.textlines) for te in self._textedges["left"] + len(te.textlines) for te in self._text_alignments["left"] if te.is_valid ), "right": sum( - len(te.textlines) for te in self._textedges["right"] + len(te.textlines) for te in self._text_alignments["right"] if te.is_valid ), "middle": sum( - len(te.textlines) for te in self._textedges["middle"] + len(te.textlines) for te in self._text_alignments["middle"] if te.is_valid ), } @@ -240,7 +246,7 @@ class TextEdges(TextAlignments): relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0] return list(filter( lambda te: te.is_valid, - self._textedges[relevant_align]) + self._text_alignments[relevant_align]) ) def get_table_areas(self, textlines, relevant_textedges): @@ -443,9 +449,9 @@ class Table(object): self.filename = None self.order = None self.page = None - self.flavor = None # Flavor of the parser that generated the table - self.pdf_size = None # Dimensions of the original PDF page - self.debug_info = None # Field holding debug data + self.flavor = None # Flavor of the parser used + self.pdf_size = None # Dimensions of the original PDF page + self.parse_details = None # Field holding debug data self._image = None self._image_path = None # Temporary file to hold an image of the pdf @@ -485,31 +491,6 @@ class Table(object): } return report - def record_parse_metadata(self, parser): - """Record data about the origin of the table - """ - self.flavor = parser.id - self.filename = parser.filename - self.debug_info = parser.debug_info - pos_errors = parser.compute_parse_errors(self) - self.accuracy = compute_accuracy([[100, pos_errors]]) - - if parser.copy_text is not None: - self.copy_spanning_text(parser.copy_text) - - data = self.data - self.df = pd.DataFrame(data) - self.shape = self.df.shape - - self.whitespace = compute_whitespace(data) - self.pdf_size = (parser.pdf_width, parser.pdf_height) - - _text = [] - _text.extend( - [(t.x0, t.y0, t.x1, t.y1) for t in parser.horizontal_text]) - _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.vertical_text]) - self._text = _text - def get_pdf_image(self): """Compute pdf image and cache it """ diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index e4b5071..6816b62 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -3,11 +3,18 @@ import os import warnings +import numpy as np +import pandas as pd + from ..utils import ( + bbox_from_str, + bbox_from_textlines, + compute_accuracy, + compute_whitespace, get_text_objects, get_table_index, text_in_bbox, - bbox_from_str, + text_in_bbox_per_axis, ) from ..core import Table @@ -42,7 +49,7 @@ class BaseParser(object): self.t_bbox = None # For plotting details of parsing algorithms - self.debug_info = {} if debug else None + self.parse_details = {} if debug else None def prepare_page_parse(self, filename, layout, dimensions, page_idx, layout_kwargs): @@ -63,9 +70,9 @@ class BaseParser(object): self.pdf_width, self.pdf_height = self.dimensions self.rootname, __ = os.path.splitext(self.filename) - if self.debug_info is not None: - self.debug_info["table_regions"] = self.table_regions - self.debug_info["table_areas"] = self.table_areas + if self.parse_details is not None: + 
self.parse_details["table_regions"] = self.table_regions + self.parse_details["table_areas"] = self.table_areas def _apply_regions_filter(self, textlines): """If regions have been specified, filter textlines to these regions. @@ -194,6 +201,31 @@ class BaseParser(object): return _tables + def record_parse_metadata(self, table): + """Record data about the origin of the table + """ + table.flavor = self.id + table.filename = self.filename + table.parse_details = self.parse_details + pos_errors = self.compute_parse_errors(table) + table.accuracy = compute_accuracy([[100, pos_errors]]) + + if self.copy_text is not None: + table.copy_spanning_text(self.copy_text) + + data = table.data + table.df = pd.DataFrame(data) + table.shape = table.df.shape + + table.whitespace = compute_whitespace(data) + table.pdf_size = (self.pdf_width, self.pdf_height) + + _text = [] + _text.extend( + [(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) + table._text = _text + class TextBaseParser(BaseParser): """Base class for all text parsers. @@ -211,15 +243,17 @@ class TextBaseParser(BaseParser): edge_tol=50, row_tol=2, column_tol=0, + debug=False, **kwargs ): super().__init__( - "stream", + parser_id, table_regions=table_regions, table_areas=table_areas, split_text=split_text, strip_text=strip_text, flag_size=flag_size, + debug=debug, ) self.columns = columns self._validate_columns() @@ -227,4 +261,271 @@ class TextBaseParser(BaseParser): self.row_tol = row_tol self.column_tol = column_tol - self.textedges = None + @staticmethod + def _group_rows(text, row_tol=2): + """Groups PDFMiner text objects into rows vertically + within a tolerance. + + Parameters + ---------- + text : list + List of PDFMiner text objects. + row_tol : int, optional (default: 2) + + Returns + ------- + rows : list + Two-dimensional list of text objects grouped into rows. + + """ + row_y = None + rows = [] + temp = [] + non_empty_text = [t for t in text if t.get_text().strip()] + for t in non_empty_text: + # is checking for upright necessary? + # if t.get_text().strip() and all([obj.upright \ + # for obj in t._objs + # if type(obj) is LTChar]): + if row_y is None: + row_y = t.y0 + elif not np.isclose(row_y, t.y0, atol=row_tol): + rows.append(sorted(temp, key=lambda t: t.x0)) + temp = [] + # We update the row's bottom as we go, to be forgiving if there + # is a gradual change across multiple columns. + row_y = t.y0 + temp.append(t) + rows.append(sorted(temp, key=lambda t: t.x0)) + return rows + + @staticmethod + def _merge_columns(l, column_tol=0): + """Merges column boundaries horizontally if they overlap + or lie within a tolerance. + + Parameters + ---------- + l : list + List of column x-coordinate tuples. + column_tol : int, optional (default: 0) + + Returns + ------- + merged : list + List of merged column x-coordinate tuples. 
+ + """ + merged = [] + for higher in l: + if not merged: + merged.append(higher) + else: + lower = merged[-1] + if column_tol >= 0: + if higher[0] <= lower[1] or np.isclose( + higher[0], lower[1], atol=column_tol + ): + upper_bound = max(lower[1], higher[1]) + lower_bound = min(lower[0], higher[0]) + merged[-1] = (lower_bound, upper_bound) + else: + merged.append(higher) + elif column_tol < 0: + if higher[0] <= lower[1]: + if np.isclose(higher[0], lower[1], + atol=abs(column_tol)): + merged.append(higher) + else: + upper_bound = max(lower[1], higher[1]) + lower_bound = min(lower[0], higher[0]) + merged[-1] = (lower_bound, upper_bound) + else: + merged.append(higher) + return merged + + @staticmethod + def _join_rows(rows_grouped, text_y_max, text_y_min): + """Makes row coordinates continuous. For the row to "touch" + we split the existing gap between them in half. + + Parameters + ---------- + rows_grouped : list + Two-dimensional list of text objects grouped into rows. + text_y_max : int + text_y_min : int + + Returns + ------- + rows : list + List of continuous row y-coordinate tuples. + + """ + row_boundaries = [ + [ + max(t.y1 for t in r), + min(t.y0 for t in r) + ] + for r in rows_grouped + ] + for i in range(0, len(row_boundaries)-1): + top_row = row_boundaries[i] + bottom_row = row_boundaries[i+1] + top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2 + row_boundaries[0][0] = text_y_max + row_boundaries[-1][1] = text_y_min + return row_boundaries + + @staticmethod + def _add_columns(cols, text, row_tol): + """Adds columns to existing list by taking into account + the text that lies outside the current column x-coordinates. + + Parameters + ---------- + cols : list + List of column x-coordinate tuples. + text : list + List of PDFMiner text objects. + ytol : int + + Returns + ------- + cols : list + Updated list of column x-coordinate tuples. + + """ + if text: + text = TextBaseParser._group_rows(text, row_tol=row_tol) + elements = [len(r) for r in text] + new_cols = [ + (t.x0, t.x1) + for r in text if len(r) == max(elements) + for t in r + ] + cols.extend(TextBaseParser._merge_columns(sorted(new_cols))) + return cols + + @staticmethod + def _join_columns(cols, text_x_min, text_x_max): + """Makes column coordinates continuous. + + Parameters + ---------- + cols : list + List of column x-coordinate tuples. + text_x_min : int + text_y_max : int + + Returns + ------- + cols : list + Updated list of column x-coordinate tuples. 
+ + """ + cols = sorted(cols) + cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] + cols.insert(0, text_x_min) + cols.append(text_x_max) + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + return cols + + def _validate_columns(self): + if self.table_areas is not None and self.columns is not None: + if len(self.table_areas) != len(self.columns): + raise ValueError("Length of table_areas and columns" + " should be equal") + + def _generate_columns_and_rows(self, bbox, table_idx): + # select elements which lie within table_bbox + self.t_bbox = text_in_bbox_per_axis( + bbox, + self.horizontal_text, + self.vertical_text + ) + + text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines( + self.t_bbox["horizontal"] + self.t_bbox["vertical"] + ) + rows_grouped = self._group_rows( + self.t_bbox["horizontal"], row_tol=self.row_tol) + rows = self._join_rows(rows_grouped, text_y_max, text_y_min) + elements = [len(r) for r in rows_grouped] + + if self.columns is not None and self.columns[table_idx] != "": + # user has to input boundary columns too + # take (0, pdf_width) by default + # similar to else condition + # len can't be 1 + cols = self.columns[table_idx].split(",") + cols = [float(c) for c in cols] + cols.insert(0, text_x_min) + cols.append(text_x_max) + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + else: + # calculate mode of the list of number of elements in + # each row to guess the number of columns + ncols = max(set(elements), key=elements.count) + if ncols == 1: + # if mode is 1, the page usually contains not tables + # but there can be cases where the list can be skewed, + # try to remove all 1s from list in this case and + # see if the list contains elements, if yes, then use + # the mode after removing 1s + elements = list(filter(lambda x: x != 1, elements)) + if elements: + ncols = max(set(elements), key=elements.count) + else: + warnings.warn( + "No tables found in table area {}" + .format(table_idx + 1) + ) + cols = [ + (t.x0, t.x1) + for r in rows_grouped + if len(r) == ncols + for t in r + ] + cols = self._merge_columns( + sorted(cols), + column_tol=self.column_tol + ) + inner_text = [] + for i in range(1, len(cols)): + left = cols[i - 1][1] + right = cols[i][0] + inner_text.extend( + [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > left and t.x1 < right + ] + ) + outer_text = [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > cols[-1][1] or t.x1 < cols[0][0] + ] + inner_text.extend(outer_text) + cols = self._add_columns(cols, inner_text, self.row_tol) + cols = self._join_columns(cols, text_x_min, text_x_max) + + return cols, rows, None, None + + def record_parse_metadata(self, table): + """Record data about the origin of the table + """ + super().record_parse_metadata(table) + # for plotting + table._bbox = self.table_bbox + table._segments = None + + def _generate_table(self, table_idx, cols, rows, **kwargs): + table = self._initialize_new_table(table_idx, cols, rows) + table = table.set_all_edges() + self.record_parse_metadata(table) + + return table diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index 898cfc0..bd98aed 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -5,7 +5,6 @@ from __future__ import division import numpy as np import copy -import warnings from .base import TextBaseParser from ..core import ( @@ -17,7 +16,6 @@ from ..core import ( from ..utils import ( bbox_from_str, text_in_bbox, 
- text_in_bbox_per_axis, bbox_from_textlines, distance_tl_to_bbox, find_columns_coordinates @@ -142,11 +140,11 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): class AlignmentCounter(object): """ - Represents all textlines aligned with a textline for each alignment. + For a given textline, represent all other textlines aligned with it. - A textline can be vertically aligned with others by having matching left, - right, or middle edge, and horizontally aligned by having matching top, - bottom, or center edge. + A textline can be vertically aligned with others if their bbox match on + left, right, or middle coord, and horizontally aligned if they match top, + bottom, or center coord. """ @@ -210,15 +208,15 @@ class AlignmentCounter(object): class TextNetworks(TextAlignments): - """Text elements connected via both vertical (top, bottom, middle) and - horizontal (left, right, and middle) alignments found on the PDF page. + """Text elements connected by vertical AND horizontal alignments. + The alignment dict has six keys based on the hor/vert alignments, and each key's value is a list of camelot.core.TextAlignment objects. """ def __init__(self): super().__init__(ALL_ALIGNMENTS) - # For each textline, dictionary "edge type" to + # For each textline, dictionary "alignment type" to # "number of textlines aligned" self._textlines_alignments = {} @@ -226,10 +224,10 @@ class TextNetworks(TextAlignments): edge.register_aligned_textline(textline, coord) def _register_all_text_lines(self, textlines): - """Add all textlines to our edge repository to + """Add all textlines to our network repository to identify alignments. """ - # Identify all the edge alignments + # Identify all the alignments for tl in textlines: if len(tl.get_text().strip()) > 0: self._register_textline(tl) @@ -237,7 +235,7 @@ class TextNetworks(TextAlignments): def _compute_alignment_counts(self): """Build a dictionary textline -> alignment object. """ - for align_id, textedges in self._textedges.items(): + for align_id, textedges in self._text_alignments.items(): for textedge in textedges: for textline in textedge.textlines: alignments = self._textlines_alignments.get( @@ -254,8 +252,8 @@ class TextNetworks(TextAlignments): the core table. """ h_gaps, v_gaps = [], [] - for align_id in self._textedges: - edge_array = self._textedges[align_id] + for align_id in self._text_alignments: + edge_array = self._text_alignments[align_id] gaps = [] vertical = align_id in HORIZONTAL_ALIGNMENTS sort_function = (lambda tl: tl.y0) \ @@ -299,7 +297,7 @@ class TextNetworks(TextAlignments): removed_singletons = True while removed_singletons: removed_singletons = False - for alignment_id, textalignments in self._textedges.items(): + for alignment_id, textalignments in self._text_alignments.items(): # For each alignment edge, remove items if they are singletons # either horizontally or vertically for ta in textalignments: @@ -313,7 +311,7 @@ class TextNetworks(TextAlignments): self._textlines_alignments = {} self._compute_alignment_counts() - def _most_connected_textline(self): + def most_connected_textline(self): """ Retrieve the textline that is most connected across vertical and horizontal axis. @@ -340,7 +338,7 @@ class TextNetworks(TextAlignments): # alignments across horizontal and vertical axis. # It will serve as a reference axis along which to collect the average # spacing between rows/cols. 
- most_aligned_tl = self._most_connected_textline() + most_aligned_tl = self.most_connected_textline() if most_aligned_tl is None: return None @@ -378,7 +376,7 @@ class TextNetworks(TextAlignments): ) return gaps_hv - def _build_bbox_candidate(self, gaps_hv, debug_info=None): + def _build_bbox_candidate(self, gaps_hv, parse_details=None): """ Seed the process with the textline with the highest alignment score, then expand the bbox with textlines within threshold. @@ -387,7 +385,7 @@ class TextNetworks(TextAlignments): gaps_hv : tuple The maximum distance allowed to consider surrounding lines/columns as part of the same table. - debug_info : array (optional) + parse_details : array (optional) Optional parameter array, in which to store extra information to help later visualization of the table creation. """ @@ -396,23 +394,23 @@ class TextNetworks(TextAlignments): # It will serve both as a starting point for the table boundary # search, and as a way to estimate the average spacing between # rows/cols. - most_aligned_tl = self._most_connected_textline() + most_aligned_tl = self.most_connected_textline() # Calculate the 75th percentile of the horizontal/vertical # gaps between textlines. Use this as a reference for a threshold # to not exceed while looking for table boundaries. max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1] - if debug_info is not None: + if parse_details is not None: # Store debug info - debug_info_search = { + parse_details_search = { "max_h_gap": max_h_gap, "max_v_gap": max_v_gap, "iterations": [] } - debug_info.append(debug_info_search) + parse_details.append(parse_details_search) else: - debug_info_search = None + parse_details_search = None MINIMUM_TEXTLINES_IN_TABLE = 6 bbox = (most_aligned_tl.x0, most_aligned_tl.y0, @@ -426,9 +424,9 @@ class TextNetworks(TextAlignments): tls_in_bbox = [most_aligned_tl] last_bbox = None while last_bbox != bbox: - if debug_info_search is not None: + if parse_details_search is not None: # Store debug info - debug_info_search["iterations"].append(bbox) + parse_details_search["iterations"].append(bbox) last_bbox = bbox # Go through all remaining textlines, expand our bbox @@ -461,35 +459,6 @@ class TextNetworks(TextAlignments): self._register_all_text_lines(textlines) self._compute_alignment_counts() - def plot_alignments(self, ax): - """Displays a visualization of the alignments as currently computed. - """ - # FRHTODO: This is too busy and doesn't plot lines - most_aligned_tl = sorted( - self._textlines_alignments.keys(), - key=lambda textline: - self._textlines_alignments[textline].alignment_score(), - reverse=True - )[0] - - ax.add_patch( - patches.Rectangle( - (most_aligned_tl.x0, most_aligned_tl.y0), - most_aligned_tl.x1 - most_aligned_tl.x0, - most_aligned_tl.y1 - most_aligned_tl.y0, - color="red", - alpha=0.5 - ) - ) - for tl, alignments in self._textlines_alignments.items(): - ax.text( - tl.x0 - 5, - tl.y0 - 5, - f"{alignments.max_h_count()}x{alignments.max_v_count()}", - fontsize=5, - color="black" - ) - class Hybrid(TextBaseParser): """Hybrid method of parsing looks for spaces between text @@ -555,190 +524,9 @@ class Hybrid(TextBaseParser): edge_tol=edge_tol, row_tol=row_tol, column_tol=column_tol, + debug=debug, ) - # FRHTODO: Check if needed, refactor with Stream - @staticmethod - def _group_rows(text, row_tol=2): - """Groups PDFMiner text objects into rows vertically - within a tolerance. - - Parameters - ---------- - text : list - List of PDFMiner text objects. 
- row_tol : int, optional (default: 2) - - Returns - ------- - rows : list - Two-dimensional list of text objects grouped into rows. - - """ - row_y = None - rows = [] - temp = [] - non_empty_text = [t for t in text if t.get_text().strip()] - for t in non_empty_text: - # is checking for upright necessary? - # if t.get_text().strip() and all([obj.upright \ - # for obj in t._objs - # if type(obj) is LTChar]): - if row_y is None: - row_y = t.y0 - elif not np.isclose(row_y, t.y0, atol=row_tol): - rows.append(sorted(temp, key=lambda t: t.x0)) - temp = [] - # We update the row's bottom as we go, to be forgiving if there - # is a gradual change across multiple columns. - row_y = t.y0 - temp.append(t) - rows.append(sorted(temp, key=lambda t: t.x0)) - return rows - - # FRHTODO: Check if needed, refactor with Stream - @staticmethod - def _merge_columns(l, column_tol=0): - """Merges column boundaries horizontally if they overlap - or lie within a tolerance. - - Parameters - ---------- - l : list - List of column x-coordinate tuples. - column_tol : int, optional (default: 0) - - Returns - ------- - merged : list - List of merged column x-coordinate tuples. - - """ - merged = [] - for higher in l: - if not merged: - merged.append(higher) - else: - lower = merged[-1] - if column_tol >= 0: - if higher[0] <= lower[1] or np.isclose( - higher[0], lower[1], atol=column_tol - ): - upper_bound = max(lower[1], higher[1]) - lower_bound = min(lower[0], higher[0]) - merged[-1] = (lower_bound, upper_bound) - else: - merged.append(higher) - elif column_tol < 0: - if higher[0] <= lower[1]: - if np.isclose(higher[0], lower[1], - atol=abs(column_tol)): - merged.append(higher) - else: - upper_bound = max(lower[1], higher[1]) - lower_bound = min(lower[0], higher[0]) - merged[-1] = (lower_bound, upper_bound) - else: - merged.append(higher) - return merged - - # FRHTODO: Check if needed, refactor with Stream - @staticmethod - def _join_rows(rows_grouped, text_y_max, text_y_min): - """Makes row coordinates continuous. For the row to "touch" - we split the existing gap between them in half. - - Parameters - ---------- - rows_grouped : list - Two-dimensional list of text objects grouped into rows. - text_y_max : int - text_y_min : int - - Returns - ------- - rows : list - List of continuous row y-coordinate tuples. - - """ - row_boundaries = [ - [ - max(t.y1 for t in r), - min(t.y0 for t in r) - ] - for r in rows_grouped - ] - for i in range(0, len(row_boundaries)-1): - top_row = row_boundaries[i] - bottom_row = row_boundaries[i+1] - top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2 - row_boundaries[0][0] = text_y_max - row_boundaries[-1][1] = text_y_min - return row_boundaries - - # FRHTODO: Check if needed, refactor with Stream - @staticmethod - def _add_columns(cols, text, row_tol): - """Add columns to existing list by taking into account - the text that lies outside the current column x-coordinates. - - Parameters - ---------- - cols : list - List of column x-coordinate tuples. - text : list - List of PDFMiner text objects. - ytol : int - - Returns - ------- - cols : list - Updated list of column x-coordinate tuples. 
- - """ - if text: - text = Hybrid._group_rows(text, row_tol=row_tol) - elements = [len(r) for r in text] - new_cols = [ - (t.x0, t.x1) - for r in text if len(r) == max(elements) - for t in r - ] - cols.extend(Hybrid._merge_columns(sorted(new_cols))) - return cols - - # FRHTODO: Check if needed, refactor with Stream - @staticmethod - def _join_columns(cols, text_x_min, text_x_max): - """Makes column coordinates continuous. - - Parameters - ---------- - cols : list - List of column x-coordinate tuples. - text_x_min : int - text_y_max : int - - Returns - ------- - cols : list - Updated list of column x-coordinate tuples. - - """ - cols = sorted(cols) - cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] - cols.insert(0, text_x_min) - cols.append(text_x_max) - cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] - return cols - - # FRHTODO: Check is needed, refactor with Stream - def _validate_columns(self): - if self.table_areas is not None and self.columns is not None: - if len(self.table_areas) != len(self.columns): - raise ValueError("Length of table_areas and columns" - " should be equal") - def _generate_table_bbox(self): if self.table_areas is not None: table_bbox = {} @@ -756,25 +544,21 @@ class Hybrid(TextBaseParser): textlines_processed = {} self.table_bbox = {} - if self.debug_info is not None: - debug_info_edges_searches = [] - self.debug_info["edges_searches"] = debug_info_edges_searches - debug_info_bboxes_searches = [] - self.debug_info["bboxes_searches"] = debug_info_bboxes_searches + if self.parse_details is not None: + parse_details_network_searches = [] + self.parse_details["network_searches"] = \ + parse_details_network_searches + parse_details_bbox_searches = [] + self.parse_details["bbox_searches"] = parse_details_bbox_searches else: - debug_info_edges_searches = None - debug_info_bboxes_searches = None + parse_details_network_searches = None + parse_details_bbox_searches = None while True: - self.textedges = TextNetworks() - self.textedges.generate(textlines) - self.textedges._remove_unconnected_edges() - if debug_info_edges_searches is not None: - # Preserve the current edge calculation for display debugging - debug_info_edges_searches.append( - copy.deepcopy(self.textedges) - ) - gaps_hv = self.textedges._compute_plausible_gaps() + text_network = TextNetworks() + text_network.generate(textlines) + text_network._remove_unconnected_edges() + gaps_hv = text_network._compute_plausible_gaps() if gaps_hv is None: return None # edge_tol instructions override the calculated vertical gap @@ -782,13 +566,19 @@ class Hybrid(TextBaseParser): gaps_hv[0], gaps_hv[1] if self.edge_tol is None else self.edge_tol ) - bbox = self.textedges._build_bbox_candidate( + bbox = text_network._build_bbox_candidate( edge_tol_hv, - debug_info=debug_info_bboxes_searches + parse_details=parse_details_bbox_searches ) if bbox is None: break + if parse_details_network_searches is not None: + # Preserve the current edge calculation for display debugging + parse_details_network_searches.append( + copy.deepcopy(text_network) + ) + # Get all the textlines that are at least 50% in the box tls_in_bbox = text_in_bbox(bbox, textlines) @@ -808,10 +598,10 @@ class Hybrid(TextBaseParser): gaps_hv[1] ) - if self.debug_info is not None: - if "col_searches" not in self.debug_info: - self.debug_info["col_searches"] = [] - self.debug_info["col_searches"].append({ + if self.parse_details is not None: + if "col_searches" not in self.parse_details: + self.parse_details["col_searches"] = [] 
+ self.parse_details["col_searches"].append({ "core_bbox": bbox, "cols_anchors": cols_anchors, "expanded_bbox": expanded_bbox @@ -826,95 +616,3 @@ class Hybrid(TextBaseParser): lambda tl: tl not in textlines_processed, textlines )) - - # FRHTODO: Check is needed, refactor with Stream - def _generate_columns_and_rows(self, bbox, table_idx): - # select elements which lie within table_bbox - self.t_bbox = text_in_bbox_per_axis( - bbox, - self.horizontal_text, - self.vertical_text - ) - - text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines( - self.t_bbox["horizontal"] + self.t_bbox["vertical"] - ) - rows_grouped = self._group_rows( - self.t_bbox["horizontal"], row_tol=self.row_tol) - rows = self._join_rows(rows_grouped, text_y_max, text_y_min) - elements = [len(r) for r in rows_grouped] - - if self.columns is not None and self.columns[table_idx] != "": - # user has to input boundary columns too - # take (0, pdf_width) by default - # similar to else condition - # len can't be 1 - cols = self.columns[table_idx].split(",") - cols = [float(c) for c in cols] - cols.insert(0, text_x_min) - cols.append(text_x_max) - cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] - else: - # calculate mode of the list of number of elements in - # each row to guess the number of columns - ncols = max(set(elements), key=elements.count) - if ncols == 1: - # if mode is 1, the page usually contains not tables - # but there can be cases where the list can be skewed, - # try to remove all 1s from list in this case and - # see if the list contains elements, if yes, then use - # the mode after removing 1s - elements = list(filter(lambda x: x != 1, elements)) - if elements: - ncols = max(set(elements), key=elements.count) - else: - warnings.warn( - "No tables found in table area {}" - .format(table_idx + 1) - ) - cols = [ - (t.x0, t.x1) - for r in rows_grouped - if len(r) == ncols - for t in r - ] - cols = self._merge_columns( - sorted(cols), - column_tol=self.column_tol - ) - inner_text = [] - for i in range(1, len(cols)): - left = cols[i - 1][1] - right = cols[i][0] - inner_text.extend( - [ - t - for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > left and t.x1 < right - ] - ) - outer_text = [ - t - for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > cols[-1][1] or t.x1 < cols[0][0] - ] - inner_text.extend(outer_text) - cols = self._add_columns(cols, inner_text, self.row_tol) - cols = self._join_columns(cols, text_x_min, text_x_max) - - return cols, rows, None, None - - # FRHTODO: Check is needed, refactor with Stream - def _generate_table(self, table_idx, cols, rows, **kwargs): - table = self._initialize_new_table(table_idx, cols, rows) - table = table.set_all_edges() - table.record_parse_metadata(self) - - # for plotting - table._bbox = self.table_bbox - table._segments = None - table._textedges = self.textedges - - return table diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index d6ba65d..b8b82ed 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -168,6 +168,15 @@ class Lattice(BaseParser): indices.append((r_idx, c_idx, text)) return indices + def record_parse_metadata(self, table): + """Record data about the origin of the table + """ + super().record_parse_metadata(table) + # for plotting + table._image = self.pdf_image # Reuse the image used for calc + table._bbox_unscaled = self.table_bbox_unscaled + table._segments = (self.vertical_segments, self.horizontal_segments) + def 
_generate_table_bbox(self): def scale_areas(areas): scaled_areas = [] @@ -293,12 +302,5 @@ class Lattice(BaseParser): # set spanning cells to True table = table.set_span() - table.record_parse_metadata(self) - - # for plotting - table._image = self.pdf_image # Reuse the image used for calc - table._bbox_unscaled = self.table_bbox_unscaled - table._segments = (self.vertical_segments, self.horizontal_segments) - table._textedges = None - + self.record_parse_metadata(table) return table diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 91e2fde..8b72e09 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -1,17 +1,12 @@ # -*- coding: utf-8 -*- from __future__ import division -import warnings - -import numpy as np from .base import TextBaseParser from ..core import TextEdges from ..utils import ( bbox_from_str, - bbox_from_textlines, - text_in_bbox, - text_in_bbox_per_axis + text_in_bbox ) @@ -79,182 +74,7 @@ class Stream(TextBaseParser): row_tol=row_tol, column_tol=column_tol, ) - - @staticmethod - def _group_rows(text, row_tol=2): - """Groups PDFMiner text objects into rows vertically - within a tolerance. - - Parameters - ---------- - text : list - List of PDFMiner text objects. - row_tol : int, optional (default: 2) - - Returns - ------- - rows : list - Two-dimensional list of text objects grouped into rows. - - """ - row_y = None - rows = [] - temp = [] - non_empty_text = [t for t in text if t.get_text().strip()] - for t in non_empty_text: - # is checking for upright necessary? - # if t.get_text().strip() and all([obj.upright \ - # for obj in t._objs - # if type(obj) is LTChar]): - if row_y is None: - row_y = t.y0 - elif not np.isclose(row_y, t.y0, atol=row_tol): - rows.append(sorted(temp, key=lambda t: t.x0)) - temp = [] - # We update the row's bottom as we go, to be forgiving if there - # is a gradual change across multiple columns. - row_y = t.y0 - temp.append(t) - rows.append(sorted(temp, key=lambda t: t.x0)) - return rows - - @staticmethod - def _merge_columns(l, column_tol=0): - """Merges column boundaries horizontally if they overlap - or lie within a tolerance. - - Parameters - ---------- - l : list - List of column x-coordinate tuples. - column_tol : int, optional (default: 0) - - Returns - ------- - merged : list - List of merged column x-coordinate tuples. - - """ - merged = [] - for higher in l: - if not merged: - merged.append(higher) - else: - lower = merged[-1] - if column_tol >= 0: - if higher[0] <= lower[1] or np.isclose( - higher[0], lower[1], atol=column_tol - ): - upper_bound = max(lower[1], higher[1]) - lower_bound = min(lower[0], higher[0]) - merged[-1] = (lower_bound, upper_bound) - else: - merged.append(higher) - elif column_tol < 0: - if higher[0] <= lower[1]: - if np.isclose(higher[0], lower[1], - atol=abs(column_tol)): - merged.append(higher) - else: - upper_bound = max(lower[1], higher[1]) - lower_bound = min(lower[0], higher[0]) - merged[-1] = (lower_bound, upper_bound) - else: - merged.append(higher) - return merged - - @staticmethod - def _join_rows(rows_grouped, text_y_max, text_y_min): - """Makes row coordinates continuous. For the row to "touch" - we split the existing gap between them in half. - - Parameters - ---------- - rows_grouped : list - Two-dimensional list of text objects grouped into rows. - text_y_max : int - text_y_min : int - - Returns - ------- - rows : list - List of continuous row y-coordinate tuples. 
- - """ - row_boundaries = [ - [ - max(t.y1 for t in r), - min(t.y0 for t in r) - ] - for r in rows_grouped - ] - for i in range(0, len(row_boundaries)-1): - top_row = row_boundaries[i] - bottom_row = row_boundaries[i+1] - top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2 - row_boundaries[0][0] = text_y_max - row_boundaries[-1][1] = text_y_min - return row_boundaries - - @staticmethod - def _add_columns(cols, text, row_tol): - """Adds columns to existing list by taking into account - the text that lies outside the current column x-coordinates. - - Parameters - ---------- - cols : list - List of column x-coordinate tuples. - text : list - List of PDFMiner text objects. - ytol : int - - Returns - ------- - cols : list - Updated list of column x-coordinate tuples. - - """ - if text: - text = Stream._group_rows(text, row_tol=row_tol) - elements = [len(r) for r in text] - new_cols = [ - (t.x0, t.x1) - for r in text if len(r) == max(elements) - for t in r - ] - cols.extend(Stream._merge_columns(sorted(new_cols))) - return cols - - @staticmethod - def _join_columns(cols, text_x_min, text_x_max): - """Makes column coordinates continuous. - - Parameters - ---------- - cols : list - List of column x-coordinate tuples. - text_x_min : int - text_y_max : int - - Returns - ------- - cols : list - Updated list of column x-coordinate tuples. - - """ - cols = sorted(cols) - cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] - cols.insert(0, text_x_min) - cols.append(text_x_max) - cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] - return cols - - def _validate_columns(self): - if self.table_areas is not None and self.columns is not None: - if len(self.table_areas) != len(self.columns): - raise ValueError("Length of table_areas and columns" - " should be equal") + self.textedges = [] def _nurminen_table_detection(self, textlines): """A general implementation of the table detection algorithm @@ -281,8 +101,13 @@ class Stream(TextBaseParser): return table_bbox + def record_parse_metadata(self, table): + """Record data about the origin of the table + """ + super().record_parse_metadata(table) + table._textedges = self.textedges + def _generate_table_bbox(self): - self.textedges = [] if self.table_areas is None: hor_text = self.horizontal_text if self.table_regions is not None: @@ -300,93 +125,3 @@ class Stream(TextBaseParser): for area_str in self.table_areas: table_bbox[bbox_from_str(area_str)] = None self.table_bbox = table_bbox - - def _generate_columns_and_rows(self, bbox, table_idx): - # select elements which lie within table_bbox - self.t_bbox = text_in_bbox_per_axis( - bbox, - self.horizontal_text, - self.vertical_text - ) - - text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines( - self.t_bbox["horizontal"] + self.t_bbox["vertical"] - ) - rows_grouped = self._group_rows( - self.t_bbox["horizontal"], row_tol=self.row_tol) - rows = self._join_rows(rows_grouped, text_y_max, text_y_min) - elements = [len(r) for r in rows_grouped] - - if self.columns is not None and self.columns[table_idx] != "": - # user has to input boundary columns too - # take (0, pdf_width) by default - # similar to else condition - # len can't be 1 - cols = self.columns[table_idx].split(",") - cols = [float(c) for c in cols] - cols.insert(0, text_x_min) - cols.append(text_x_max) - cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] - else: - # calculate mode of the list of number of elements in - # each row to guess the number of columns - ncols = 
max(set(elements), key=elements.count) - if ncols == 1: - # if mode is 1, the page usually contains not tables - # but there can be cases where the list can be skewed, - # try to remove all 1s from list in this case and - # see if the list contains elements, if yes, then use - # the mode after removing 1s - elements = list(filter(lambda x: x != 1, elements)) - if elements: - ncols = max(set(elements), key=elements.count) - else: - warnings.warn( - "No tables found in table area {}" - .format(table_idx + 1) - ) - cols = [ - (t.x0, t.x1) - for r in rows_grouped - if len(r) == ncols - for t in r - ] - cols = self._merge_columns( - sorted(cols), - column_tol=self.column_tol - ) - inner_text = [] - for i in range(1, len(cols)): - left = cols[i - 1][1] - right = cols[i][0] - inner_text.extend( - [ - t - for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > left and t.x1 < right - ] - ) - outer_text = [ - t - for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > cols[-1][1] or t.x1 < cols[0][0] - ] - inner_text.extend(outer_text) - cols = self._add_columns(cols, inner_text, self.row_tol) - cols = self._join_columns(cols, text_x_min, text_x_max) - - return cols, rows, None, None - - def _generate_table(self, table_idx, cols, rows, **kwargs): - table = self._initialize_new_table(table_idx, cols, rows) - table = table.set_all_edges() - table.record_parse_metadata(self) - - # for plotting - table._bbox = self.table_bbox - table._segments = None - table._textedges = self.textedges - - return table diff --git a/camelot/plotting.py b/camelot/plotting.py index 2ae2713..a94c1bb 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -87,9 +87,9 @@ def draw_parse_constraints(table, ax): ax : matplotlib.axes.Axes """ - if table.debug_info: + if table.parse_details: # Display a bbox per region - for region_str in table.debug_info["table_regions"] or []: + for region_str in table.parse_details["table_regions"] or []: draw_labeled_bbox( ax, bbox_from_str(region_str), "region: ({region_str})".format(region_str=region_str), @@ -99,7 +99,7 @@ def draw_parse_constraints(table, ax): label_pos="bottom,right" ) # Display a bbox per area - for area_str in table.debug_info["table_areas"] or []: + for area_str in table.parse_details["table_areas"] or []: draw_labeled_bbox( ax, bbox_from_str(area_str), "area: ({area_str})".format(area_str=area_str), @@ -294,8 +294,27 @@ class PlotMethods(object): ax.set_ylim(min(ys) - 10, max(ys) + 10) if table.flavor == "hybrid": - # FRHTODO: Clean this up - table.debug_info["edges_searches"][0].plot_alignments(ax) + for text_network in table.parse_details["network_searches"]: + # FRHTODO: This is too busy and doesn't plot lines + most_connected_tl = text_network.most_connected_textline() + + ax.add_patch( + patches.Rectangle( + (most_connected_tl.x0, most_connected_tl.y0), + most_connected_tl.x1 - most_connected_tl.x0, + most_connected_tl.y1 - most_connected_tl.y0, + color="red", + alpha=0.5 + ) + ) + for tl, alignments in text_network._textlines_alignments.items(): + ax.text( + tl.x0 - 5, + tl.y0 - 5, + f"{alignments.max_h_count()}x{alignments.max_v_count()}", + fontsize=5, + color="black" + ) else: for te in table._textedges: ax.plot([te.coord, te.coord], [te.y0, te.y1]) @@ -372,10 +391,10 @@ class PlotMethods(object): draw_pdf(table, ax) draw_parse_constraints(table, ax) - if table.debug_info is None: + if table.parse_details is None: return fig - debug_info = table.debug_info - for box_id, bbox_search in 
enumerate(debug_info["bboxes_searches"]): + parse_details = table.parse_details + for box_id, bbox_search in enumerate(parse_details["bbox_searches"]): max_h_gap = bbox_search["max_h_gap"] max_v_gap = bbox_search["max_v_gap"] iterations = bbox_search["iterations"] @@ -403,7 +422,7 @@ class PlotMethods(object): ) ) - for box_id, col_search in enumerate(debug_info["col_searches"]): + for box_id, col_search in enumerate(parse_details["col_searches"]): draw_labeled_bbox( ax, col_search["expanded_bbox"], "box body + header #{box_id}".format( @@ -422,10 +441,5 @@ class PlotMethods(object): linewidth=2, label_pos="bottom,left" ) - # self.debug_info["col_searches"].append({ - # "core_bbox": bbox, - # "cols_anchors": cols_anchors, - # "expanded_bbox": expanded_bbox - # }) return fig diff --git a/tests/files/baseline_plots/test_hybrid_contour_plot.png b/tests/files/baseline_plots/test_hybrid_contour_plot.png index 26d2b57..524d5e0 100644 Binary files a/tests/files/baseline_plots/test_hybrid_contour_plot.png and b/tests/files/baseline_plots/test_hybrid_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_hybrid_textedge_plot.png b/tests/files/baseline_plots/test_hybrid_textedge_plot.png index fc9496b..6b44d48 100644 Binary files a/tests/files/baseline_plots/test_hybrid_textedge_plot.png and b/tests/files/baseline_plots/test_hybrid_textedge_plot.png differ