diff --git a/camelot/core.py b/camelot/core.py index 7405fd1..1c3d268 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -12,6 +12,7 @@ import pandas as pd from cv2 import cv2 from .utils import ( + get_textline_coords, build_file_path_in_temp_dir, compute_accuracy, compute_whitespace, @@ -98,11 +99,8 @@ class TextEdges(object): """Returns the x coordinate of a text row based on the specified alignment. """ - x_left = textline.x0 - x_right = textline.x1 - x_middle = x_left + (x_right - x_left) / 2.0 - x_coord = {"left": x_left, "middle": x_middle, "right": x_right} - return x_coord[align] + coords = get_textline_coords(textline) + return coords[align] def find(self, x_coord, align): """Returns the index of an existing text edge using diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index 32cc718..2d6f6f5 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +"""Implementation of hybrid table parser.""" + from __future__ import division import numpy as np @@ -7,6 +9,7 @@ import warnings from .base import BaseParser from ..utils import ( + get_textline_coords, bbox_from_str, text_in_bbox, text_in_bbox_per_axis, @@ -17,30 +20,26 @@ from ..utils import ( from matplotlib import patches as patches -# FRHTODO: Move to utils # maximum number of columns over which a header can spread MAX_COL_SPREAD_IN_HEADER = 3 -def todo_move_me_expand_area_for_header(area, textlines, col_anchors, - max_v_gap): - """The core algorithm is based on fairly strict alignment of text. - It works ok for the table body, but might fail on tables' headers - since they tend to be in a different font, alignment (e.g. vertical), - etc. - The section below tries to identify whether what's above the bbox - identified so far has the characteristics of a table header: - Close to the top of the body, with cells that fit within the bounds - identified. 
+def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): + """Expand a bbox vertically up by looking for plausible headers. + + The core algorithm is based on fairly strict alignment of text. It works + for the table body, but might fail on tables' headers since they tend to be + in a different font, alignment (e.g. vertical), etc. + This method evaluates the area above the table body's bbox for + characteristics of a table header: close to the top of the body, with cells + that fit within the horizontal bounds identified. """ - new_area = area - (left, bottom, right, top) = area + new_bbox = body_bbox + (left, bottom, right, top) = body_bbox zones = [] def column_spread(left, right, col_anchors): - """Returns the number of columns (splits on the x-axis) - crossed by an element covering left to right. - """ + """Get the number of columns crossed by a segment [left, right].""" indexLeft = 0 while indexLeft < len(col_anchors) \ and col_anchors[indexLeft] < left: @@ -55,7 +54,7 @@ def todo_move_me_expand_area_for_header(area, textlines, col_anchors, keep_searching = True while keep_searching: keep_searching = False - # a/ first look for the closest text element above the area. + # a/ first look for the closest text element above the bbox. # It will be the anchor for a possible new row. closest_above = None all_above = [] @@ -128,18 +127,18 @@ def todo_move_me_expand_area_for_header(area, textlines, col_anchors, # 1: # 2: # if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS: - new_area = (left, bottom, right, top) + new_bbox = (left, bottom, right, top) # At this stage we've identified a plausible row (or the # beginning of one). keep_searching = True - - return new_area + return new_bbox class TextEdge2(object): - """Defines a text edge coordinates relative to a left-bottom - origin. (PDF coordinate space) + """Text edge coordinates relative to a left-bottom origin. 
+ + (PDF coordinate space) Parameters ---------- @@ -167,8 +166,7 @@ class TextEdge2(object): f"textlines text='{text_inside}...'>" def register_aligned_textline(self, textline, coord): - """Updates new textline to this alignment, adapting its average. - """ + """Update new textline to this alignment, adapting its average.""" # Increase the intersections for this segment, expand it up, # and adjust the x based on the new value self.coord = (self.coord * len(self.textlines) + coord) / \ @@ -177,8 +175,13 @@ class TextEdge2(object): class Alignments(object): - """Represents the number of other textlines aligned with this - one across each edge. + """ + Represent the number of textlines aligned with this one across each edge. + + A cell can be vertically aligned with others by having matching left, + right, or middle edge, and horizontally aligned by having matching top, + bottom, or center edge. + """ def __init__(self): @@ -261,20 +264,6 @@ class TextEdges2(object): self.max_rows = None self.max_cols = None - @staticmethod - def get_textline_coords(textline): - """Calculate the coordinates of each alignment - for a given textline. - """ - return { - "left": textline.x0, - "right": textline.x1, - "middle": (textline.x0 + textline.x1) / 2.0, - "bottom": textline.y0, - "top": textline.y1, - "center": (textline.y0 + textline.y1) / 2.0, - } - # FRHTODO: Move to utils and use generic name @staticmethod def _get_index_closest_point(coord, edge_array): @@ -328,7 +317,7 @@ class TextEdges2(object): def _register_textline(self, textline): """Updates an existing text edge in the current dict. """ - coords = TextEdges2.get_textline_coords(textline) + coords = get_textline_coords(textline) for alignment in self._textedges: edge_array = self._textedges[alignment] coord = coords[alignment] @@ -490,7 +479,7 @@ class TextEdges2(object): # It will serve as a reference axis along which to collect the average # spacing between rows/cols. 
most_aligned_tl = self._most_connected_textline() - most_aligned_coords = TextEdges2.get_textline_coords( + most_aligned_coords = get_textline_coords( most_aligned_tl) # Retrieve the list of textlines it's aligned with, across both @@ -871,7 +860,7 @@ class Hybrid(BaseParser): # FRHTODO: Check if needed, refactor with Stream @staticmethod def _add_columns(cols, text, row_tol): - """Adds columns to existing list by taking into account + """Add columns to existing list by taking into account the text that lies outside the current column x-coordinates. Parameters @@ -993,7 +982,7 @@ class Hybrid(BaseParser): # Apply a heuristic to salvage headers which formatting might be # off compared to the rest of the table. - expanded_bbox = todo_move_me_expand_area_for_header( + expanded_bbox = search_header_from_body_bbox( bbox, textlines, cols_anchors, diff --git a/camelot/utils.py b/camelot/utils.py index dcd0e4e..4865afa 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -389,8 +389,21 @@ def segments_in_bbox(bbox, v_segments, h_segments): return v_s, h_s +def get_textline_coords(textline): + """Calculate the coordinates of each alignment for a given textline. + """ + return { + "left": textline.x0, + "right": textline.x1, + "middle": (textline.x0 + textline.x1) / 2.0, + "bottom": textline.y0, + "top": textline.y1, + "center": (textline.y0 + textline.y1) / 2.0, + } + + def bbox_from_str(bbox_str): - """Deserialize bbox from string form "x1,y1,x2,y2" to tuple (x1, y1, x2, y2) + """Deserialize bbox from string ("x1,y1,x2,y2") to tuple (x1, y1, x2, y2). Parameters ----------