Add fix to include table headers
parent
a1e1fd781d
commit
0251422e33
|
|
@ -15,8 +15,7 @@ import pandas as pd
|
||||||
TEXTEDGE_REQUIRED_ELEMENTS = 4
|
TEXTEDGE_REQUIRED_ELEMENTS = 4
|
||||||
# y coordinate tolerance for extending textedge
|
# y coordinate tolerance for extending textedge
|
||||||
TEXTEDGE_EXTEND_TOLERANCE = 50
|
TEXTEDGE_EXTEND_TOLERANCE = 50
|
||||||
# TODO: deal in percentages instead of absolutes
|
# padding added to table area on the left, right and bottom
|
||||||
# padding added to table area's lt and rb
|
|
||||||
TABLE_AREA_PADDING = 10
|
TABLE_AREA_PADDING = 10
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -79,8 +78,7 @@ class TextEdges(object):
|
||||||
self._textedges[align][idx].update_coords(x_coord, textline.y0)
|
self._textedges[align][idx].update_coords(x_coord, textline.y0)
|
||||||
|
|
||||||
def generate(self, textlines):
|
def generate(self, textlines):
|
||||||
textlines_flat = list(chain.from_iterable(textlines))
|
for tl in textlines:
|
||||||
for tl in textlines_flat:
|
|
||||||
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
||||||
self.update(tl)
|
self.update(tl)
|
||||||
|
|
||||||
|
|
@ -98,13 +96,12 @@ class TextEdges(object):
|
||||||
return self._textedges[relevant_align]
|
return self._textedges[relevant_align]
|
||||||
|
|
||||||
def get_table_areas(self, textlines, relevant_textedges):
|
def get_table_areas(self, textlines, relevant_textedges):
|
||||||
def pad(area):
|
def pad(area, average_row_height):
|
||||||
x0 = area[0] - TABLE_AREA_PADDING
|
x0 = area[0] - TABLE_AREA_PADDING
|
||||||
y0 = area[1] - TABLE_AREA_PADDING
|
y0 = area[1] - TABLE_AREA_PADDING
|
||||||
x1 = area[2] + TABLE_AREA_PADDING
|
x1 = area[2] + TABLE_AREA_PADDING
|
||||||
# TODO: deal in percentages instead of absolutes
|
# add a constant since table headers can be relatively up
|
||||||
# add a constant to include table headers
|
y1 = area[3] + average_row_height * 5
|
||||||
y1 = area[3] + TABLE_AREA_PADDING + 10
|
|
||||||
return (x0, y0, x1, y1)
|
return (x0, y0, x1, y1)
|
||||||
|
|
||||||
# sort relevant textedges in reading order
|
# sort relevant textedges in reading order
|
||||||
|
|
@ -136,7 +133,9 @@ class TextEdges(object):
|
||||||
# chars/words/sentences are often aligned differently.
|
# chars/words/sentences are often aligned differently.
|
||||||
# drawback: table areas that have paragraphs on their sides
|
# drawback: table areas that have paragraphs on their sides
|
||||||
# will include the paragraphs too.
|
# will include the paragraphs too.
|
||||||
|
sum_textline_height = 0
|
||||||
for tl in textlines:
|
for tl in textlines:
|
||||||
|
sum_textline_height += tl.y1 - tl.y0
|
||||||
found = None
|
found = None
|
||||||
for area in table_areas:
|
for area in table_areas:
|
||||||
# check for overlap
|
# check for overlap
|
||||||
|
|
@ -148,11 +147,12 @@ class TextEdges(object):
|
||||||
updated_area = (
|
updated_area = (
|
||||||
min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1))
|
min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1))
|
||||||
table_areas[updated_area] = None
|
table_areas[updated_area] = None
|
||||||
|
average_textline_height = sum_textline_height / float(len(textlines))
|
||||||
|
|
||||||
# add some padding to table areas
|
# add some padding to table areas
|
||||||
table_areas_padded = {}
|
table_areas_padded = {}
|
||||||
for area in table_areas:
|
for area in table_areas:
|
||||||
table_areas_padded[pad(area)] = None
|
table_areas_padded[pad(area, average_textline_height)] = None
|
||||||
|
|
||||||
return table_areas_padded
|
return table_areas_padded
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -255,12 +255,9 @@ class Stream(BaseParser):
|
||||||
# TODO: add support for arabic text #141
|
# TODO: add support for arabic text #141
|
||||||
# sort textlines in reading order
|
# sort textlines in reading order
|
||||||
textlines.sort(key=lambda x: (-x.y0, x.x0))
|
textlines.sort(key=lambda x: (-x.y0, x.x0))
|
||||||
# group textlines into rows
|
|
||||||
text_grouped = self._group_rows(
|
|
||||||
self.horizontal_text, row_close_tol=self.row_close_tol)
|
|
||||||
textedges = TextEdges()
|
textedges = TextEdges()
|
||||||
# generate left, middle and right textedges
|
# generate left, middle and right textedges
|
||||||
textedges.generate(text_grouped)
|
textedges.generate(textlines)
|
||||||
# select relevant edges
|
# select relevant edges
|
||||||
relevant_textedges = textedges.get_relevant()
|
relevant_textedges = textedges.get_relevant()
|
||||||
# guess table areas using textlines and relevant edges
|
# guess table areas using textlines and relevant edges
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue