Fix comments
parent
9b67b271e4
commit
a1e1fd781d
|
|
@ -10,10 +10,12 @@ import numpy as np
|
|||
import pandas as pd
|
||||
|
||||
|
||||
# minimum number of textlines to be considered a textedge
|
||||
# minimum number of vertical textline intersections for a textedge
|
||||
# to be considered valid
|
||||
TEXTEDGE_REQUIRED_ELEMENTS = 4
|
||||
# y coordinate tolerance for extending text edge
|
||||
# y coordinate tolerance for extending textedge
|
||||
TEXTEDGE_EXTEND_TOLERANCE = 50
|
||||
# TODO: deal in percentages instead of absolutes
|
||||
# padding added to table area's left-top (lt) and right-bottom (rb) corners
|
||||
TABLE_AREA_PADDING = 10
|
||||
|
||||
|
|
@ -36,7 +38,8 @@ class TextEdge(object):
|
|||
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
|
||||
self.y0 = y0
|
||||
self.intersections += 1
|
||||
# a textedge is valid if it extends uninterrupted over required_elements
|
||||
# a textedge is valid only if it extends uninterrupted
|
||||
# over a required number of textlines
|
||||
if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
|
||||
self.is_valid = True
|
||||
|
||||
|
|
@ -89,8 +92,8 @@ class TextEdges(object):
|
|||
}
|
||||
|
||||
# TODO: naive
|
||||
# get the vertical textedges that intersect maximum number of
|
||||
# times with horizontal text rows
|
||||
# get vertical textedges that intersect maximum number of
|
||||
# times with horizontal textlines
|
||||
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
|
||||
return self._textedges[relevant_align]
|
||||
|
||||
|
|
@ -130,8 +133,8 @@ class TextEdges(object):
|
|||
# extend table areas based on textlines that overlap
|
||||
# vertically. it's possible that these textlines were
|
||||
# eliminated during textedges generation since numbers and
|
||||
# sentences/chars are often aligned differently.
|
||||
# drawback: table areas that have paragraphs to their sides
|
||||
# chars/words/sentences are often aligned differently.
|
||||
# drawback: table areas that have paragraphs on their sides
|
||||
# will include the paragraphs too.
|
||||
for tl in textlines:
|
||||
found = None
|
||||
|
|
|
|||
|
|
@ -247,10 +247,10 @@ class Stream(BaseParser):
|
|||
" should be equal")
|
||||
|
||||
def _nurminen_table_detection(self, textlines):
|
||||
# an general heuristic implementation of the table detection
|
||||
# a general heuristic implementation of the table detection
|
||||
# algorithm described in Anssi Nurminen's master's thesis:
|
||||
# https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
|
||||
# assumes that tables vertically separated by some distance
|
||||
# assumes that tables are situated relatively apart vertically
|
||||
|
||||
# TODO: add support for Arabic text #141
|
||||
# sort textlines in reading order
|
||||
|
|
@ -263,9 +263,9 @@ class Stream(BaseParser):
|
|||
textedges.generate(text_grouped)
|
||||
# select relevant edges
|
||||
relevant_textedges = textedges.get_relevant()
|
||||
# guess table areas using relevant edges
|
||||
# guess table areas using textlines and relevant edges
|
||||
table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
|
||||
# treat whole page as table if not table areas found
|
||||
# treat whole page as table area if no table areas found
|
||||
if not len(table_bbox):
|
||||
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue