Fix comments
parent
9b67b271e4
commit
a1e1fd781d
|
|
@ -10,10 +10,12 @@ import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
# minimum number of textlines to be considered a textedge
|
# number of vertical textline intersections a textedge must exceed
|
||||||
|
# to be considered valid
|
||||||
TEXTEDGE_REQUIRED_ELEMENTS = 4
|
TEXTEDGE_REQUIRED_ELEMENTS = 4
|
||||||
# y coordinate tolerance for extending textedge
|
# y coordinate tolerance for extending textedge
|
||||||
TEXTEDGE_EXTEND_TOLERANCE = 50
|
TEXTEDGE_EXTEND_TOLERANCE = 50
|
||||||
|
# TODO: deal in percentages instead of absolutes
|
||||||
# padding added to table area's lt and rb
|
# padding added to table area's lt and rb
|
||||||
TABLE_AREA_PADDING = 10
|
TABLE_AREA_PADDING = 10
|
||||||
|
|
||||||
|
|
@ -36,7 +38,8 @@ class TextEdge(object):
|
||||||
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
|
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
|
||||||
self.y0 = y0
|
self.y0 = y0
|
||||||
self.intersections += 1
|
self.intersections += 1
|
||||||
# a textedge is valid if it extends uninterrupted over required_elements
|
# a textedge is valid only if it extends uninterrupted
|
||||||
|
# over more than TEXTEDGE_REQUIRED_ELEMENTS textlines
|
||||||
if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
|
if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
|
||||||
self.is_valid = True
|
self.is_valid = True
|
||||||
|
|
||||||
|
|
@ -89,8 +92,8 @@ class TextEdges(object):
|
||||||
}
|
}
|
||||||
|
|
||||||
# TODO: naive
|
# TODO: naive
|
||||||
# get the vertical textedges that intersect maximum number of
|
# get vertical textedges that intersect maximum number of
|
||||||
# times with horizontal text rows
|
# times with horizontal textlines
|
||||||
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
|
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
|
||||||
return self._textedges[relevant_align]
|
return self._textedges[relevant_align]
|
||||||
|
|
||||||
|
|
@ -130,8 +133,8 @@ class TextEdges(object):
|
||||||
# extend table areas based on textlines that overlap
|
# extend table areas based on textlines that overlap
|
||||||
# vertically. it's possible that these textlines were
|
# vertically. it's possible that these textlines were
|
||||||
# eliminated during textedges generation since numbers and
|
# eliminated during textedges generation since numbers and
|
||||||
# sentences/chars are often aligned differently.
|
# chars/words/sentences are often aligned differently.
|
||||||
# drawback: table areas that have paragraphs to their sides
|
# drawback: table areas that have paragraphs on their sides
|
||||||
# will include the paragraphs too.
|
# will include the paragraphs too.
|
||||||
for tl in textlines:
|
for tl in textlines:
|
||||||
found = None
|
found = None
|
||||||
|
|
|
||||||
|
|
@ -247,10 +247,10 @@ class Stream(BaseParser):
|
||||||
" should be equal")
|
" should be equal")
|
||||||
|
|
||||||
def _nurminen_table_detection(self, textlines):
|
def _nurminen_table_detection(self, textlines):
|
||||||
# an general heuristic implementation of the table detection
|
# a general heuristic implementation of the table detection
|
||||||
# algorithm described by Anssi Nurminen's master's thesis:
|
# algorithm described by Anssi Nurminen's master's thesis:
|
||||||
# https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
|
# https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
|
||||||
# assumes that tables vertically separated by some distance
|
# assumes that tables are situated relatively far apart vertically
|
||||||
|
|
||||||
# TODO: add support for arabic text #141
|
# TODO: add support for arabic text #141
|
||||||
# sort textlines in reading order
|
# sort textlines in reading order
|
||||||
|
|
@ -263,9 +263,9 @@ class Stream(BaseParser):
|
||||||
textedges.generate(text_grouped)
|
textedges.generate(text_grouped)
|
||||||
# select relevant edges
|
# select relevant edges
|
||||||
relevant_textedges = textedges.get_relevant()
|
relevant_textedges = textedges.get_relevant()
|
||||||
# guess table areas using relevant edges
|
# guess table areas using textlines and relevant edges
|
||||||
table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
|
table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
|
||||||
# treat whole page as table if not table areas found
|
# treat whole page as table area if no table areas found
|
||||||
if not len(table_bbox):
|
if not len(table_bbox):
|
||||||
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
|
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue