Fix comments

pull/2/head
Vinayak Mehta 2018-11-23 02:51:22 +05:30
parent 9b67b271e4
commit a1e1fd781d
2 changed files with 14 additions and 11 deletions

View File

@ -10,10 +10,12 @@ import numpy as np
import pandas as pd import pandas as pd
# minimum number of textlines to be considered a textedge # minimum number of vertical textline intersections for a textedge
# to be considered valid
TEXTEDGE_REQUIRED_ELEMENTS = 4 TEXTEDGE_REQUIRED_ELEMENTS = 4
# y coordinate tolerance for extending text edge # y coordinate tolerance for extending textedge
TEXTEDGE_EXTEND_TOLERANCE = 50 TEXTEDGE_EXTEND_TOLERANCE = 50
# TODO: deal in percentages instead of absolutes
# padding added to table area's lt and rb # padding added to table area's lt and rb
TABLE_AREA_PADDING = 10 TABLE_AREA_PADDING = 10
@ -36,7 +38,8 @@ class TextEdge(object):
self.x = (self.intersections * self.x + x) / float(self.intersections + 1) self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
self.y0 = y0 self.y0 = y0
self.intersections += 1 self.intersections += 1
# a textedge is valid if it extends uninterrupted over required_elements # a textedge is valid only if it extends uninterrupted
# over a required number of textlines
if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS: if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
self.is_valid = True self.is_valid = True
@ -89,8 +92,8 @@ class TextEdges(object):
} }
# TODO: naive # TODO: naive
# get the vertical textedges that intersect maximum number of # get vertical textedges that intersect maximum number of
# times with horizontal text rows # times with horizontal textlines
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0] relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
return self._textedges[relevant_align] return self._textedges[relevant_align]
@ -130,8 +133,8 @@ class TextEdges(object):
# extend table areas based on textlines that overlap # extend table areas based on textlines that overlap
# vertically. it's possible that these textlines were # vertically. it's possible that these textlines were
# eliminated during textedges generation since numbers and # eliminated during textedges generation since numbers and
# sentences/chars are often aligned differently. # chars/words/sentences are often aligned differently.
# drawback: table areas that have paragraphs to their sides # drawback: table areas that have paragraphs on their sides
# will include the paragraphs too. # will include the paragraphs too.
for tl in textlines: for tl in textlines:
found = None found = None

View File

@ -247,10 +247,10 @@ class Stream(BaseParser):
" should be equal") " should be equal")
def _nurminen_table_detection(self, textlines): def _nurminen_table_detection(self, textlines):
# an general heuristic implementation of the table detection # a general heuristic implementation of the table detection
# algorithm described in Anssi Nurminen's master's thesis: # algorithm described in Anssi Nurminen's master's thesis:
# https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 # https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
# assumes that tables vertically separated by some distance # assumes that tables are situated relatively apart vertically
# TODO: add support for arabic text #141 # TODO: add support for arabic text #141
# sort textlines in reading order # sort textlines in reading order
@ -263,9 +263,9 @@ class Stream(BaseParser):
textedges.generate(text_grouped) textedges.generate(text_grouped)
# select relevant edges # select relevant edges
relevant_textedges = textedges.get_relevant() relevant_textedges = textedges.get_relevant()
# guess table areas using relevant edges # guess table areas using textlines and relevant edges
table_bbox = textedges.get_table_areas(textlines, relevant_textedges) table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
# treat whole page as table if not table areas found # treat whole page as table area if no table areas found
if not len(table_bbox): if not len(table_bbox):
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}