From a1e1fd781d7cdf39707f825a2851ff376e9ff5dd Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Fri, 23 Nov 2018 02:51:22 +0530 Subject: [PATCH] Fix comments --- camelot/core.py | 17 ++++++++++------- camelot/parsers/stream.py | 8 ++++---- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index 44aff2b..e0687d2 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -10,10 +10,12 @@ import numpy as np import pandas as pd -# minimum number of textlines to be considered a textedge +# minimum number of vertical textline intersections for a textedge +# to be considered valid TEXTEDGE_REQUIRED_ELEMENTS = 4 -# y coordinate tolerance for extending text edge +# y coordinate tolerance for extending textedge TEXTEDGE_EXTEND_TOLERANCE = 50 +# TODO: deal in percentages instead of absolutes # padding added to table area's lt and rb TABLE_AREA_PADDING = 10 @@ -36,7 +38,8 @@ class TextEdge(object): self.x = (self.intersections * self.x + x) / float(self.intersections + 1) self.y0 = y0 self.intersections += 1 - # a textedge is valid if it extends uninterrupted over required_elements + # a textedge is valid only if it extends uninterrupted + # over a required number of textlines if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS: self.is_valid = True @@ -89,8 +92,8 @@ class TextEdges(object): } # TODO: naive - # get the vertical textedges that intersect maximum number of - # times with horizontal text rows + # get vertical textedges that intersect maximum number of + # times with horizontal textlines relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0] return self._textedges[relevant_align] @@ -130,8 +133,8 @@ class TextEdges(object): # extend table areas based on textlines that overlap # vertically. it's possible that these textlines were # eliminated during textedges generation since numbers and - # sentences/chars are often aligned differently. 
- # drawback: table areas that have paragraphs to their sides + # chars/words/sentences are often aligned differently. + # drawback: table areas that have paragraphs on their sides # will include the paragraphs too. for tl in textlines: found = None diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 2aa5fc4..8f86dbd 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -247,10 +247,10 @@ class Stream(BaseParser): " should be equal") def _nurminen_table_detection(self, textlines): - # an general heuristic implementation of the table detection + # a general heuristic implementation of the table detection # algorithm described by Anssi Nurminen's master's thesis: # https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 - # assumes that tables vertically separated by some distance + # assumes that tables are vertically separated by some distance # TODO: add support for arabic text #141 # sort textlines in reading order @@ -263,9 +263,9 @@ class Stream(BaseParser): textedges.generate(text_grouped) # select relevant edges relevant_textedges = textedges.get_relevant() - # guess table areas using relevant edges + # guess table areas using textlines and relevant edges table_bbox = textedges.get_table_areas(textlines, relevant_textedges) - # treat whole page as table if not table areas found + # treat whole page as table area if no table areas found if not len(table_bbox): table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}