Fix comments

2018-11-23 02:51:22 +05:30 · 2018-11-23 02:51:22 +05:30 · a1e1fd781d
parent 9b67b271e4
commit a1e1fd781d
2 changed files with 14 additions and 11 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -10,10 +10,12 @@ import numpy as np
 import pandas as pd


-# minimum number of textlines to be considered a textedge
+# minimum number of vertical textline intersections for a textedge
+# to be considered valid
 TEXTEDGE_REQUIRED_ELEMENTS = 4
-# y coordinate tolerance for extending text edge
+# y coordinate tolerance for extending textedge
 TEXTEDGE_EXTEND_TOLERANCE = 50
+# TODO: deal in percentages instead of absolutes
 # padding added to table area's lt and rb
 TABLE_AREA_PADDING = 10

@@ -36,7 +38,8 @@ class TextEdge(object):
            self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
            self.y0 = y0
            self.intersections += 1
-            # a textedge is valid if it extends uninterrupted over required_elements
+            # a textedge is valid only if it extends uninterrupted
+            # over a required number of textlines
            if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
                self.is_valid = True

@@ -89,8 +92,8 @@ class TextEdges(object):
        }

        # TODO: naive
-        # get the vertical textedges that intersect maximum number of
-        # times with horizontal text rows
+        # get the vertical textedges that intersect the maximum number
+        # of times with horizontal textlines
        relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
        return self._textedges[relevant_align]

@@ -130,8 +133,8 @@ class TextEdges(object):
        # extend table areas based on textlines that overlap
        # vertically. it's possible that these textlines were
        # eliminated during textedges generation since numbers and
-        # sentences/chars are often aligned differently.
-        # drawback: table areas that have paragraphs to their sides
+        # chars/words/sentences are often aligned differently.
+        # drawback: table areas that have paragraphs on their sides
        # will include the paragraphs too.
        for tl in textlines:
            found = None
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -247,10 +247,10 @@ class Stream(BaseParser):
                                 " should be equal")

    def _nurminen_table_detection(self, textlines):
-        # an general heuristic implementation of the table detection
+        # a general heuristic implementation of the table detection
        # algorithm described by Anssi Nurminen's master's thesis:
        # https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
-        # assumes that tables vertically separated by some distance
+        # assumes that tables are situated relatively far apart vertically

        # TODO: add support for arabic text #141
        # sort textlines in reading order
@@ -263,9 +263,9 @@ class Stream(BaseParser):
        textedges.generate(text_grouped)
        # select relevant edges
        relevant_textedges = textedges.get_relevant()
-        # guess table areas using relevant edges
+        # guess table areas using textlines and relevant edges
        table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
-        # treat whole page as table if not table areas found
+        # treat whole page as table area if no table areas found
        if not len(table_bbox):
            table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}