From 9eb4f65fc937dd4271094d247a3428b8d4bab624 Mon Sep 17 00:00:00 2001 From: Frh Date: Sat, 25 Apr 2020 21:14:56 -0700 Subject: [PATCH] Remove f-strings, fix url based unit tests f-strings fail unit tests in Python <3.7, removed them for .format. Made download_url simulate Mozilla/5.0 to restore unit tests, since the targeted server was 403ing. --- camelot/core.py | 8 +++++-- camelot/parsers/hybrid.py | 44 --------------------------------------- camelot/utils.py | 4 +++- 3 files changed, 9 insertions(+), 47 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index 83ffea5..9263628 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -68,8 +68,12 @@ class TextAlignment(object): def __repr__(self): text_inside = " | ".join( map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "") - return f"" + return "".format( + coord=self.coord, + tl_count=len(self.textlines), + text_inside=text_inside + ) def register_aligned_textline(self, textline, coord): """Update new textline to this alignment, adapting its average.""" diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index 17b51a9..5ccdde6 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -252,50 +252,6 @@ class TextNetworks(TextAlignments): self._textline_to_alignments[textline] = alignments alignments[align_id] = textedge.textlines - def _calculate_gaps_thresholds(self, percentile=75): - """Identify reasonable gaps between lines and columns based - on gaps observed across alignments. - This can be used to reject cells as too far away from - the core table. 
- """ - h_gaps, v_gaps = [], [] - for align_id in self._text_alignments: - edge_array = self._text_alignments[align_id] - gaps = [] - vertical = align_id in HORIZONTAL_ALIGNMENTS - sort_function = (lambda tl: tl.y0) \ - if vertical \ - else (lambda tl: tl.x0) - for alignments in edge_array: - tls = sorted( - alignments.textlines, - key=sort_function, - reverse=True - ) - for i in range(1, len(tls)): - # If the lines are vertically aligned (stacked up), we - # record the vertical gap between them - if vertical: - gap = tls[i-1].y1 - tls[i].y0 - else: - gap = tls[i-1].x1 - tls[i].x0 - gaps.append(gap) - if gaps: - if vertical: - v_gaps.append(np.percentile(gaps, percentile)) - else: - h_gaps.append(np.percentile(gaps, percentile)) - direction_str = 'vertical' if vertical else 'horizontal' - rounded_gaps = list(map(lambda x: round(x, 2), gaps)) - print( - f"{direction_str} gaps found " - f"for {align_id}: " - f"{rounded_gaps} " - f"with {percentile}th percentile " - f"{np.percentile(gaps, percentile)}" - ) - return max(h_gaps, default=None), max(v_gaps, default=None) - def _remove_unconnected_edges(self): """Weed out elements which are only connected to others vertically or horizontally. There needs to be connections across both diff --git a/camelot/utils.py b/camelot/utils.py index 752e4b5..a675580 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -12,6 +12,7 @@ import tempfile import warnings from itertools import groupby from operator import itemgetter +from urllib.request import Request import numpy as np import pandas as pd @@ -96,7 +97,8 @@ def download_url(url): """ filename = "{}.pdf".format(random_string(6)) with tempfile.NamedTemporaryFile("wb", delete=False) as f: - obj = urlopen(url) + req = Request(url, headers={"User-Agent": "Mozilla/5.0"}) + obj = urlopen(req) if PY3: content_type = obj.info().get_content_type() else: