Remove f-strings, fix url based unit tests

f-strings fail unit tests in Python <3.7, so replaced them with str.format().
Made download_url send a Mozilla/5.0 User-Agent header to restore unit tests,
since the targeted server was returning 403 responses.
pull/153/head
Frh 2020-04-25 21:14:56 -07:00
parent 81de841ca0
commit 9eb4f65fc9
3 changed files with 9 additions and 47 deletions

View File

@ -68,8 +68,12 @@ class TextAlignment(object):
def __repr__(self):
# Build a short preview from the text of the first two textlines, joined
# with " | " and with newline characters removed.
text_inside = " | ".join(
map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
# NOTE(review): the next two lines look like the pre-change (f-string)
# return of this diff hunk; the +/- markers are not visible here - confirm.
return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \
f"textlines text='{text_inside}...'>"
# NOTE(review): presumably the replacement return; it fills the same
# template (coord, number of textlines, text preview) via str.format()
# instead of an f-string - confirm against the commit.
return "<TextEdge coord={coord} tl={tl_count} " \
"textlines text='{text_inside}...'>".format(
coord=self.coord,
tl_count=len(self.textlines),
text_inside=text_inside
)
def register_aligned_textline(self, textline, coord):
"""Update new textline to this alignment, adapting its average."""

View File

@ -252,50 +252,6 @@ class TextNetworks(TextAlignments):
self._textline_to_alignments[textline] = alignments
alignments[align_id] = textedge.textlines
# NOTE(review): indentation is not visible in this view, so the nesting
# described in the comments below is inferred from statement order - confirm
# against the original file before relying on it.
# `percentile` is forwarded unchanged to np.percentile (default 75).
def _calculate_gaps_thresholds(self, percentile=75):
"""Identify reasonable gaps between lines and columns based
on gaps observed across alignments.
This can be used to reject cells as too far away from
the core table.
"""
# Percentile gaps collected per direction; the function returns the
# maximum of each list.
h_gaps, v_gaps = [], []
for align_id in self._text_alignments:
edge_array = self._text_alignments[align_id]
gaps = []
# NOTE(review): `vertical` is True when align_id belongs to
# HORIZONTAL_ALIGNMENTS (defined outside this view); presumably textlines
# sharing a horizontal alignment are stacked vertically - confirm.
vertical = align_id in HORIZONTAL_ALIGNMENTS
# Sort key: y0 when `vertical`, x0 otherwise.
sort_function = (lambda tl: tl.y0) \
if vertical \
else (lambda tl: tl.x0)
for alignments in edge_array:
# Order the textlines by that coordinate, largest first.
tls = sorted(
alignments.textlines,
key=sort_function,
reverse=True
)
# Compare each textline with the one preceding it in sorted order.
for i in range(1, len(tls)):
# If the lines are vertically aligned (stacked up), we
# record the vertical gap between them
if vertical:
gap = tls[i-1].y1 - tls[i].y0
else:
gap = tls[i-1].x1 - tls[i].x0
gaps.append(gap)
# Only when at least one gap was measured: store the requested percentile
# of the gaps in the list matching the direction.
if gaps:
if vertical:
v_gaps.append(np.percentile(gaps, percentile))
else:
h_gaps.append(np.percentile(gaps, percentile))
# Debug output written to stdout with f-strings: direction, alignment id,
# the gaps rounded to 2 decimals, and the percentile value (recomputed here).
direction_str = 'vertical' if vertical else 'horizontal'
rounded_gaps = list(map(lambda x: round(x, 2), gaps))
print(
f"{direction_str} gaps found "
f"for {align_id}: "
f"{rounded_gaps} "
f"with {percentile}th percentile "
f"{np.percentile(gaps, percentile)}"
)
# Returns (largest horizontal gap, largest vertical gap); each element is
# None when its list is empty.
return max(h_gaps, default=None), max(v_gaps, default=None)
def _remove_unconnected_edges(self):
"""Weed out elements which are only connected to others vertically
or horizontally. There needs to be connections across both

View File

@ -12,6 +12,7 @@ import tempfile
import warnings
from itertools import groupby
from operator import itemgetter
from urllib.request import Request
import numpy as np
import pandas as pd
@ -96,7 +97,8 @@ def download_url(url):
"""
filename = "{}.pdf".format(random_string(6))
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
obj = urlopen(url)
req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
obj = urlopen(req)
if PY3:
content_type = obj.info().get_content_type()
else: