Remove f-strings, fix URL-based unit tests

f-strings fail unit tests in Python <3.7, so they were replaced with .format().
Made download_url send a Mozilla/5.0 User-Agent header to restore the unit
tests, since the targeted server was responding with 403.
pull/153/head
Frh 2020-04-25 21:14:56 -07:00
parent 016776939e
commit 2624010197
3 changed files with 9 additions and 47 deletions

View File

@ -68,8 +68,12 @@ class TextAlignment(object):
def __repr__(self): def __repr__(self):
text_inside = " | ".join( text_inside = " | ".join(
map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "") map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \ return "<TextEdge coord={coord} tl={tl_count} " \
f"textlines text='{text_inside}...'>" "textlines text='{text_inside}...'>".format(
coord=self.coord,
tl_count=len(self.textlines),
text_inside=text_inside
)
def register_aligned_textline(self, textline, coord): def register_aligned_textline(self, textline, coord):
"""Update new textline to this alignment, adapting its average.""" """Update new textline to this alignment, adapting its average."""

View File

@ -252,50 +252,6 @@ class TextNetworks(TextAlignments):
self._textline_to_alignments[textline] = alignments self._textline_to_alignments[textline] = alignments
alignments[align_id] = textedge.textlines alignments[align_id] = textedge.textlines
def _calculate_gaps_thresholds(self, percentile=75):
"""Identify reasonable gaps between lines and columns based
on gaps observed across alignments.
This can be used to reject cells as too far away from
the core table.
"""
h_gaps, v_gaps = [], []
for align_id in self._text_alignments:
edge_array = self._text_alignments[align_id]
gaps = []
vertical = align_id in HORIZONTAL_ALIGNMENTS
sort_function = (lambda tl: tl.y0) \
if vertical \
else (lambda tl: tl.x0)
for alignments in edge_array:
tls = sorted(
alignments.textlines,
key=sort_function,
reverse=True
)
for i in range(1, len(tls)):
# If the lines are vertically aligned (stacked up), we
# record the vertical gap between them
if vertical:
gap = tls[i-1].y1 - tls[i].y0
else:
gap = tls[i-1].x1 - tls[i].x0
gaps.append(gap)
if gaps:
if vertical:
v_gaps.append(np.percentile(gaps, percentile))
else:
h_gaps.append(np.percentile(gaps, percentile))
direction_str = 'vertical' if vertical else 'horizontal'
rounded_gaps = list(map(lambda x: round(x, 2), gaps))
print(
f"{direction_str} gaps found "
f"for {align_id}: "
f"{rounded_gaps} "
f"with {percentile}th percentile "
f"{np.percentile(gaps, percentile)}"
)
return max(h_gaps, default=None), max(v_gaps, default=None)
def _remove_unconnected_edges(self): def _remove_unconnected_edges(self):
"""Weed out elements which are only connected to others vertically """Weed out elements which are only connected to others vertically
or horizontally. There needs to be connections across both or horizontally. There needs to be connections across both

View File

@ -12,6 +12,7 @@ import tempfile
import warnings import warnings
from itertools import groupby from itertools import groupby
from operator import itemgetter from operator import itemgetter
from urllib.request import Request
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -96,7 +97,8 @@ def download_url(url):
""" """
filename = "{}.pdf".format(random_string(6)) filename = "{}.pdf".format(random_string(6))
with tempfile.NamedTemporaryFile("wb", delete=False) as f: with tempfile.NamedTemporaryFile("wb", delete=False) as f:
obj = urlopen(url) req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
obj = urlopen(req)
if PY3: if PY3:
content_type = obj.info().get_content_type() content_type = obj.info().get_content_type()
else: else: