Remove f-strings, fix url based unit tests
f-strings fail the unit tests in Python <3.7, so they were replaced with str.format. Made download_url send a Mozilla/5.0 User-Agent to restore the unit tests, since the targeted server was returning 403. pull/153/head
parent
81de841ca0
commit
9eb4f65fc9
|
|
@ -68,8 +68,12 @@ class TextAlignment(object):
|
|||
def __repr__(self):
    """Return a short debug representation of this text edge.

    The string shows the edge coordinate, the number of textlines
    attached to it, and a preview made of the text of the first two
    textlines (joined with " | ", newlines stripped).
    """
    # Only the first two textlines are previewed to keep the repr short.
    preview = " | ".join(
        textline.get_text() for textline in self.textlines[:2]
    ).replace("\n", "")
    # str.format is used instead of an f-string so the module still
    # parses on interpreters that predate f-strings.
    return (
        "<TextEdge coord={coord} tl={tl_count} "
        "textlines text='{text_inside}...'>"
    ).format(
        coord=self.coord,
        tl_count=len(self.textlines),
        text_inside=preview,
    )
|
||||
|
||||
def register_aligned_textline(self, textline, coord):
|
||||
"""Update new textline to this alignment, adapting its average."""
|
||||
|
|
|
|||
|
|
@ -252,50 +252,6 @@ class TextNetworks(TextAlignments):
|
|||
self._textline_to_alignments[textline] = alignments
|
||||
alignments[align_id] = textedge.textlines
|
||||
|
||||
def _calculate_gaps_thresholds(self, percentile=75):
    """Identify reasonable gaps between lines and columns based
    on gaps observed across alignments.

    This can be used to reject cells as too far away from
    the core table.

    Parameters
    ----------
    percentile : int or float, optional (default: 75)
        Percentile of the gaps observed for one alignment type that is
        kept as that alignment's candidate threshold.

    Returns
    -------
    tuple
        ``(h_gap, v_gap)``: the largest candidate threshold recorded in
        each direction, or None for a direction where no gap was seen.

    """
    h_gaps, v_gaps = [], []
    for align_id in self._text_alignments:
        edge_array = self._text_alignments[align_id]
        gaps = []
        # Alignment ids listed in HORIZONTAL_ALIGNMENTS are treated as
        # stacks of lines, so the y axis is measured for them.
        vertical = align_id in HORIZONTAL_ALIGNMENTS
        sort_function = (lambda tl: tl.y0) \
            if vertical \
            else (lambda tl: tl.x0)
        for alignments in edge_array:
            tls = sorted(
                alignments.textlines,
                key=sort_function,
                reverse=True
            )
            for i in range(1, len(tls)):
                # If the lines are vertically aligned (stacked up), we
                # record the vertical gap between them
                if vertical:
                    gap = tls[i-1].y1 - tls[i].y0
                else:
                    gap = tls[i-1].x1 - tls[i].x0
                gaps.append(gap)
        if gaps:
            # Computed once and reused for both the result and the
            # debug message below.
            threshold = np.percentile(gaps, percentile)
            if vertical:
                v_gaps.append(threshold)
            else:
                h_gaps.append(threshold)
            direction_str = 'vertical' if vertical else 'horizontal'
            rounded_gaps = [round(gap, 2) for gap in gaps]
            # str.format rather than an f-string, so the module still
            # parses on interpreters that predate f-strings.
            print(
                "{direction_str} gaps found "
                "for {align_id}: "
                "{rounded_gaps} "
                "with {percentile}th percentile "
                "{threshold}".format(
                    direction_str=direction_str,
                    align_id=align_id,
                    rounded_gaps=rounded_gaps,
                    percentile=percentile,
                    threshold=threshold
                )
            )
    return max(h_gaps, default=None), max(v_gaps, default=None)
|
||||
|
||||
def _remove_unconnected_edges(self):
|
||||
"""Weed out elements which are only connected to others vertically
|
||||
or horizontally. There needs to be connections across both
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ import tempfile
|
|||
import warnings
|
||||
from itertools import groupby
|
||||
from operator import itemgetter
|
||||
from urllib.request import Request
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
|
@ -96,7 +97,8 @@ def download_url(url):
|
|||
"""
|
||||
filename = "{}.pdf".format(random_string(6))
|
||||
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
||||
obj = urlopen(url)
|
||||
req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
|
||||
obj = urlopen(req)
|
||||
if PY3:
|
||||
content_type = obj.info().get_content_type()
|
||||
else:
|
||||
|
|
|
|||
Loading…
Reference in New Issue