Remove f-strings, fix URL-based unit tests
f-strings fail unit tests in Python <3.7, so they were replaced with str.format(). Made download_url send a Mozilla/5.0 User-Agent header to restore the URL-based unit tests, since the targeted server was responding with HTTP 403.
pull/153/head
parent
016776939e
commit
2624010197
|
|
@ -68,8 +68,12 @@ class TextAlignment(object):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
text_inside = " | ".join(
|
text_inside = " | ".join(
|
||||||
map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
|
map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
|
||||||
return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \
|
return "<TextEdge coord={coord} tl={tl_count} " \
|
||||||
f"textlines text='{text_inside}...'>"
|
"textlines text='{text_inside}...'>".format(
|
||||||
|
coord=self.coord,
|
||||||
|
tl_count=len(self.textlines),
|
||||||
|
text_inside=text_inside
|
||||||
|
)
|
||||||
|
|
||||||
def register_aligned_textline(self, textline, coord):
|
def register_aligned_textline(self, textline, coord):
|
||||||
"""Update new textline to this alignment, adapting its average."""
|
"""Update new textline to this alignment, adapting its average."""
|
||||||
|
|
|
||||||
|
|
@ -252,50 +252,6 @@ class TextNetworks(TextAlignments):
|
||||||
self._textline_to_alignments[textline] = alignments
|
self._textline_to_alignments[textline] = alignments
|
||||||
alignments[align_id] = textedge.textlines
|
alignments[align_id] = textedge.textlines
|
||||||
|
|
||||||
def _calculate_gaps_thresholds(self, percentile=75):
|
|
||||||
"""Identify reasonable gaps between lines and columns based
|
|
||||||
on gaps observed across alignments.
|
|
||||||
This can be used to reject cells as too far away from
|
|
||||||
the core table.
|
|
||||||
"""
|
|
||||||
h_gaps, v_gaps = [], []
|
|
||||||
for align_id in self._text_alignments:
|
|
||||||
edge_array = self._text_alignments[align_id]
|
|
||||||
gaps = []
|
|
||||||
vertical = align_id in HORIZONTAL_ALIGNMENTS
|
|
||||||
sort_function = (lambda tl: tl.y0) \
|
|
||||||
if vertical \
|
|
||||||
else (lambda tl: tl.x0)
|
|
||||||
for alignments in edge_array:
|
|
||||||
tls = sorted(
|
|
||||||
alignments.textlines,
|
|
||||||
key=sort_function,
|
|
||||||
reverse=True
|
|
||||||
)
|
|
||||||
for i in range(1, len(tls)):
|
|
||||||
# If the lines are vertically aligned (stacked up), we
|
|
||||||
# record the vertical gap between them
|
|
||||||
if vertical:
|
|
||||||
gap = tls[i-1].y1 - tls[i].y0
|
|
||||||
else:
|
|
||||||
gap = tls[i-1].x1 - tls[i].x0
|
|
||||||
gaps.append(gap)
|
|
||||||
if gaps:
|
|
||||||
if vertical:
|
|
||||||
v_gaps.append(np.percentile(gaps, percentile))
|
|
||||||
else:
|
|
||||||
h_gaps.append(np.percentile(gaps, percentile))
|
|
||||||
direction_str = 'vertical' if vertical else 'horizontal'
|
|
||||||
rounded_gaps = list(map(lambda x: round(x, 2), gaps))
|
|
||||||
print(
|
|
||||||
f"{direction_str} gaps found "
|
|
||||||
f"for {align_id}: "
|
|
||||||
f"{rounded_gaps} "
|
|
||||||
f"with {percentile}th percentile "
|
|
||||||
f"{np.percentile(gaps, percentile)}"
|
|
||||||
)
|
|
||||||
return max(h_gaps, default=None), max(v_gaps, default=None)
|
|
||||||
|
|
||||||
def _remove_unconnected_edges(self):
|
def _remove_unconnected_edges(self):
|
||||||
"""Weed out elements which are only connected to others vertically
|
"""Weed out elements which are only connected to others vertically
|
||||||
or horizontally. There needs to be connections across both
|
or horizontally. There needs to be connections across both
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ import tempfile
|
||||||
import warnings
|
import warnings
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
from urllib.request import Request
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
@ -96,7 +97,8 @@ def download_url(url):
|
||||||
"""
|
"""
|
||||||
filename = "{}.pdf".format(random_string(6))
|
filename = "{}.pdf".format(random_string(6))
|
||||||
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
||||||
obj = urlopen(url)
|
req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
|
||||||
|
obj = urlopen(req)
|
||||||
if PY3:
|
if PY3:
|
||||||
content_type = obj.info().get_content_type()
|
content_type = obj.info().get_content_type()
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue