Merge 8beb8d79bf into 644bbe7c6d
commit
59de5fe7fc
|
|
@ -1,5 +1,5 @@
|
||||||
fontconfig/
|
fontconfig/
|
||||||
__pycache__/
|
__pycache__/*
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
*.so
|
*.so
|
||||||
|
|
||||||
|
|
@ -18,3 +18,5 @@ htmlcov/
|
||||||
|
|
||||||
# vscode
|
# vscode
|
||||||
.vscode
|
.vscode
|
||||||
|
.env
|
||||||
|
changelog.txt
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,3 @@
|
||||||
Be cordial or be on your way. --Kenneth Reitz
|
Be cordial or be on your way. --Kenneth Reitz
|
||||||
|
|
||||||
https://www.kennethreitz.org/essays/be-cordial-or-be-on-your-way
|
https://kennethreitz.org/essays/2013/01/27/be-cordial-or-be-on-your-way
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ The following quote sums up the **Code Of Conduct**.
|
||||||
|
|
||||||
> Be cordial or be on your way. --Kenneth Reitz
|
> Be cordial or be on your way. --Kenneth Reitz
|
||||||
|
|
||||||
Kenneth Reitz has also written an [essay](https://www.kennethreitz.org/essays/be-cordial-or-be-on-your-way) on this topic, which you should read.
|
Kenneth Reitz has also written an [essay](https://kennethreitz.org/essays/2013/01/27/be-cordial-or-be-on-your-way) on this topic, which you should read.
|
||||||
|
|
||||||
As the [Requests Code Of Conduct](http://docs.python-requests.org/en/master/dev/contributing/#be-cordial) states, **all contributions are welcome**, as long as everyone involved is treated with respect.
|
As the [Requests Code Of Conduct](http://docs.python-requests.org/en/master/dev/contributing/#be-cordial) states, **all contributions are welcome**, as long as everyone involved is treated with respect.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -524,12 +524,12 @@ class Table(object):
|
||||||
|
|
||||||
def set_border(self):
|
def set_border(self):
|
||||||
"""Sets table border edges to True."""
|
"""Sets table border edges to True."""
|
||||||
for r in range(len(self.rows)):
|
for index, row in enumerate(self.rows):
|
||||||
self.cells[r][0].left = True
|
self.cells[index][0].left = True
|
||||||
self.cells[r][len(self.cols) - 1].right = True
|
self.cells[index][len(self.cols) - 1].right = True
|
||||||
for c in range(len(self.cols)):
|
for index, col in enumerate(self.cols):
|
||||||
self.cells[0][c].top = True
|
self.cells[0][index].top = True
|
||||||
self.cells[len(self.rows) - 1][c].bottom = True
|
self.cells[len(self.rows) - 1][index].bottom = True
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def set_span(self):
|
def set_span(self):
|
||||||
|
|
|
||||||
|
|
@ -35,9 +35,7 @@ class PDFHandler(object):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, filepath, pages="1", password=None):
|
def __init__(self, filepath, pages="1", password=None):
|
||||||
if is_url(filepath):
|
self.filepath = download_url(filepath) if is_url(filepath) else filepath
|
||||||
filepath = download_url(filepath)
|
|
||||||
self.filepath = filepath
|
|
||||||
if not filepath.lower().endswith(".pdf"):
|
if not filepath.lower().endswith(".pdf"):
|
||||||
raise NotImplementedError("File format not supported")
|
raise NotImplementedError("File format not supported")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -162,7 +162,7 @@ class Lattice(BaseParser):
|
||||||
return backend
|
return backend
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _reduce_index(t, idx, shift_text):
|
def _reduce_index(table, idx, shift_text):
|
||||||
"""Reduces index of a text object if it lies within a spanning
|
"""Reduces index of a text object if it lies within a spanning
|
||||||
cell.
|
cell.
|
||||||
|
|
||||||
|
|
@ -187,32 +187,28 @@ class Lattice(BaseParser):
|
||||||
indices = []
|
indices = []
|
||||||
for r_idx, c_idx, text in idx:
|
for r_idx, c_idx, text in idx:
|
||||||
for d in shift_text:
|
for d in shift_text:
|
||||||
if d == "l":
|
if d == "l" and table.cells[r_idx][c_idx].hspan:
|
||||||
if t.cells[r_idx][c_idx].hspan:
|
while not table.cells[r_idx][c_idx].left:
|
||||||
while not t.cells[r_idx][c_idx].left:
|
|
||||||
c_idx -= 1
|
c_idx -= 1
|
||||||
if d == "r":
|
if d == "r" and table.cells[r_idx][c_idx].hspan:
|
||||||
if t.cells[r_idx][c_idx].hspan:
|
while not table.cells[r_idx][c_idx].right:
|
||||||
while not t.cells[r_idx][c_idx].right:
|
|
||||||
c_idx += 1
|
c_idx += 1
|
||||||
if d == "t":
|
if d == "t" and table.cells[r_idx][c_idx].vspan:
|
||||||
if t.cells[r_idx][c_idx].vspan:
|
while not table.cells[r_idx][c_idx].top:
|
||||||
while not t.cells[r_idx][c_idx].top:
|
|
||||||
r_idx -= 1
|
r_idx -= 1
|
||||||
if d == "b":
|
if d == "b" and table.cells[r_idx][c_idx].vspan:
|
||||||
if t.cells[r_idx][c_idx].vspan:
|
while not table.cells[r_idx][c_idx].bottom:
|
||||||
while not t.cells[r_idx][c_idx].bottom:
|
|
||||||
r_idx += 1
|
r_idx += 1
|
||||||
indices.append((r_idx, c_idx, text))
|
indices.append((r_idx, c_idx, text))
|
||||||
return indices
|
return indices
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _copy_spanning_text(t, copy_text=None):
|
def _copy_spanning_text(table, copy_text=None):
|
||||||
"""Copies over text in empty spanning cells.
|
"""Copies over text in empty spanning cells.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
t : camelot.core.Table
|
table : camelot.core.Table
|
||||||
copy_text : list, optional (default: None)
|
copy_text : list, optional (default: None)
|
||||||
{'h', 'v'}
|
{'h', 'v'}
|
||||||
Select one or more strings from above and pass them as a list
|
Select one or more strings from above and pass them as a list
|
||||||
|
|
@ -221,23 +217,23 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
t : camelot.core.Table
|
table : camelot.core.Table
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for f in copy_text:
|
for f in copy_text:
|
||||||
if f == "h":
|
if f == "h":
|
||||||
for i in range(len(t.cells)):
|
for i in range(len(table.cells)):
|
||||||
for j in range(len(t.cells[i])):
|
for j in range(len(table.cells[i])):
|
||||||
if t.cells[i][j].text.strip() == "":
|
if table.cells[i][j].text.strip() == "":
|
||||||
if t.cells[i][j].hspan and not t.cells[i][j].left:
|
if table.cells[i][j].hspan and not table.cells[i][j].left:
|
||||||
t.cells[i][j].text = t.cells[i][j - 1].text
|
table.cells[i][j].text = table.cells[i][j - 1].text
|
||||||
elif f == "v":
|
elif f == "v":
|
||||||
for i in range(len(t.cells)):
|
for i in range(len(table.cells)):
|
||||||
for j in range(len(t.cells[i])):
|
for j in range(len(table.cells[i])):
|
||||||
if t.cells[i][j].text.strip() == "":
|
if table.cells[i][j].text.strip() == "":
|
||||||
if t.cells[i][j].vspan and not t.cells[i][j].top:
|
if table.cells[i][j].vspan and not table.cells[i][j].top:
|
||||||
t.cells[i][j].text = t.cells[i - 1][j].text
|
table.cells[i][j].text = table.cells[i - 1][j].text
|
||||||
return t
|
return table
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
def scale_areas(areas):
|
def scale_areas(areas):
|
||||||
|
|
|
||||||
|
|
@ -81,7 +81,10 @@ def download_url(url):
|
||||||
"""
|
"""
|
||||||
filename = f"{random_string(6)}.pdf"
|
filename = f"{random_string(6)}.pdf"
|
||||||
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
||||||
headers = {"User-Agent": "Mozilla/5.0"}
|
headers = {
|
||||||
|
"User-Agent": "Mozilla/5.0",
|
||||||
|
"Accept-Encoding": "gzip;q=1.0, deflate;q=0.9, br;q=0.8, compress;q=0.7, *;q=0.1"
|
||||||
|
}
|
||||||
request = Request(url, None, headers)
|
request = Request(url, None, headers)
|
||||||
obj = urlopen(request)
|
obj = urlopen(request)
|
||||||
content_type = obj.info().get_content_type()
|
content_type = obj.info().get_content_type()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue