pull/112/merge
Jose Vargas 2020-10-25 23:33:15 +09:00 committed by GitHub
commit 692b8fcf57
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 364 additions and 10 deletions

View File

@ -335,16 +335,18 @@ class Stream(BaseParser):
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped]
if self.columns is not None and self.columns[table_idx] != "":
# user has to input boundary columns too
# take (0, pdf_width) by default
# similar to else condition
# len can't be 1
cols = self.columns[table_idx].split(",")
cols = [float(c) for c in cols]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
if self.columns is not None:
column_idx = table_idx if table_idx < len(self.columns) else -1
if self.columns[column_idx] != "":
# user has to input boundary columns too
# take (0, pdf_width) by default
# similar to else condition
# len can't be 1
cols = self.columns[column_idx].split(",")
cols = [float(c) for c in cols]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else:
# calculate mode of the list of number of elements in
# each row to guess the number of columns

View File

@ -2798,3 +2798,342 @@ data_stream_layout_kwargs = [
["A.O.P Cornas", ""],
["Domaine Lionnet « Terre Brûlée » 2012", "15 €"],
]
# Expected contents of the first table extracted from
# "canada-manitoba-bills.pdf"; it is wrapped in a DataFrame and compared
# against tables[0].df in test_stream_all_pages_with_known_column_coordinates.
# Every row holds 11 cells, which matches the 10 column separators that test
# passes to the parser. Text that wraps over several lines in a cell appears
# here as several consecutive rows, with "" for cells that are empty on that
# line.
canada_manitoba_bills_first_table = [
["", "", "", "", "(November 19, 2019 to present)", "", "", "", "", "", ""],
[
"Bill",
"Title",
"Sponsor",
"1st",
"2nd",
"Committee/",
"Amended",
"Report",
"Concurrence",
"Royal Assent",
"In",
],
[
"No.",
"",
"",
"Reading",
"Reading",
"Reported",
"",
"Stage",
"and 3rd",
"",
"Effect",
],
["", "", "", "", "", "", "", "Amend.", "Reading", "", ""],
["", "", "", "", "GOVERNMENT BILLS", "", "", "", "", "", ""],
[
"1",
"An Act respecting the Administration of",
"Hon. Mr.",
"Nov. 19,",
"",
"FORMAL BILL",
"",
"",
"",
"",
"",
],
[
"",
"Oaths of Office/Loi sur la prestation des",
"PALLISTER",
"2019",
"",
"(not printed)",
"",
"",
"",
"",
"",
],
["", "serments d'entrée en fonction", "", "", "", "", "", "", "", "", ""],
[
"2",
"The Retail Business Hours of Operation",
"Hon. Mr.",
"Nov. 22,",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"Act (Various Acts Amended or",
"FIELDING",
"2019",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"Repealed)/Loi sur les heures d'ouverture",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"des commerces de détail (modification ou",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
["", "abrogation de diverses lois)", "", "", "", "", "", "", "", "", ""],
[
"3",
"The Liquor, Gaming and Cannabis Control",
"Hon. Mr.",
"Nov. 22,",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"Amendment Act (Cannabis Social",
"CULLEN",
"2019",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"Responsibility Fee)/Loi modifiant la Loi",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"sur la réglementation des alcools, des jeux",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
["", "et du cannabis (taxe de responsabilité", "", "", "", "", "", "", "", "", ""],
["", "sociale en matière de cannabis)", "", "", "", "", "", "", "", "", ""],
[
"4",
"The Manitoba Hydro Amendment Act/Loi",
"Hon. Mr.",
"Nov. 21,",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"modifiant la Loi sur l'Hydro-Manitoba",
"WHARTON",
"2019",
"",
"",
"",
"",
"",
"",
"",
],
[
"5",
"The Liquor, Gaming and Cannabis Control",
"Hon. Mr.",
"Nov. 22,",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"Amendment Act/Loi modifiant la Loi sur",
"CULLEN",
"2019",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"la réglementation des alcools, des jeux et",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
["", "du cannabis", "", "", "", "", "", "", "", "", ""],
[
"6",
"The Planning Amendment Act/Loi",
"Hon. Ms.",
"Nov. 21,",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"modifiant la Loi sur l'aménagement du",
"SQUIRES",
"2019",
"",
"",
"",
"",
"",
"",
"",
],
["", "territoire", "", "", "", "", "", "", "", "", ""],
[
"7",
"The Employment Standards Code",
"Hon. Mrs. COX",
"Nov. 25,",
"Dec. 2,",
"Human Resources",
"No",
"Yes",
"Dec. 5, 2019",
"Dec. 5, 2019",
"RA",
],
[
"",
"Amendment Act (Leave for Victims of",
"",
"2019",
"2019",
"Dec. 3, 2019",
"",
"",
"",
"",
"",
],
[
"",
"Interpersonal Violence)/Loi modifiant le",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"Code des normes d'emploi (congé pour les",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
["", "victimes de violence interpersonnelle)", "", "", "", "", "", "", "", "", ""],
[
"8",
"The Pension Benefits Amendment Act/Loi",
"Hon. Mr.",
"Nov. 27,",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"modifiant la Loi sur les prestations de",
"FIELDING",
"2019",
"",
"",
"",
"",
"",
"",
"",
],
["", "pension", "", "", "", "", "", "", "", "", ""],
]

Binary file not shown.

View File

@ -66,6 +66,19 @@ def test_stream_two_tables():
assert df2.equals(tables[1].df)
def test_stream_all_pages_with_known_column_coordinates():
    """Parse all pages in stream mode with one user-supplied column spec.

    A single column-separator string is passed while five tables are
    expected, so the same separators have to serve every detected table.
    Only the first table's contents are compared against the fixture.
    """
    pdf_path = os.path.join(testdir, "canada-manitoba-bills.pdf")
    # One comma-separated list of x-coordinates, shared by all tables.
    separators = ["93,242,305,350,395,468,517,566,629,693"]

    parsed = camelot.read_pdf(
        pdf_path,
        flavor="stream",
        pages="all",
        columns=separators,
        edge_tol=200,
    )
    expected = pd.DataFrame(canada_manitoba_bills_first_table)

    assert len(parsed) == 5
    assert expected.equals(parsed[0].df)
def test_stream_table_regions():
df = pd.DataFrame(data_stream_table_areas)