Merge pull request #227 from socialcopsdev/fix-050-bugs

Fix v0.5.0 bugs
pull/2/head
Vinayak Mehta 2018-12-13 16:29:47 +05:30 committed by GitHub
commit f8eaec4ce4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 120 additions and 99 deletions

View File

@ -271,10 +271,11 @@ class Lattice(BaseParser):
tk, self.vertical_segments, self.horizontal_segments)
t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text)
t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text)
self.t_bbox = t_bbox
for direction in t_bbox:
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
t_bbox['horizontal'].sort(key=lambda x: (-x.y0, x.x0))
t_bbox['vertical'].sort(key=lambda x: (x.x0, -x.y0))
self.t_bbox = t_bbox
cols, rows = zip(*self.table_bbox[tk])
cols, rows = list(cols), list(rows)
@ -308,7 +309,9 @@ class Lattice(BaseParser):
table = table.set_span()
pos_errors = []
for direction in self.t_bbox:
# TODO: have a single list in place of two directional ones?
# sorted on x-coordinate based on reading order i.e. LTR or RTL
for direction in ['vertical', 'horizontal']:
for t in self.t_bbox[direction]:
indices, error = get_table_index(
table, t, direction, split_text=self.split_text,

View File

@ -293,10 +293,11 @@ class Stream(BaseParser):
t_bbox = {}
t_bbox['horizontal'] = text_in_bbox(tk, self.horizontal_text)
t_bbox['vertical'] = text_in_bbox(tk, self.vertical_text)
self.t_bbox = t_bbox
for direction in self.t_bbox:
self.t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
t_bbox['horizontal'].sort(key=lambda x: (-x.y0, x.x0))
t_bbox['vertical'].sort(key=lambda x: (x.x0, -x.y0))
self.t_bbox = t_bbox
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_close_tol=self.row_close_tol)
@ -350,8 +351,11 @@ class Stream(BaseParser):
def _generate_table(self, table_idx, cols, rows, **kwargs):
table = Table(cols, rows)
table = table.set_all_edges()
pos_errors = []
for direction in self.t_bbox:
# TODO: have a single list in place of two directional ones?
# sorted on x-coordinate based on reading order i.e. LTR or RTL
for direction in ['vertical', 'horizontal']:
for t in self.t_bbox[direction]:
indices, error = get_table_index(
table, t, direction, split_text=self.split_text,

View File

@ -344,9 +344,9 @@ def flag_font_size(textline, direction):
fchars = [t[0] for t in chars]
if ''.join(fchars).strip():
flist.append(''.join(fchars))
fstring = ''.join(flist).strip('\n')
fstring = ''.join(flist)
else:
fstring = ''.join([t.get_text() for t in textline]).strip('\n')
fstring = ''.join([t.get_text() for t in textline])
return fstring
@ -419,7 +419,7 @@ def split_textline(table, textline, direction, flag_size=False):
grouped_chars.append((key[0], key[1], flag_font_size([t[2] for t in chars], direction)))
else:
gchars = [t[2].get_text() for t in chars]
grouped_chars.append((key[0], key[1], ''.join(gchars).strip('\n')))
grouped_chars.append((key[0], key[1], ''.join(gchars)))
return grouped_chars
@ -500,7 +500,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
if flag_size:
return [(r_idx, c_idx, flag_font_size(t._objs, direction))], error
else:
return [(r_idx, c_idx, t.get_text().strip('\n'))], error
return [(r_idx, c_idx, t.get_text())], error
def compute_accuracy(error_weights):

View File

@ -82,42 +82,40 @@ data_stream_two_tables_1 = [
["", "", "Total", "", "", "Male", "", "", "Female", ""],
["Offense charged", "", "Under 18", "18 years", "", "Under 18", "18 years", "", "Under 18", "18 years"],
["", "Total", "years", "and over", "Total", "years", "and over", "Total", "years", "and over"],
["Total . . . . . . . . . . . . . . . . . . . . . . . . .", "11,062 .6", "1,540 .0", "9,522 .6", "8,263 .3", "1,071 .6", "7,191 .7", "2,799 .2", "468 .3", "2,330 .9"],
["Violent crime . . . . . . . . . . . . . . . . . .", "467 .9", "69 .1", "398 .8", "380 .2", "56 .5", "323 .7", "87 .7", "12 .6", "75 .2"],
["Total .\n .\n . . . . . .\n . .\n . .\n . .\n . .\n . .\n . .\n . .\n . . .", "11,062 .6", "1,540 .0", "9,522 .6", "8,263 .3", "1,071 .6", "7,191 .7", "2,799 .2", "468 .3", "2,330 .9"],
["Violent crime . . . . . . . .\n . .\n . .\n . .\n . .\n . .", "467 .9", "69 .1", "398 .8", "380 .2", "56 .5", "323 .7", "87 .7", "12 .6", "75 .2"],
["Murder and nonnegligent", "", "", "", "", "", "", "", "", ""],
["manslaughter . . . . . . . .. .. .. .. ..", "10.0", "0.9", "9.1", "9.0", "0.9", "8.1", "1.1", "", "1.0"],
["Forcible rape . . . . . . . .. .. .. .. .. .", "17.5", "2.6", "14.9", "17.2", "2.5", "14.7", "", "", ""],
["Robbery . . . .. .. . .. . ... . ... . ...", "102.1", "25.5", "76.6", "90.0", "22.9", "67.1", "12.1", "2.5", "9.5"],
["Aggravated assault . . . . . . . .. .. ..", "338.4", "40.1", "298.3", "264.0", "30.2", "233.8", "74.4", "9.9", "64.5"],
["Property crime . . . . . . . . . . . . . . . . .", "1,396 .4", "338 .7", "1,057 .7", "875 .9", "210 .8", "665 .1", "608 .2", "127 .9", "392 .6"],
["Burglary . .. . . . . .. ... .... .... ..", "240.9", "60.3", "180.6", "205.0", "53.4", "151.7", "35.9", "6.9", "29.0"],
["Larceny-theft . . . . . . . .. .. .. .. .. .", "1,080.1", "258.1", "822.0", "608.8", "140.5", "468.3", "471.3", "117.6", "353.6"],
["Motor vehicle theft . . . . .. .. . .... .", "65.6", "16.0", "49.6", "53.9", "13.3", "40.7", "11.7", "2.7", "8.9"],
["Arson .. . . . .. . ... .... .... .... .", "9.8", "4.3", "5.5", "8.1", "3.7", "4.4", "1.7", "0.6", "1.1"],
["Other assaults .. . . . . .. . ... . ... ..", "1,061.3", "175.3", "886.1", "785.4", "115.4", "670.0", "276.0", "59.9", "216.1"],
["Forgery and counterfeiting .. . . . . . ..", "68.9", "1.7", "67.2", "42.9", "1.2", "41.7", "26.0", "0.5", "25.5"],
["Fraud .... .. . . .. ... .... .... ....", "173.7", "5.1", "168.5", "98.4", "3.3", "95.0", "75.3", "1.8", "73.5"],
["Embezzlement . . .. . . . .. . ... . ....", "14.6", "", "14.1", "7.2", "", "6.9", "7.4", "", "7.2"],
["Stolen property 1 . . . . . . .. . .. .. ...", "84.3", "15.1", "69.2", "66.7", "12.2", "54.5", "17.6", "2.8", "14.7"],
["Vandalism . . . . . . . .. .. .. .. .. ....", "217.4", "72.7", "144.7", "178.1", "62.8", "115.3", "39.3", "9.9", "29.4"],
["manslaughter . . . . . . . .\n. .\n. .\n. .\n. .\n.", "10.0", "0.9", "9.1", "9.0", "0.9", "8.1", "1.1", "", "1.0"],
["Forcible rape . . . . . . . .\n. .\n. .\n. .\n. .\n. .", "17.5", "2.6", "14.9", "17.2", "2.5", "14.7", "", "", ""],
["Robbery . . . .\n. .\n. . .\n. . .\n.\n. . .\n.\n. . .\n.\n.", "102.1", "25.5", "76.6", "90.0", "22.9", "67.1", "12.1", "2.5", "9.5"],
["Aggravated assault . . . . . . . .\n. .\n. .\n.", "338.4", "40.1", "298.3", "264.0", "30.2", "233.8", "74.4", "9.9", "64.5"],
["Property crime . . . .\n . .\n . . .\n . . .\n .\n . . . .", "1,396 .4", "338 .7", "1,057 .7", "875 .9", "210 .8", "665 .1", "608 .2", "127 .9", "392 .6"],
["Burglary . .\n. . . . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.", "240.9", "60.3", "180.6", "205.0", "53.4", "151.7", "35.9", "6.9", "29.0"],
["Larceny-theft . . . . . . . .\n. .\n. .\n. .\n. .\n. .", "1,080.1", "258.1", "822.0", "608.8", "140.5", "468.3", "471.3", "117.6", "353.6"],
["Motor vehicle theft . . . . .\n. .\n. . .\n.\n.\n. .", "65.6", "16.0", "49.6", "53.9", "13.3", "40.7", "11.7", "2.7", "8.9"],
["Arson .\n. . . . .\n. . .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .", "9.8", "4.3", "5.5", "8.1", "3.7", "4.4", "1.7", "0.6", "1.1"],
["Other assaults .\n. . . . . .\n. . .\n.\n. . .\n.\n. .\n.", "1,061.3", "175.3", "886.1", "785.4", "115.4", "670.0", "276.0", "59.9", "216.1"],
["Forgery and counterfeiting .\n. . . . . . .\n.", "68.9", "1.7", "67.2", "42.9", "1.2", "41.7", "26.0", "0.5", "25.5"],
["Fraud .\n.\n.\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n.", "173.7", "5.1", "168.5", "98.4", "3.3", "95.0", "75.3", "1.8", "73.5"],
["Embezzlement . . .\n. . . . .\n. . .\n.\n. . .\n.\n.\n.", "14.6", "", "14.1", "7.2", "", "6.9", "7.4", "", "7.2"],
["Stolen property 1 . . . . . . .\n. . .\n. .\n. .\n.\n.", "84.3", "15.1", "69.2", "66.7", "12.2", "54.5", "17.6", "2.8", "14.7"],
["Vandalism . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n.\n.\n.", "217.4", "72.7", "144.7", "178.1", "62.8", "115.3", "39.3", "9.9", "29.4"],
["Weapons; carrying, possessing, etc. .", "132.9", "27.1", "105.8", "122.1", "24.3", "97.8", "10.8", "2.8", "8.0"],
["Prostitution and commercialized vice",
"56.9", "1.1", "55.8", "17.3", "", "17.1", "39.6", "0.8", "38.7"],
["Sex offenses 2 . . . . .. . . . .. .. .. . ..", "61.5", "10.7", "50.7", "56.1", "9.6", "46.5", "5.4", "1.1", "4.3"],
["Drug abuse violations . . . . . . . .. ...", "1,333.0", "136.6", "1,196.4", "1,084.3", "115.2", "969.1", "248.7", "21.4", "227.3"],
["Gambling .. . . . . .. ... . ... . ... ...", "8.2", "1.4", "6.8", "7.2", "1.4", "5.9", "0.9", "", "0.9"],
["Prostitution and commercialized vice", "56.9", "1.1", "55.8", "17.3", "", "17.1", "39.6", "0.8", "38.7"],
["Sex offenses 2 . . . . .\n. . . . .\n. .\n. .\n. . .\n.", "61.5", "10.7", "50.7", "56.1", "9.6", "46.5", "5.4", "1.1", "4.3"],
["Drug abuse violations . . . . . . . .\n. .\n.\n.", "1,333.0", "136.6", "1,196.4", "1,084.3", "115.2", "969.1", "248.7", "21.4", "227.3"],
["Gambling .\n. . . . . .\n. .\n.\n. . .\n.\n. . .\n.\n. .\n.\n.", "8.2", "1.4", "6.8", "7.2", "1.4", "5.9", "0.9", "", "0.9"],
["Offenses against the family and", "", "", "", "", "", "", "", "", ""],
["children . . . .. . . .. .. .. .. .. .. . ..", "92.4", "3.7", "88.7", "68.9", "2.4", "66.6", "23.4", "1.3", "22.1"],
["Driving under the influence . . . . . .. .", "1,158.5", "109.2", "1,147.5", "895.8", "8.2", "887.6", "262.7", "2.7", "260.0"],
["Liquor laws . . . . . . . .. .. .. .. .. .. .", "48.2", "90.2", "368.0", "326.8", "55.4", "271.4",
"131.4", "34.7", "96.6"],
["Drunkenness . . .. . . . .. . ... . ... ..", "488.1", "11.4", "476.8", "406.8", "8.5", "398.3", "81.3", "2.9", "78.4"],
["Disorderly conduct . .. . . . . . .. .. .. .", "529.5", "136.1", "393.3", "387.1", "90.8", "296.2", "142.4", "45.3", "97.1"],
["Vagrancy . . . .. . . . ... .... .... ...", "26.6", "2.2", "24.4", "20.9", "1.6", "19.3", "5.7", "0.6", "5.1"],
["All other offenses (except traffic) . . ..", "306.1", "263.4", "2,800.8", "2,337.1", "194.2", "2,142.9", "727.0", "69.2", "657.9"],
["Suspicion . . . .. . . .. .. .. .. .. .. . ..", "1.6", "", "1.4", "1.2", "", "1.0", "", "", ""],
["Curfew and loitering law violations ..", "91.0", "91.0", "(X)", "63.1", "63.1", "(X)", "28.0", "28.0", "(X)"],
["Runaways . . . . . . . .. .. .. .. .. ....", "75.8", "75.8", "(X)", "34.0", "34.0", "(X)", "41.8", "41.8", "(X)"],
["children . . . .\n. . . .\n. .\n. .\n. .\n. .\n. .\n. . .\n.", "92.4", "3.7", "88.7", "68.9", "2.4", "66.6", "23.4", "1.3", "22.1"],
["Driving under the influence . . . . . .\n. .", "1,158.5", "109.2", "1,147.5", "895.8", "8.2", "887.6", "262.7", "2.7", "260.0"],
["Liquor laws . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .", "48.2", "90.2", "368.0", "326.8", "55.4", "271.4", "131.4", "34.7", "96.6"],
["Drunkenness . . .\n. . . . .\n. . .\n.\n. . .\n.\n. .\n.", "488.1", "11.4", "476.8", "406.8", "8.5", "398.3", "81.3", "2.9", "78.4"],
["Disorderly conduct . .\n. . . . . . .\n. .\n. .\n. .", "529.5", "136.1", "393.3", "387.1", "90.8", "296.2", "142.4", "45.3", "97.1"],
["Vagrancy . . . .\n. . . . .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.", "26.6", "2.2", "24.4", "20.9", "1.6", "19.3", "5.7", "0.6", "5.1"],
["All other offenses (except traffic) . . .\n.", "306.1", "263.4", "2,800.8", "2,337.1", "194.2", "2,142.9", "727.0", "69.2", "657.9"],
["Suspicion . . . .\n. . . .\n. .\n. .\n. .\n. .\n. .\n. . .\n.", "1.6", "", "1.4", "1.2", "", "1.0", "", "", ""],
["Curfew and loitering law violations .\n.", "91.0", "91.0", "(X)", "63.1", "63.1", "(X)", "28.0", "28.0", "(X)"],
["Runaways . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n.\n.\n.", "75.8", "75.8", "(X)", "34.0", "34.0", "(X)", "41.8", "41.8", "(X)"],
["", " Represents zero. X Not applicable. 1 Buying, receiving, possessing stolen property. 2 Except forcible rape and prostitution.", "", "", "", "", "", "", "", ""],
["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.", "", "", "", "", "", "", "", ""]
]
@ -128,41 +126,40 @@ data_stream_two_tables_2 = [
["[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies", "", "", "", "", ""],
["with a total population of 239,839,971 as estimated by the FBI. See headnote, Table 324]", "", "", "", "", ""],
["", "", "", "", "American", ""],
["Offense charged", "", "", "",
"Indian/Alaskan", "Asian Pacific"],
["Offense charged", "", "", "", "Indian/Alaskan", "Asian Pacific"],
["", "Total", "White", "Black", "Native", "Islander"],
["Total . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .", "10,690,561", "7,389,208", "3,027,153", "150,544", "123,656"],
["Violent crime . . . . . . . . . . . . . . . . . . . . . . . . . . . .", "456,965", "268,346", "177,766", "5,608", "5,245"],
["Murder and nonnegligent manslaughter . .. ... .", "9,739", "4,741", "4,801", "100", "97"],
["Forcible rape . . . . . . . .. .. .. .. .... .. ...... .", "16,362", "10,644", "5,319", "169", "230"],
["Robbery . . . . .. . . . ... . ... . .... .... .... . . .", "100,496", "43,039", "55,742", "726", "989"],
["Aggravated assault . . . . . . . .. .. ...... .. ....", "330,368", "209,922", "111,904", "4,613", "3,929"],
["Property crime . . . . . . . . . . . . . . . . . . . . . . . . . . .", "1,364,409", "922,139", "406,382", "17,599", "18,289"],
["Burglary . . .. . . . .. . .... .... .... .... ... . . .", "234,551", "155,994", "74,419", "2,021", "2,117"],
["Larceny-theft . . . . . . . .. .. .. .. .... .. ...... .", "1,056,473", "719,983", "306,625", "14,646", "15,219"],
["Motor vehicle theft . . . . . .. ... . ... ..... ... ..", "63,919", "39,077", "23,184", "817", "841"],
["Arson .. . . .. .. .. ... .... .... .... .... . . . . .", "9,466", "7,085", "2,154", "115", "112"],
["Other assaults .. . . . . . ... . ... . ... ..... ... ..", "1,032,502", "672,865", "332,435", "15,127", "12,075"],
["Forgery and counterfeiting .. . . . . . ... ..... .. ..", "67,054", "44,730", "21,251", "345", "728"],
["Fraud ... . . . . .. .. .. .. .. .. .. .. .. .... . . . . . .", "161,233", "108,032", "50,367", "1,315", "1,519"],
["Embezzlement . . . .. . . . ... . ... . .... ... .....", "13,960", "9,208", "4,429", "75", "248"],
["Stolen property; buying, receiving, possessing .. .", "82,714", "51,953", "29,357", "662", "742"],
["Vandalism . . . . . . . .. .. .. .. .. .. .... .. ..... .", "212,173", "157,723", "48,746", "3,352", "2,352"],
["Weapons—carrying, possessing, etc. .. .. ... .. .", "130,503", "74,942", "53,441", "951", "1,169"],
["Prostitution and commercialized vice . ... .. .. ..", "56,560", "31,699", "23,021", "427", "1,413"],
["Sex offenses 1 . . . . . . . .. .. .. .. .... .. ...... .", "60,175", "44,240", "14,347", "715", "873"],
["Drug abuse violations . . . . . . . .. . ..... .. .....", "1,301,629", "845,974", "437,623", "8,588", "9,444"],
["Gambling . . . . .. . . . ... . ... . .. ... . ...... .. .", "8,046", "2,290", "5,518", "27", "211"],
["Offenses against the family and children ... .. .. .", "87,232", "58,068", "26,850", "1,690", "624"],
["Driving under the influence . . . . . . .. ... ...... .", "1,105,401", "954,444", "121,594", "14,903", "14,460"],
["Liquor laws . . . . . . . .. .. .. .. .. . ..... .. .....", "444,087", "373,189", "50,431", "14,876", "5,591"],
["Drunkenness . .. . . . . . ... . ... . ..... . .......", "469,958", "387,542", "71,020", "8,552", "2,844"],
["Disorderly conduct . . .. . . . . .. .. . ..... .. .....", "515,689", "326,563", "176,169", "8,783", "4,174"],
["Vagrancy . . .. .. . . .. ... .... .... .... .... . . .", "26,347", "14,581", "11,031", "543", "192"],
["All other offenses (except traffic) . .. .. .. ..... ..", "2,929,217", "1,937,221", "911,670", "43,880", "36,446"],
["Suspicion . . .. . . . .. .. .. .. .. .. .. ...... .. . . .", "1,513", "677", "828", "1", "7"],
["Curfew and loitering law violations . .. ... .. ....", "89,578", "54,439", "33,207", "872", "1,060"],
["Runaways . . . . . . . .. .. .. .. .. .. .... .. ..... .", "73,616", "48,343", "19,670", "1,653", "3,950"],
["Total .\n .\n .\n .\n . .\n . . .\n . . .\n .\n . . .\n .\n . . .\n . .\n .\n . . .\n .\n .\n .\n . .\n . .\n . .", "10,690,561", "7,389,208", "3,027,153", "150,544", "123,656"],
["Violent crime . . . . . . . .\n . .\n . .\n . .\n . .\n .\n .\n . .\n . .\n .\n .\n .\n .\n . .", "456,965", "268,346", "177,766", "5,608", "5,245"],
["Murder and nonnegligent manslaughter . .\n. .\n.\n. .", "9,739", "4,741", "4,801", "100", "97"],
["Forcible rape . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .", "16,362", "10,644", "5,319", "169", "230"],
["Robbery . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . .", "100,496", "43,039", "55,742", "726", "989"],
["Aggravated assault . . . . . . . .\n. .\n. .\n.\n.\n.\n.\n. .\n. .\n.\n.\n.", "330,368", "209,922", "111,904", "4,613", "3,929"],
["Property crime . . . . .\n . . . . .\n .\n . . .\n .\n . .\n .\n .\n .\n . .\n .\n . .\n .\n .", "1,364,409", "922,139", "406,382", "17,599", "18,289"],
["Burglary . . .\n. . . . .\n. . .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n. . . .", "234,551", "155,994", "74,419", "2,021", "2,117"],
["Larceny-theft . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .", "1,056,473", "719,983", "306,625", "14,646", "15,219"],
["Motor vehicle theft . . . . . .\n. .\n.\n. . .\n.\n. .\n.\n.\n.\n. .\n.\n. .\n.", "63,919", "39,077", "23,184", "817", "841"],
["Arson .\n. . . .\n. .\n. .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . . . .", "9,466", "7,085", "2,154", "115", "112"],
["Other assaults .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n. .\n.\n.\n.\n. .\n.\n. .\n.", "1,032,502", "672,865", "332,435", "15,127", "12,075"],
["Forgery and counterfeiting .\n. . . . . . .\n.\n. .\n.\n.\n.\n. .\n. .\n.", "67,054", "44,730", "21,251", "345", "728"],
["Fraud .\n.\n. . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. . . . . . .", "161,233", "108,032", "50,367", "1,315", "1,519"],
["Embezzlement . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. .\n.\n. .\n.\n.\n.\n.", "13,960", "9,208", "4,429", "75", "248"],
["Stolen property; buying, receiving, possessing .\n. .", "82,714", "51,953", "29,357", "662", "742"],
["Vandalism . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n. .", "212,173", "157,723", "48,746", "3,352", "2,352"],
["Weapons—carrying, possessing, etc. .\n. .\n. .\n.\n. .\n. .", "130,503", "74,942", "53,441", "951", "1,169"],
["Prostitution and commercialized vice . .\n.\n. .\n. .\n. .\n.", "56,560", "31,699", "23,021", "427", "1,413"],
["Sex offenses 1 . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .", "60,175", "44,240", "14,347", "715", "873"],
["Drug abuse violations . . . . . . . .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", "1,301,629", "845,974", "437,623", "8,588", "9,444"],
["Gambling . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n. .\n.\n. . .\n.\n.\n.\n.\n. .\n. .", "8,046", "2,290", "5,518", "27", "211"],
["Offenses against the family and children .\n.\n. .\n. .\n. .", "87,232", "58,068", "26,850", "1,690", "624"],
["Driving under the influence . . . . . . .\n. .\n.\n. .\n.\n.\n.\n.\n. .", "1,105,401", "954,444", "121,594", "14,903", "14,460"],
["Liquor laws . . . . . . . .\n. .\n. .\n. .\n. .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", "444,087", "373,189", "50,431", "14,876", "5,591"],
["Drunkenness . .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n.\n.\n. . .\n.\n.\n.\n.\n.\n.", "469,958", "387,542", "71,020", "8,552", "2,844"],
["Disorderly conduct . . .\n. . . . . .\n. .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", "515,689", "326,563", "176,169", "8,783", "4,174"],
["Vagrancy . . .\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . .", "26,347", "14,581", "11,031", "543", "192"],
["All other offenses (except traffic) . .\n. .\n. .\n. .\n.\n.\n.\n. .\n.", "2,929,217", "1,937,221", "911,670", "43,880", "36,446"],
["Suspicion . . .\n. . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n.\n.\n. .\n. . . .", "1,513", "677", "828", "1", "7"],
["Curfew and loitering law violations . .\n. .\n.\n. .\n. .\n.\n.\n.", "89,578", "54,439", "33,207", "872", "1,060"],
["Runaways . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n. .", "73,616", "48,343", "19,670", "1,653", "3,950"],
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, “Crime in the United States, Arrests,” September 2010,", "", "", "", ""]
]
@ -170,7 +167,7 @@ data_stream_two_tables_2 = [
data_stream_table_areas = [
["", "One Withholding"],
["Payroll Period", "Allowance"],
["Weekly", "$71.15"],
["Weekly", "$\n71.15"],
["Biweekly", "142.31"],
["Semimonthly", "154.17"],
["Monthly", "308.33"],
@ -316,8 +313,8 @@ data_stream_flag_size = [
]
data_lattice = [
["Cycle Name", "KI (1/km)", "Distance (mi)", "Percent Fuel Savings", "", "", ""],
["", "", "", "Improved Speed", "Decreased Accel", "Eliminate Stops", "Decreased Idle"],
["Cycle \nName", "KI \n(1/km)", "Distance \n(mi)", "Percent Fuel Savings", "", "", ""],
["", "", "", "Improved \nSpeed", "Decreased \nAccel", "Eliminate \nStops", "Decreased \nIdle"],
["2012_2", "3.30", "1.3", "5.9%", "9.5%", "29.2%", "17.4%"],
["2145_1", "0.68", "11.2", "2.4%", "0.1%", "9.5%", "2.7%"],
["4234_1", "0.59", "58.7", "8.5%", "1.3%", "8.5%", "3.3%"],
@ -326,7 +323,7 @@ data_lattice = [
]
data_lattice_table_rotated = [
["State", "Nutritional Assessment (No. of individuals)", "", "", "", "IYCF Practices (No. of mothers: 2011-12)", "Blood Pressure (No. of adults: 2011-12)", "", "Fasting Blood Sugar (No. of adults:2011-12)", ""],
["State", "Nutritional Assessment \n(No. of individuals)", "", "", "", "IYCF Practices \n(No. of mothers: \n2011-12)", "Blood Pressure \n(No. of adults: \n2011-12)", "", "Fasting Blood Sugar \n(No. of adults:\n2011-12)", ""],
["", "1975-79", "1988-90", "1996-97", "2011-12", "", "Men", "Women", "Men", "Women"],
["Kerala", "5738", "6633", "8864", "8297", "245", "2161", "3195", "1645", "2391"],
["Tamil Nadu", "7387", "10217", "5813", "7851", "413", "2134", "2858", "1119", "1739"],
@ -343,7 +340,7 @@ data_lattice_table_rotated = [
data_lattice_two_tables_1 = [
["State", "n", "Literacy Status", "", "", "", "", ""],
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College"],
["", "", "Illiterate", "Read & \nWrite", "1-4 std.", "5-8 std.", "9-12 std.", "College"],
["Kerala", "2400", "7.2", "0.5", "25.3", "20.1", "41.5", "5.5"],
["Tamil Nadu", "2400", "21.4", "2.3", "8.8", "35.5", "25.8", "6.2"],
["Karnataka", "2399", "37.4", "2.8", "12.5", "18.3", "23.1", "5.8"],
@ -359,7 +356,7 @@ data_lattice_two_tables_1 = [
data_lattice_two_tables_2 = [
["State", "n", "Literacy Status", "", "", "", "", ""],
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College"],
["", "", "Illiterate", "Read & \nWrite", "1-4 std.", "5-8 std.", "9-12 std.", "College"],
["Kerala", "2400", "8.8", "0.3", "20.1", "17.0", "45.6", "8.2"],
["Tamil Nadu", "2400", "29.9", "1.5", "8.5", "33.1", "22.3", "4.8"],
["Karnataka", "2399", "47.9", "2.5", "10.2", "18.8", "18.4", "2.3"],
@ -376,7 +373,7 @@ data_lattice_two_tables_2 = [
data_lattice_table_areas = [
["", "", "", "", "", "", "", "", ""],
["State", "n", "Literacy Status", "", "", "", "", "", ""],
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College", ""],
["", "", "Illiterate", "Read & \nWrite", "1-4 std.", "5-8 std.", "9-12 std.", "College", ""],
["Kerala", "2400", "7.2", "0.5", "25.3", "20.1", "41.5", "5.5", ""],
["Tamil Nadu", "2400", "21.4", "2.3", "8.8", "35.5", "25.8", "6.2", ""],
["Karnataka", "2399", "37.4", "2.8", "12.5", "18.3", "23.1", "5.8", ""],
@ -392,13 +389,13 @@ data_lattice_table_areas = [
]
data_lattice_process_background = [
["State", "Date", "Halt stations", "Halt days", "Persons directly reached(in lakh)", "Persons trained", "Persons counseled" ,"Persons testedfor HIV"],
["State", "Date", "Halt \nstations", "Halt \ndays", "Persons \ndirectly \nreached\n(in lakh)", "Persons \ntrained", "Persons \ncounseled", "Persons \ntested\nfor HIV"],
["Delhi", "1.12.2009", "8", "17", "1.29", "3,665", "2,409", "1,000"],
["Rajasthan", "2.12.2009 to 19.12.2009", "", "", "", "", "", ""],
["Gujarat", "20.12.2009 to 3.1.2010", "6", "13", "6.03", "3,810", "2,317", "1,453"],
["Maharashtra", "4.01.2010 to 1.2.2010", "13", "26", "1.27", "5,680", "9,027", "4,153"],
["Karnataka", "2.2.2010 to 22.2.2010", "11", "19", "1.80", "5,741", "3,658", "3,183"],
["Kerala", "23.2.2010 to 11.3.2010", "9", "17", "1.42", "3,559", "2,173", "855"],
["Rajasthan", "2.12.2009 to \n19.12.2009", "", "", "", "", "", ""],
["Gujarat", "20.12.2009 to \n3.1.2010", "6", "13", "6.03", "3,810", "2,317", "1,453"],
["Maharashtra", "4.01.2010 to \n1.2.2010", "13", "26", "1.27", "5,680", "9,027", "4,153"],
["Karnataka", "2.2.2010 to \n22.2.2010", "11", "19", "1.80", "5,741", "3,658", "3,183"],
["Kerala", "23.2.2010 to \n11.3.2010", "9", "17", "1.42", "3,559", "2,173", "855"],
["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"]
]
@ -442,11 +439,11 @@ data_lattice_copy_text = [
["PCCM", "San Francisco", "Family Mosaic", "25"],
["PCCM", "Total PHP Enrollment", "", "853"],
["All Models Total Enrollments", "", "", "10,132,875"],
["Source: Data Warehouse 12/14/15", "", "", ""]
["Source: Data Warehouse \n12/14/15", "", "", ""]
]
data_lattice_shift_text_left_top = [
["Investigations", "No. ofHHs", "Age/Sex/Physiological Group", "Preva-lence", "C.I*", "RelativePrecision", "Sample sizeper State"],
["Investigations", "No. of\nHHs", "Age/Sex/\nPhysiological Group", "Preva-\nlence", "C.I*", "Relative\nPrecision", "Sample size\nper State"],
["Anthropometry", "2400", "All the available individuals", "", "", "", ""],
["Clinical Examination", "", "", "", "", "", ""],
["History of morbidity", "", "", "", "", "", ""],
@ -455,12 +452,12 @@ data_lattice_shift_text_left_top = [
["", "", "Women (≥ 18 yrs)", "", "", "", "1728"],
["Fasting blood glucose", "2400", "Men (≥ 18 yrs)", "5%", "95%", "20%", "1825"],
["", "", "Women (≥ 18 yrs)", "", "", "", "1825"],
["Knowledge &Practices on HTN &DM", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"],
["Knowledge &\nPractices on HTN &\nDM", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"],
["", "2400", "Women (≥ 18 yrs)", "-", "-", "-", "1728"]
]
data_lattice_shift_text_disable = [
["Investigations", "No. ofHHs", "Age/Sex/Physiological Group", "Preva-lence", "C.I*", "RelativePrecision", "Sample sizeper State"],
["Investigations", "No. of\nHHs", "Age/Sex/\nPhysiological Group", "Preva-\nlence", "C.I*", "Relative\nPrecision", "Sample size\nper State"],
["Anthropometry", "", "", "", "", "", ""],
["Clinical Examination", "2400", "", "All the available individuals", "", "", ""],
["History of morbidity", "", "", "", "", "", ""],
@ -469,12 +466,12 @@ data_lattice_shift_text_disable = [
["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%", "1728"],
["", "", "Men (≥ 18 yrs)", "", "", "", "1825"],
["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", "1825"],
["Knowledge &Practices on HTN &", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"],
["Knowledge &\nPractices on HTN &", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"],
["DM", "2400", "Women (≥ 18 yrs)", "-", "-", "-", "1728"]
]
data_lattice_shift_text_right_bottom = [
["Investigations", "No. ofHHs", "Age/Sex/Physiological Group", "Preva-lence", "C.I*", "RelativePrecision", "Sample sizeper State"],
["Investigations", "No. of\nHHs", "Age/Sex/\nPhysiological Group", "Preva-\nlence", "C.I*", "Relative\nPrecision", "Sample size\nper State"],
["Anthropometry", "", "", "", "", "", ""],
["Clinical Examination", "", "", "", "", "", ""],
["History of morbidity", "2400", "", "", "", "", "All the available individuals"],
@ -484,5 +481,13 @@ data_lattice_shift_text_right_bottom = [
["", "", "Men (≥ 18 yrs)", "", "", "", "1825"],
["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", "1825"],
["", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"],
["Knowledge &Practices on HTN &DM", "2400", "Women (≥ 18 yrs)", "-", "-", "-", "1728"]
["Knowledge &\nPractices on HTN &\nDM", "2400", "Women (≥ 18 yrs)", "-", "-", "-", "1728"]
]
# Expected cell values for test_arabic, which builds a DataFrame from this
# list and compares it with the first table read from "tabula/arabic.pdf".
# Each inner list is one table row with two cells; the "\xa0" escapes are
# non-breaking spaces and the "\n" escapes are newlines inside a cell.
data_arabic = [
['ً\n\xa0\nﺎﺒﺣﺮﻣ', 'ﻥﺎﻄﻠﺳ\xa0ﻲﻤﺳﺍ'],
['ﻝﺎﻤﺸﻟﺍ\xa0ﺎﻨﻴﻟﻭﺭﺎﻛ\xa0ﺔﻳﻻﻭ\xa0ﻦﻣ\xa0ﺎﻧﺍ', '؟ﺖﻧﺍ\xa0ﻦﻳﺍ\xa0ﻦﻣ'],
['1234', 'ﻂﻄﻗ\xa047\xa0ﻱﺪﻨﻋ'],
['؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ', 'ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ'],
['Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic', '']
]

View File

@ -62,6 +62,7 @@ def test_stream_two_tables():
filename = os.path.join(testdir, "tabula/12s0324.pdf")
tables = camelot.read_pdf(filename, flavor='stream')
assert len(tables) == 2
assert df1.equals(tables[0].df)
assert df2.equals(tables[1].df)
@ -179,3 +180,11 @@ def test_repr():
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.42 x2=164.64 y2=233.89>"
def test_arabic():
    """Check that the first table read from tabula/arabic.pdf equals data_arabic."""
    # Locate the sample document under the shared test directory.
    pdf_path = os.path.join(testdir, "tabula/arabic.pdf")
    # Reference frame built from the expected-cell fixture.
    expected = pd.DataFrame(data_arabic)
    parsed = camelot.read_pdf(pdf_path)
    first_table = parsed[0]
    assert expected.equals(first_table.df)