diff --git a/camelot/core.py b/camelot/core.py index 62c4ba4..1c49c66 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -329,6 +329,7 @@ class Table(object): kw = { 'encoding': 'utf-8', 'index': False, + 'header': False, 'quoting': 1 } kw.update(kwargs) diff --git a/docs/benchmark/agstat/agstat-data-camelot-page-1-table-1.csv b/docs/benchmark/agstat/agstat-data-camelot-page-1-table-1.csv new file mode 100755 index 0000000..2e16a02 --- /dev/null +++ b/docs/benchmark/agstat/agstat-data-camelot-page-1-table-1.csv @@ -0,0 +1,33 @@ +"Sl.No.","District","(In lakhs)for 2012-13Projected Population","Adult (In lakhs)Equivalent to 88%","requirement(In Lakh tonnes)Total Consumption(@ 400gms/adult/day)","(In Lakh tonnes)(Including seeds, feeds & wastage)Total Requirement","Production (Rice)(In Lakh tonnes)","","","Surplus/Defi cit(In Lakh tonnes)","" +"","","","","","","Kharif","Rabi","Total","Rice","Paddy" +"1","Balasore","23.65","20.81","3.04","3.47","2.78","0.86","3.64","0.17","0.25" +"2","Bhadrak","15.34","13.50","1.97","2.25","3.50","0.05","3.55","1.30","1.94" +"3","Balangir","17.01","14.97","2.19","2.50","6.23","0.10","6.33","3.83","5.72" +"4","Subarnapur","6.70","5.90","0.86","0.98","4.48","1.13","5.61","4.63","6.91" +"5","Cuttack","26.63","23.43","3.42","3.91","3.75","0.06","3.81","-0.10","-0.15" +"6","Jagatsingpur","11.49","10.11","1.48","1.69","2.10","0.02","2.12","0.43","0.64" +"7","Jajpur","18.59","16.36","2.39","2.73","2.13","0.04","2.17","-0.56","-0.84" +"8","Kendrapara","14.62","12.87","1.88","2.15","2.60","0.07","2.67","0.52","0.78" +"9","Dhenkanal","12.13","10.67","1.56","1.78","2.26","0.02","2.28","0.50","0.75" +"10","Angul","12.93","11.38","1.66","1.90","1.73","0.02","1.75","-0.15","-0.22" +"11","Ganjam","35.77","31.48","4.60","5.26","4.57","0.00","4.57","-0.69","-1.03" +"12","Gajapati","5.85","5.15","0.75","0.86","0.68","0.01","0.69","-0.17","-0.25" +"13","Kalahandi","16.12","14.19","2.07","2.37","5.42","1.13","6.55","4.18","6.24" +"14","Nuapada","6.18","5.44","0.79","0.90","1.98","0.08","2.06","1.16","1.73" +"15","Keonjhar","18.42","16.21","2.37","2.71","2.76","0.08","2.84","0.13","0.19" +"16","Koraput","14.09","12.40","1.81","2.07","2.08","0.34","2.42","0.35","0.52" +"17","Malkangiri","6.31","5.55","0.81","0.93","1.78","0.04","1.82","0.89","1.33" +"18","Nabarangpur","12.50","11.00","1.61","1.84","3.26","0.02","3.28","1.44","2.15" +"19","Rayagada","9.83","8.65","1.26","1.44","1.15","0.03","1.18","-0.26","-0.39" +"20","Mayurbhanj","25.61","22.54","3.29","3.76","4.90","0.06","4.96","1.20","1.79" +"21","Kandhamal","7.45","6.56","0.96","1.10","0.70","0.01","0.71","-0.39","-0.58" +"22","Boudh","4.51","3.97","0.58","0.66","1.73","0.03","1.76","1.10","1.64" +"23","Puri","17.29","15.22","2.22","2.54","2.45","0.99","3.44","0.90","1.34" +"24","Khordha","23.08","20.31","2.97","3.39","2.02","0.03","2.05","-1.34","-2.00" +"25","Nayagarh","9.78","8.61","1.26","1.44","2.10","0.00","2.10","0.66","0.99" +"26","Sambalpur","10.62","9.35","1.37","1.57","3.45","0.71","4.16","2.59","3.87" +"27","Bargarh","15.00","13.20","1.93","2.21","6.87","2.65","9.52","7.31","10.91" +"28","Deogarh","3.18","2.80","0.41","0.47","1.12","0.07","1.19","0.72","1.07" +"29","Jharsuguda","5.91","5.20","0.76","0.87","0.99","0.01","1.00","0.13","0.19" +"30","Sundargarh","21.21","18.66","2.72","3.11","4.72","0.02","4.74","1.63","2.43" +"ODISHA","","427.80","376.49","54.99","62.86","86.29","8.68","94.97","32.11","47.92" diff --git a/docs/benchmark/agstat/agstat-data-tabula.csv b/docs/benchmark/agstat/agstat-data-tabula.csv new file mode 100755 index 0000000..3696f06 --- /dev/null +++ b/docs/benchmark/agstat/agstat-data-tabula.csv @@ -0,0 +1,32 @@ +"Sl. No.",District,,,,,"Production (Rice) (In Lakh tonnes)","Surplus/Defi cit (In Lakh tonnes)",,, +"",,,,,,,,,, +1,Balasore,23.65,20.81,3.04,3.47,2.78,0.86,3.64,0.17,0.25 +2,Bhadrak,15.34,13.50,1.97,2.25,3.50,0.05,3.55,1.30,1.94 +3,Balangir,17.01,14.97,2.19,2.50,6.23,0.10,6.33,3.83,5.72 +4,Subarnapur,6.70,5.90,0.86,0.98,4.48,1.13,5.61,4.63,6.91 +5,Cuttack,26.63,23.43,3.42,3.91,3.75,0.06,3.81,-0.10,-0.15 +6,Jagatsingpur,11.49,10.11,1.48,1.69,2.10,0.02,2.12,0.43,0.64 +7,Jajpur,18.59,16.36,2.39,2.73,2.13,0.04,2.17,-0.56,-0.84 +8,Kendrapara,14.62,12.87,1.88,2.15,2.60,0.07,2.67,0.52,0.78 +9,Dhenkanal,12.13,10.67,1.56,1.78,2.26,0.02,2.28,0.50,0.75 +10,Angul,12.93,11.38,1.66,1.90,1.73,0.02,1.75,-0.15,-0.22 +11,Ganjam,35.77,31.48,4.60,5.26,4.57,0.00,4.57,-0.69,-1.03 +12,Gajapati,5.85,5.15,0.75,0.86,0.68,0.01,0.69,-0.17,-0.25 +13,Kalahandi,16.12,14.19,2.07,2.37,5.42,1.13,6.55,4.18,6.24 +14,Nuapada,6.18,5.44,0.79,0.90,1.98,0.08,2.06,1.16,1.73 +15,Keonjhar,18.42,16.21,2.37,2.71,2.76,0.08,2.84,0.13,0.19 +16,Koraput,14.09,12.40,1.81,2.07,2.08,0.34,2.42,0.35,0.52 +17,Malkangiri,6.31,5.55,0.81,0.93,1.78,0.04,1.82,0.89,1.33 +18,Nabarangpur,12.50,11.00,1.61,1.84,3.26,0.02,3.28,1.44,2.15 +19,Rayagada,9.83,8.65,1.26,1.44,1.15,0.03,1.18,-0.26,-0.39 +20,Mayurbhanj,25.61,22.54,3.29,3.76,4.90,0.06,4.96,1.20,1.79 +21,Kandhamal,7.45,6.56,0.96,1.10,0.70,0.01,0.71,-0.39,-0.58 +22,Boudh,4.51,3.97,0.58,0.66,1.73,0.03,1.76,1.10,1.64 +23,Puri,17.29,15.22,2.22,2.54,2.45,0.99,3.44,0.90,1.34 +24,Khordha,23.08,20.31,2.97,3.39,2.02,0.03,2.05,-1.34,-2.00 +25,Nayagarh,9.78,8.61,1.26,1.44,2.10,0.00,2.10,0.66,0.99 +26,Sambalpur,10.62,9.35,1.37,1.57,3.45,0.71,4.16,2.59,3.87 +27,Bargarh,15.00,13.20,1.93,2.21,6.87,2.65,9.52,7.31,10.91 +28,Deogarh,3.18,2.80,0.41,0.47,1.12,0.07,1.19,0.72,1.07 +29,Jharsuguda,5.91,5.20,0.76,0.87,0.99,0.01,1.00,0.13,0.19 +30,Sundargarh,21.21,18.66,2.72,3.11,4.72,0.02,4.74,1.63,2.43 diff --git a/docs/benchmark/agstat/agstat-table-detection-camelot.png b/docs/benchmark/agstat/agstat-table-detection-camelot.png new file mode 100755 index 0000000..38bf911 Binary files /dev/null and b/docs/benchmark/agstat/agstat-table-detection-camelot.png differ diff --git a/docs/benchmark/agstat/agstat-table-detection-tabula.png b/docs/benchmark/agstat/agstat-table-detection-tabula.png new file mode 100755 index 0000000..815e81b Binary files /dev/null and b/docs/benchmark/agstat/agstat-table-detection-tabula.png differ diff --git a/docs/benchmark/agstat/agstat.pdf b/docs/benchmark/agstat/agstat.pdf new file mode 100755 index 0000000..cf1c25a Binary files /dev/null and b/docs/benchmark/agstat/agstat.pdf differ diff --git a/docs/index.rst b/docs/index.rst index 29a0bb0..7cf02c6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -47,84 +47,7 @@ Usage >>> tables[0].df .. csv-table:: - :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","","" - - "","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle" - "2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%" - "2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%" - "4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%" - "2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%" - "4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%" - -:: - - $ camelot --help - Usage: camelot [OPTIONS] FILEPATH - - Options: - -p, --pages TEXT Comma-separated page numbers to parse. - Example: 1,3,4 or 1,4-end - -o, --output TEXT Output filepath. - -f, --format [csv|json|excel|html] - Output file format. - -z, --zip Whether or not to create a ZIP archive. - -m, --mesh Whether or not to use Lattice method of - parsing. Stream is used by default. - -T, --table_area TEXT Table areas (x1,y1,x2,y2) to process. - x1, y1 - -> left-top and x2, y2 -> right-bottom - -split, --split_text Whether or not to split text if it spans - across multiple cells. - -flag, --flag_size (inactive) Whether or not to flag text which - has uncommon size. (Useful to detect - super/subscripts) - -M, --margins ... - char_margin, line_margin, word_margin for - PDFMiner. - -C, --columns TEXT x-coordinates of column separators. - -r, --row_close_tol INTEGER Rows will be formed by combining text - vertically within this tolerance. - -c, --col_close_tol INTEGER Columns will be formed by combining text - horizontally within this tolerance. - -back, --process_background (with --mesh) Whether or not to process - lines that are in background. - -scale, --line_size_scaling INTEGER - (with --mesh) Factor by which the page - dimensions will be divided to get smallest - length of detected lines. - -copy, --copy_text [h|v] (with --mesh) Specify direction in which - text will be copied over in a spanning cell. - -shift, --shift_text [l|r|t|b] (with --mesh) Specify direction in which - text in a spanning cell should flow. - -l, --line_close_tol INTEGER (with --mesh) Tolerance parameter used to - merge close vertical lines and close - horizontal lines. - -j, --joint_close_tol INTEGER (with --mesh) Tolerance parameter used to - decide whether the detected lines and points - lie close to each other. - -block, --threshold_blocksize INTEGER - (with --mesh) For adaptive thresholding, - size of a pixel neighborhood that is used to - calculate a threshold value for the pixel: - 3, 5, 7, and so on. - -const, --threshold_constant INTEGER - (with --mesh) For adaptive thresholding, - constant subtracted from the mean or - weighted mean. - Normally, it is positive but - may be zero or negative as well. - -I, --iterations INTEGER (with --mesh) Number of times for - erosion/dilation is applied. - -G, --geometry_type [text|table|contour|joint|line] - Plot geometry found on pdf page for - debugging. - text: Plot text objects. (Useful to get - table_area and columns coordinates) - table: Plot parsed table. - contour (with --mesh): Plot detected rectangles. - joint (with --mesh): Plot detected line intersections. - line (with --mesh): Plot detected lines. - --help Show this message and exit. + :file: _static/csv/foo.csv The User Guide -------------- @@ -135,6 +58,7 @@ The User Guide user/intro user/install user/quickstart + user/cli The API Documentation / Guide ----------------------------- diff --git a/docs/user/cli.rst b/docs/user/cli.rst new file mode 100644 index 0000000..bfacca0 --- /dev/null +++ b/docs/user/cli.rst @@ -0,0 +1,74 @@ +.. _cli: + +Command-line interface +====================== + +:: + + $ camelot --help + Usage: camelot [OPTIONS] FILEPATH + + Options: + -p, --pages TEXT Comma-separated page numbers to parse. + Example: 1,3,4 or 1,4-end + -o, --output TEXT Output filepath. + -f, --format [csv|json|excel|html] + Output file format. + -z, --zip Whether or not to create a ZIP archive. + -m, --mesh Whether or not to use Lattice method of + parsing. Stream is used by default. + -T, --table_area TEXT Table areas (x1,y1,x2,y2) to process. + x1, y1 + -> left-top and x2, y2 -> right-bottom + -split, --split_text Whether or not to split text if it spans + across multiple cells. + -flag, --flag_size (inactive) Whether or not to flag text which + has uncommon size. (Useful to detect + super/subscripts) + -M, --margins ... + char_margin, line_margin, word_margin for + PDFMiner. + -C, --columns TEXT x-coordinates of column separators. + -r, --row_close_tol INTEGER Rows will be formed by combining text + vertically within this tolerance. + -c, --col_close_tol INTEGER Columns will be formed by combining text + horizontally within this tolerance. + -back, --process_background (with --mesh) Whether or not to process + lines that are in background. + -scale, --line_size_scaling INTEGER + (with --mesh) Factor by which the page + dimensions will be divided to get smallest + length of detected lines. + -copy, --copy_text [h|v] (with --mesh) Specify direction in which + text will be copied over in a spanning cell. + -shift, --shift_text [l|r|t|b] (with --mesh) Specify direction in which + text in a spanning cell should flow. + -l, --line_close_tol INTEGER (with --mesh) Tolerance parameter used to + merge close vertical lines and close + horizontal lines. + -j, --joint_close_tol INTEGER (with --mesh) Tolerance parameter used to + decide whether the detected lines and points + lie close to each other. + -block, --threshold_blocksize INTEGER + (with --mesh) For adaptive thresholding, + size of a pixel neighborhood that is used to + calculate a threshold value for the pixel: + 3, 5, 7, and so on. + -const, --threshold_constant INTEGER + (with --mesh) For adaptive thresholding, + constant subtracted from the mean or + weighted mean. + Normally, it is positive but + may be zero or negative as well. + -I, --iterations INTEGER (with --mesh) Number of times for + erosion/dilation is applied. + -G, --geometry_type [text|table|contour|joint|line] + Plot geometry found on pdf page for + debugging. + text: Plot text objects. (Useful to get + table_area and columns coordinates) + table: Plot parsed table. + contour (with --mesh): Plot detected rectangles. + joint (with --mesh): Plot detected line intersections. + line (with --mesh): Plot detected lines. + --help Show this message and exit. \ No newline at end of file diff --git a/docs/user/lattice.rst b/docs/user/lattice.rst index e8370e0..38e782b 100644 --- a/docs/user/lattice.rst +++ b/docs/user/lattice.rst @@ -17,7 +17,7 @@ Line segments are detected in the first step. .. .. _this: insert link for us-030.pdf -.. image:: ../_static/user/line.png +.. image:: ../_static/png/line.png :height: 674 :width: 1366 :scale: 50% @@ -25,7 +25,7 @@ Line segments are detected in the first step. The detected line segments are overlapped by `and` ing their pixel intensities to find intersections. -.. image:: ../_static/user/intersection.png +.. image:: ../_static/png/intersection.png :height: 674 :width: 1366 :scale: 50% @@ -33,7 +33,7 @@ The detected line segments are overlapped by `and` ing their pixel intensities t The detected line segments are overlapped again, this time by `or` ing their pixel intensities and outermost contours are computed to identify potential table boundaries. This helps Lattice in detecting more than one table on a single page. -.. image:: ../_static/user/contour.png +.. image:: ../_static/png/contour.png :height: 674 :width: 1366 :scale: 50% @@ -41,7 +41,7 @@ The detected line segments are overlapped again, this time by `or` ing their pix Since dimensions of a pdf and its image vary; table contours, intersections and segments are scaled and translated to the pdf's coordinate space. A representation of the table is then created using these scaled coordinates. -.. image:: ../_static/user/table.png +.. image:: ../_static/png/table.png :height: 674 :width: 1366 :scale: 50% @@ -49,7 +49,7 @@ Since dimensions of a pdf and its image vary; table contours, intersections and Spanning cells are then detected using the line segments and intersections. -.. image:: ../_static/user/table_span.png +.. image:: ../_static/png/table_span.png :height: 674 :width: 1366 :scale: 50% @@ -85,7 +85,7 @@ Let's consider this pdf file. .. .. _this: insert link for row_span_1.pdf -.. image:: ../_static/user/scale_1.png +.. image:: ../_static/png/scale_1.png :height: 674 :width: 1366 :scale: 50% @@ -93,7 +93,7 @@ Let's consider this pdf file. Clearly, it couldn't detected those small lines in the lower left part. Therefore, we need to increase the value of scale. Let's try a value of 40. -.. image:: ../_static/user/scale_2.png +.. image:: ../_static/png/scale_2.png :height: 674 :width: 1366 :scale: 50% diff --git a/docs/user/stream.rst b/docs/user/stream.rst index 68d7d15..ab4a231 100644 --- a/docs/user/stream.rst +++ b/docs/user/stream.rst @@ -68,7 +68,7 @@ We can also specify the column x-coordinates. We need to call Stream with debug= >>> manager = Pdf(Stream(debug=True), 'mexican_towns.pdf'), debug=True >>> manager.debug_plot() -.. image:: ../_static/user/columns.png +.. image:: ../_static/png/columns.png :height: 674 :width: 1366 :scale: 50%