python - Save a pandas dataframe as table in Image or pdf document with nice multi index display -
I'm trying to include a data frame with a multi-index in a PDF report, and I would like to have a nice table output.
i have found these 2 solutions:
pandas.df -> html -> pdf
# Approach 1: pandas DataFrame -> HTML -> PDF (requires the third-party
# `pdfkit` package, which wraps wkhtmltopdf).
#
# NOTE(review): this snippet was written against an old pandas release in
# which `groupby(...).describe()` returns one row per (group, statistic) and
# `MultiIndex.labels` exists. Newer pandas versions lay out `describe()`
# differently and renamed `labels` to `codes` -- confirm the installed
# version before running.
import pandas as pd
from IPython.display import HTML
import pdfkit

# df generation
# `path_to_csv` is not defined in this snippet; it must be set beforehand.
df = pd.read_csv(path_to_csv, sep=',')
groupeddf = df.groupby('cluster')
res = groupeddf.describe([0.05, 0.5, 0.95])
res.index.rename(['cluster', 'stats'], inplace=True)
res['cluster'] = res.index.get_level_values('cluster')
res['stats'] = res.index.get_level_values('stats')

# The 'count' rows of the first column give the population of each cluster.
is_count = res.index.get_level_values('stats') == 'count'
populations = res.iloc[is_count, 0].values.tolist()
# Broadcast each cluster's population onto every row of that cluster.
res['population'] = [populations[i] for i in res.index.labels[0].values()]
total_pop = sum(populations)
res['frequency'] = (res['population'] / total_pop).round(3)
res.set_index(['cluster', 'population', 'frequency', 'stats'], inplace=True)

# Keep only the statistics that should appear in the report.
kept_stats = ['5%', 'mean', '50%', '95%']
res1 = res.iloc[res.index.get_level_values('stats').isin(kept_stats)]
res1 = res1.round(2)

# saving df
h = HTML(res1.to_html())
with open('test.html', 'w') as my_file:
    my_file.write(h.data)

options = {
    'orientation': 'landscape'
}
with open('test.html') as f:
    pdfkit.from_file(f, 'out.pdf', options=options)
But it has a dependence on pdfkit,
which makes it difficult for us. That's why I am trying to use pandas.df -> tex -> pdf (as mentioned in "Export pandas DataFrame table as image").
# Approach 2: pandas DataFrame -> LaTeX -> PDF via `pdflatex`.
#
# NOTE(review): this snippet was written against an old pandas release in
# which `groupby(...).describe()` returns one row per (group, statistic) and
# `MultiIndex.labels` exists. Newer pandas versions lay out `describe()`
# differently and renamed `labels` to `codes` -- confirm the installed
# version before running.
import pandas as pd
import os

# df generation
# `path_to_csv` is not defined in this snippet; it must be set beforehand.
df = pd.read_csv(path_to_csv, sep=',')
groupeddf = df.groupby('cluster')
res = groupeddf.describe([0.05, 0.5, 0.95])
res.index.rename(['cluster', 'stats'], inplace=True)
res['cluster'] = res.index.get_level_values('cluster')
res['stats'] = res.index.get_level_values('stats')

# The 'count' rows of the first column give the population of each cluster.
is_count = res.index.get_level_values('stats') == 'count'
populations = res.iloc[is_count, 0].values.tolist()
# Broadcast each cluster's population onto every row of that cluster.
res['population'] = [populations[i] for i in res.index.labels[0].values()]
total_pop = sum(populations)
res['frequency'] = (res['population'] / total_pop).round(3)
res.set_index(['cluster', 'population', 'frequency', 'stats'], inplace=True)

# Keep only the statistics that should appear in the report.
kept_stats = ['5%', 'mean', '50%', '95%']
res1 = res.iloc[res.index.get_level_values('stats').isin(kept_stats)]
res1 = res1.round(2)
# Replace underscores in column names with spaces before rendering to LaTeX.
res1.rename(columns=lambda x: x.replace('_', ' '), inplace=True)

# latex
# Doubled braces are literal braces for str.format(); the single `{}` is the
# slot that receives the rendered table.
template = r'''\documentclass[preview]{{standalone}}
\usepackage{{booktabs}}
\begin{{document}}
{}
\end{{document}}
'''

# Text mode: `to_latex()` returns a str, which cannot be written to a file
# opened in binary mode on Python 3.
with open("outputfile.tex", "w") as afile:
    afile.write(template.format(res1.to_latex()))

# Requires `pdflatex` on PATH and the `standalone` document class installed.
os.system("pdflatex outputfile.tex")
However, I am not familiar with LaTeX, and I get this error:
! LaTeX Error: File `standalone.cls' not found. Type X to quit or <RETURN> to proceed, or enter new name. (Default extension: cls)
Any idea about the error, or about a standard way to do pandas.df -> pdf?
The solution that worked for me: with pandas >= 0.17, I installed pdflatex and copied the LaTeX packages I needed, such as booktabs.sty, geography.sty, and pdflscape.sty.
import pandas as pd
import os
import math
import shutil


def _build_summary_table(df):
    """Return the per-cluster summary table rendered in the report.

    Groups `df` by its 'cluster' column, computes descriptive statistics and
    keeps the 5%, mean, 50% and 95% rows, indexed by
    (cluster, population, frequency, stats).

    NOTE(review): written against an old pandas release in which
    `groupby(...).describe()` returns one row per (group, statistic) and
    `MultiIndex.labels` exists. Newer pandas versions lay out `describe()`
    differently and renamed `labels` to `codes` -- confirm the installed
    version before running.
    """
    res = df.groupby('cluster').describe([0.05, 0.5, 0.95])
    res.index.rename(['cluster', 'stats'], inplace=True)
    res['cluster'] = res.index.get_level_values('cluster')
    res['stats'] = res.index.get_level_values('stats')

    # The 'count' rows of the first column give each cluster's population.
    is_count = res.index.get_level_values('stats') == 'count'
    populations = res.iloc[is_count, 0].values.tolist()
    # Broadcast each cluster's population onto every row of that cluster.
    res['population'] = [populations[i]
                         for i in res.index.labels[0].values()]
    total_pop = sum(populations)
    res['frequency'] = (res['population'] / total_pop).round(3)
    res.set_index(['cluster', 'population', 'frequency', 'stats'],
                  inplace=True)

    kept_stats = ['5%', 'mean', '50%', '95%']
    res1 = res.iloc[res.index.get_level_values('stats').isin(kept_stats)]
    res1 = res1.round(2)
    # Replace underscores in column names with spaces before rendering.
    res1.rename(columns=lambda x: x.replace('_', ' '), inplace=True)
    return res1


def save_summary_table_as_pdf(path_to_csv, path_to_output_folder,
                              rows_per_page=40):
    """Render the clustering summary of a CSV file as a landscape PDF.

    Writes `clustering_summary_table.tex` into `path_to_output_folder`, runs
    `pdflatex` on it from inside that folder, then deletes the intermediate
    .tex, .aux and .log files.

    Args:
        path_to_csv: path of the comma-separated input file; it must contain
            a 'cluster' column.
        path_to_output_folder: folder that receives the generated files.
        rows_per_page: number of table rows emitted per LaTeX table
            (defaults to 40, the value previously hard-coded).
    """
    pwd = os.getcwd()
    df = pd.read_csv(path_to_csv, sep=',')

    # data preparation
    res1 = _build_summary_table(df)

    # latex
    nbpages = int(math.ceil(res1.shape[0] * 1.0 / rows_per_page))

    # The line breaks after each `%` matter: in LaTeX `%` comments out the
    # rest of the line, so the geometry options must stay on separate lines.
    templatetop = r'''\documentclass[a3paper, 5pt]{article}
\usepackage{booktabs}
\usepackage{pdflscape}
\usepackage[a4paper,bindingoffset=0.2in,%
left=0.25in,right=0.25in,top=1in,bottom=1in,%
footskip=.25in]{geometry}
\begin{document}
\begin{landscape}
\pagenumbering{gobble}
\oddsidemargin = 0pt
\hoffset = -0.25in
\topmargin = 1pt
\headheight = 0pt
\headsep = 0pt
'''

    templatebottom = r'''
\end{landscape}
\end{document}
'''

    output_folder_path_abs = path_to_output_folder
    output_tex = os.path.join(output_folder_path_abs,
                              "clustering_summary_table.tex")

    # Text mode: `to_latex()` returns a str, which cannot be written to a
    # file opened in binary mode on Python 3.
    with open(output_tex, "w") as afile:
        afile.write(templatetop + '\n')
        for i in range(0, nbpages):
            page = res1.iloc[(i * rows_per_page):((i + 1) * rows_per_page), :]
            afile.write(page.to_latex() + '\n' + r"\pagenumbering{gobble}")
        afile.write(templatebottom + '\n')

    # pdflatex is run from the output folder; always restore the previous
    # working directory, even if the call raises.
    os.chdir(output_folder_path_abs)
    try:
        os.system('pdflatex clustering_summary_table.tex')
    finally:
        os.chdir(pwd)

    # Remove the intermediate files.
    os.remove(output_tex)
    os.remove(os.path.join(path_to_output_folder,
                           'clustering_summary_table.aux'))
    os.remove(os.path.join(path_to_output_folder,
                           'clustering_summary_table.log'))


if __name__ == "__main__":
    print('begin generate pdf table clustering')
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("path_to_csv")
    parser.add_argument("outputfolder")
    args = vars(parser.parse_args())

    filedir = os.path.abspath(os.path.dirname(__file__))
    output_folder_path_abs = os.path.abspath(args['outputfolder'])
    input_folder_path_abs = os.path.abspath(args['path_to_csv'])

    # LaTeX style files bundled next to this script; they are copied beside
    # the .tex file for the pdflatex run and removed again afterwards.
    # NOTE(review): the template loads the `geometry` package, while the file
    # copied here is named 'geography.sty' -- confirm the intended file name.
    user_packages = ['booktabs.sty', 'geography.sty', 'pdflscape.sty']

    # copy user package latex folder
    # shutil.copy replaces the former shell-string `scp` call, which broke on
    # paths containing spaces or shell metacharacters.
    for package in user_packages:
        shutil.copy(
            os.path.abspath(os.path.join(filedir, 'userpackagelatex', package)),
            output_folder_path_abs)

    save_summary_table_as_pdf(input_folder_path_abs, output_folder_path_abs)

    for package in user_packages:
        os.remove(os.path.join(output_folder_path_abs, package))
Comments
Post a Comment