from IPython.display import Image
import numpy as np
import pandas as pd
from datetime import datetime
import time
import subprocess
import shlex
from io import BytesIO
# Lift the pandas display limits so printed DataFrames are not truncated
# (None = no cap on the number of rows / columns shown).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
from datetime import datetime
from pytz import timezone
import re
def del_colnpoint(strr):
    """Drop the prefix before the first dot of a column header.

    Used on the headers of Hive CLI output, e.g. ``"t.col"`` -> ``"col"``.

    Args:
        strr: A column header string.

    Returns:
        The segment between the first and second dot when ``strr`` contains
        a dot (``"a.b.c"`` -> ``"b"``); otherwise ``strr`` unchanged.
    """
    # Substring test on the string itself; no need to explode it into a list.
    if '.' in strr:
        return strr.split('.')[1]
    return strr
def run_sql(cmd):
    """Run a HiveQL string through the ``hive`` CLI and load the output.

    The statement is prefixed with ``set hive.cli.print.header=true;`` so the
    tab-separated output starts with a header row, which becomes the
    DataFrame's column names (passed through ``del_colnpoint``).

    Args:
        cmd: HiveQL text to execute via ``hive -S -e``.

    Returns:
        A ``pandas.DataFrame`` with the query output (empty when the command
        printed nothing), or ``None`` when ``hive`` exits with a non-zero
        status. On failure the captured stderr is printed.
    """
    cmd = 'set hive.cli.print.header=true;' + cmd
    # NOTE(review): the query is wrapped in double quotes and re-tokenised by
    # shlex, so a double quote inside `cmd` ends the quoted section early.
    # Kept as-is because existing callers may rely on this escaping.
    hivecmd = 'hive -S -e ' + '"{0}"'.format(cmd)
    args = shlex.split(hivecmd)
    # Capture stderr as well; without it `coutput.stderr` is always None and
    # the error report below shows nothing useful.
    coutput = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if coutput.returncode != 0:
        print("select data error!")
        print(coutput.stderr.decode('utf-8', errors='replace'))
        return None

    raw = coutput.stdout
    if not raw.strip():
        # No output at all (e.g. a DDL statement): read_csv would raise
        # EmptyDataError, so hand back an empty frame instead.
        data = pd.DataFrame()
    else:
        try:
            data = pd.read_csv(BytesIO(raw), sep='\t', low_memory=False,
                               on_bad_lines='skip')
        except TypeError:
            # pandas < 1.3 does not know `on_bad_lines`; use the old keyword.
            data = pd.read_csv(BytesIO(raw), sep='\t', low_memory=False,
                               error_bad_lines=False)

    data.columns = [del_colnpoint(strr) for strr in data.columns]
    print('本DataFrame的行数为:'+str(data.shape[0]))
    print('本DataFrame的列数为:'+str(data.shape[1]))
    # This print was placed after `return` and could never run; emit it
    # before returning so the completion time is actually reported.
    print("Run End: ", datetime.now(timezone('Asia/Shanghai')))
    return data
def run_sql_f(f):
    """Run a HiveQL script file through the ``hive`` CLI and load the output.

    The first line of the tab-separated output is used as the header row;
    this function does not enable header printing itself.

    Args:
        f: Path of the script, inserted into ``hive -S -f <f>`` and tokenised
            with ``shlex.split`` (so a path containing spaces must be quoted
            by the caller).

    Returns:
        A ``pandas.DataFrame`` with the script output (empty when the script
        printed nothing), or ``None`` when ``hive`` exits with a non-zero
        status. On failure the captured stderr is printed.
    """
    hivecmd = 'hive -S -f {}'.format(f)
    args = shlex.split(hivecmd)
    # Capture stderr as well; without it `coutput.stderr` is always None and
    # the error report below shows nothing useful.
    coutput = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if coutput.returncode != 0:
        print("select data error!")
        print(coutput.stderr.decode('utf-8', errors='replace'))
        return None

    raw = coutput.stdout
    if not raw.strip():
        # No output at all: read_csv would raise EmptyDataError, so hand
        # back an empty frame instead.
        data = pd.DataFrame()
    else:
        try:
            data = pd.read_csv(BytesIO(raw), sep='\t', low_memory=False,
                               on_bad_lines='skip')
        except TypeError:
            # pandas < 1.3 does not know `on_bad_lines`; use the old keyword.
            data = pd.read_csv(BytesIO(raw), sep='\t', low_memory=False,
                               error_bad_lines=False)

    data.columns = [del_colnpoint(strr) for strr in data.columns]
    return data
# 网友评论 ("Reader comments") -- leftover web-page footer text, not part of the script.