# -*- coding: utf-8 -*-
#!/usr/bin/env python
import argparse
import os
import subprocess
import sys
def extract_top_genes(deseq2_path, top_path, line_count=21):
    """Write the first two columns of the top-ranked DESeq2 rows to a file.

    Equivalent to ``sort -k 7 -u -r | head -n 21 | cut -f 1,2`` applied to
    *deseq2_path*, with the result stored in *top_path*.  The external
    ``sort`` program is still used so that the ordering (which depends on
    the locale's collation rules) is unchanged, but it is invoked without a
    shell, so paths containing spaces or shell metacharacters are safe.

    Args:
        deseq2_path: Path of the DESeq2 result table.
        top_path: Path of the file that receives the selected rows.
        line_count: Number of sorted lines to keep (default 21).

    Raises:
        IOError/OSError: If *deseq2_path* cannot be read, *top_path* cannot
            be written, or the ``sort`` executable cannot be started.
    """
    # NOTE(review): `sort -k 7 -r` compares the text from field 7 to the end
    # of the line in reverse collation order, not numerically -- confirm that
    # this ranking is what is intended for the padj column.
    with open(deseq2_path, "rb") as source, open(top_path, "wb") as target:
        sorter = subprocess.Popen(
            ["sort", "-k", "7", "-u", "-r"],
            stdin=source,
            stdout=subprocess.PIPE,
        )
        try:
            for index, raw_line in enumerate(sorter.stdout):
                if index >= line_count:
                    break  # same effect as `head -n <line_count>`
                # Same effect as `cut -f 1,2`: keep the first two
                # tab-separated fields (or the whole line if it has no tab).
                fields = raw_line.rstrip(b"\n").split(b"\t")
                target.write(b"\t".join(fields[:2]) + b"\n")
        finally:
            # Closing the pipe lets `sort` stop early; its exit status is
            # deliberately not checked because an early close is expected.
            sorter.stdout.close()
            sorter.wait()


def load_top_genes(top_path):
    """Read the top-gene file into a ``{gene id: gene name}`` dictionary.

    The first line of the file is skipped.  Every remaining line must hold a
    gene id and a gene name separated by a tab; lines with fewer than two
    fields (for example blank lines) are ignored.  When the gene id contains
    ``ENS`` the dictionary key becomes ``<gene id>_<gene name>``.

    Args:
        top_path: Path of the two-column file written by
            ``extract_top_genes``.

    Returns:
        dict mapping the (possibly suffixed) gene id to the gene name.
    """
    top_genes = {}
    with open(top_path, "r") as handle:
        # Skip the first line -- presumably the header row, which is expected
        # to come first after the reverse sort (TODO confirm).
        next(handle, None)
        for line in handle:
            fields = line.strip().split("\t")
            if len(fields) < 2:
                continue  # malformed or blank line: nothing to map
            print(fields)
            gene_id, gene_name = fields[0], fields[1]
            if "ENS" in gene_id:
                gene_id = gene_id + "_" + gene_name
            top_genes[gene_id] = gene_name
    return top_genes


def write_trimmed_matrix(diff_path, out_path, top_genes):
    """Copy the rows of the top genes to *out_path*, replacing ids by names.

    The header's first column is replaced by ``geneid``.  A data row is kept
    only when its first column is a key of *top_genes*; that column is then
    replaced by the mapped gene name.  Every output line ends with a tab
    before the newline, exactly as the previous implementation produced, so
    existing consumers of the file see identical bytes.

    Args:
        diff_path: Path of the tab-separated differential expression matrix.
        out_path: Path of the trimmed matrix to create.
        top_genes: Mapping from matrix row id to gene name.
    """
    with open(diff_path, "r") as source, open(out_path, "w") as target:
        header = source.readline()
        if not header:
            return  # empty matrix: leave an empty output file
        columns = header.strip().split("\t")
        target.write("\t".join(["geneid"] + columns[1:]) + "\t\n")
        for line in source:
            fields = line.strip().split("\t")
            gene_name = top_genes.get(fields[0])
            if gene_name is None:
                continue  # not one of the selected top genes
            target.write("\t".join([gene_name] + fields[1:]) + "\t\n")


def main(argv=None):
    """Parse the command line and build the trimmed top-gene matrix.

    Args:
        argv: Optional list of command-line arguments; ``sys.argv[1:]`` is
            used when it is None.
    """
    parser = argparse.ArgumentParser(description="trim data , change geneid to genename")
    parser.add_argument('--DEseq2_file', help="DEseq2.xls.filter, include sample different geneID and genename ", required=True)
    parser.add_argument('--diffmatrix', help="help", required=True)
    parser.add_argument('--top20con', help="help", required=True)
    parser.add_argument('--trimfile', help="help", required=True)
    options = vars(parser.parse_args(argv))

    deseq2_path = options['DEseq2_file']  # DESeq2 result table
    diff_path = options['diffmatrix']
    top_path = options['top20con']  # receives the top gene ids and names
    out_path = options['trimfile']

    # Select the top-ranked genes (gene id and gene name).
    extract_top_genes(deseq2_path, top_path)
    # Load the selected gene ids and names into a dictionary.
    top_genes = load_top_genes(top_path)
    # Keep only the matrix rows of those genes and swap ids for names.
    write_trimmed_matrix(diff_path, out_path, top_genes)


if __name__ == "__main__":
    main()
# (Residue from the web page this script was copied from: "reader comments" -- not part of the script.)