1.counting DNA nucleotides

rosalind1 题目
#style1
seq=''
ntcount=[]
with open('rosalind1.txt') as f:
    for line in f:
         line = line.rstrip() #去末尾空格
         seq += line.upper()
for nt in ['A','C','G','T']:
    ntcount.append(seq.count(nt))
print('\t'.join(map(str,ntcount)))
#style2
with open('rosalind1.txt') as f:
    for a in f:
        b = list(str(a)) #b是一个个核苷酸的列表
       c={}
       for d in b:
           c[d] = b.count(d)
       print(c)
#style3
from collections import Counter
with open('rosalind_dna.txt') as f:
    list=f.read().rstrip()
    print(Counter(list))
2.将DNA转录成RNA

Rosalind 2 题目
#style1 使用replace函数
with open('rosalind_rna.txt','r') as f:
    list=f.read()
    print(list.replace('T','U'))
#style2 使用re.sub正则表达式
import re
with open('rosalind_rna.txt', 'r') as f:
    list = f.read()
    print(re.sub('T','U',list))
3.获取反向互补链
 Rosalind 3 题目
Rosalind 3 题目
#rosalind 3
#style1
def reverse_complement(seq):
    ntComplement={'A':'T','C':'G','G':'C','T':'A'}
    revSeqList = list(reversed(seq))
    revComSeqList = [ntComplement[k] for k in revSeqList]
    revComSeq = ''.join(revComSeqList)
    return revComSeq
seq=''
with open('rosalind_revc.txt') as f:
    for line in f:
        line = line.rstrip()
        seq += line.rstrip()
        print(reverse_complement(seq))
#style2 使用翻译表
seq=''
with open('rosalind_revc.txt') as f:
    for line in f:
        line = line.rstrip()
        seq += line.rstrip()
transtable = str.maketrans('ATCG','TAGC')
print(seq.translate(transtable)[::-1])
#style3 使用biopython
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
seq=''
with open('rosalind_revc.txt') as f:
    for line in f:
        line = line.rstrip()
        seq += line.rstrip()
my_seq = Seq(seq,IUPAC.unambiguous_dna)
print(my_seq)
print(my_seq.complement()) #互补链
print(my_seq.reverse_complement()) #反向互补链
4.兔子和递归关系

rosalind4 题目
def fibonacciRabbits(n, k):
    F = [1, 1]
    generation = 2
    while generation <= n:
        F.append(F[generation - 1] + F[generation - 2] * k)
        generation += 1
    # 函数返回列表最后一项
    return (F[n - 1])
#调用函数并打印
print (fibonacciRabbits(5, 3))
5.计算GC含量

rosalind 5题目
#style1
from Bio import SeqIO
Target_Seq = SeqIO.parse('rosalind_gc.txt','fasta')
GC_content = 0
ID = ''
for target in Target_Seq:
    if GC_content < (float(target.seq.count('C')+target.seq.count('G'))/len(target.seq))*100:
        GC_content = (float(target.seq.count('C')+target.seq.count('G'))/len(target.seq))*100
        ID = target.description
print(ID,'\n',GC_content)
#style2
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
my_seq = Seq("GGGATCGTAC",IUPAC.unambiguous_dna)
for index,letter in enumerate(my_seq):
    print(index,letter)
print(len(my_seq))
print(my_seq.count('A'))
print(GC(my_seq))
6.计算突变个数

Rosalind6题目
d = open('rosalind_hamm.txt','r')
s = d.readlines()
a = s[0].strip()
b = s[1].strip()
hum = 0
for i in range(len(a)):
    if a[i] != b[I]:
        hum+=1
print(hum)
7.孟德尔第一定律
 Rosalind7题目
Rosalind7题目
from scipy.misc import comb
def fun(k,m,n):
    y = k+m+n
    nn = comb(n,2)/comb(y,2)
    kk = comb(m,2)/comb(y,2)
    mm = comb(n,1)/comb(y,2)
    p = round(1-(nn+kk*1/4+mm*1/2),5)
    return p
#读取数字
d = open('rosalind_iprb.txt','r')
s = d.read()
f = s.split()
k,m,n = int(f[0]),int(f[1]),int(f[2])
print(fun(k,m,n))
8.将RNA翻译成蛋白质

rosalind8 题目
# -*- coding: utf-8 -*-
'''rosaline 将RNA翻译成蛋白质'''
#style1 使用biopython
from Bio import Seq
from Bio.Alphabet import generic_dna,generic_rna
from Bio import SeqIO
from Bio.Data import CodonTable
#载入mRNA序列
RNA_string=open('rosalind_prot.txt','r').read().strip()
my_seq=Seq.Seq(RNA_string,generic_rna)
#载入标准遗传密码表
standard_table=CodonTable.unambiguous_rna_by_name['Standard']
#将mRNA翻译成蛋白质
protein=my_seq.translate(table='Standard')
print(protein)
#style2 自己写出密码表
import re
def mRNA_protein(RNA_string):
    #将mRNA翻译成蛋白质
    start_code = 'AUG'
    end_code = ['UAA', 'UAG', 'UGA']
    protein_table = {'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V', \
                     'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V', \
                     'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V', \
                     'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V', \
                     'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A', \
                     'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A', \
                     'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A', \
                     'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A', \
                     'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D', \
                     'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D', \
                     'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E', \
                     'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E', \
                     'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G', \
                     'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G', \
                     'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G', \
                     'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G'
                     }
    # 找到起始密码子的位置
    start_sit = re.search(start_code, RNA_string)
    protein = ''
    # 按阅读框匹配蛋白质
    for sit in range(start_sit.end(), len(RNA_string), 3):
        protein = protein + protein_table[RNA_string[sit:sit + 3]]
    print(protein)
if __name__ == '__main__':
    RNA_string = open('rosalind_prot.txt','r').read().strip()
    mRNA_protein(RNA_string)
#style3 使用biopython
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
d=open('rosalind_prot.txt','r').read().strip()
messenger_rna=Seq(d,IUPAC.unambiguous_rna)
print(messenger_rna.translate().strip('*'))
9.找到子串位置

rosalind9 题目
# -*- coding: utf-8 -*-
#rosalind9:finding a motif in dna
#style1
d=open('rosalind_subs.txt','r')
s=d.readlines()   #返回列表
a=s[0].strip()
b=s[1].strip()
print(s)
num1,num2=len(a),len(b)
n=num1-num2
I=0
site=[]
for i in range(n):
    c=a[i:i+num2]
    if b==c:
        site.append(i+1)#python的第1位是0,第2位是1,所以要加1
print(" ".join(str(j) for j in site))#输出成sampleoutput里的格式,使用join需要改为字符串格式
#style2
import re
# 如果overlapped   为False, 匹配到位置为2,10
matches = re.finditer('ATAT', 'GATATATGCATATACTT')
# 返回所有匹配项,
for match in matches:  #迭代器可循环
    print(match.start())
- re.match,re.search,re.findall,re.finditer区别
 re.match从首字母开始匹配,没有,就返回none
 re.search返回对象
 re.findall返回数组
 re.finditer返回迭代器
#style3
seq = 'GATATATGCATATACTT'
pattern = 'ATAT'
def find_motif(seq, pattern):
    position = []
    for i in range(len(seq) - len(pattern)):
        if seq[i:i + len(pattern)] == pattern:
            position.append(str(i + 1))
    print('\t'.join(position))
find_motif(seq, pattern)
10.寻找最可能的共同祖先

Rosalind10 题目

rosalind 10 题目
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#rosalind10: consensus and profile
#style1
from collections import Counter
from collections import OrderedDict  #有序字典
# 写出到文件
fh = open('cons.txt', 'wt')
rosalind = OrderedDict()
seqLength = 0
with open('rosalind_cons.txt') as f:
    for line in f:
        line = line.rstrip()
        if line.startswith('>'):
            rosalindName = line
            rosalind[rosalindName] = ''
            # 跳出这个循环,进入下一个循环
            continue
        # 如果不是'>'开头,执行这一句
        rosalind[rosalindName] += line
    # len(ATCCAGCT)
    seqLength = len(rosalind[rosalindName])
A, C, G, T = [], [], [], []
consensusSeq = ''
for i in range(seqLength):
    seq = ''
    # rosalind.keys = Rosalind_1...Rosalind_7
    for k in rosalind.keys():
        # 把 Rosalind_1...Rosalind_7 相同顺序的碱基加起来
        seq += rosalind[k][i]
    A.append(seq.count('A'))
    C.append(seq.count('C'))
    G.append(seq.count('G'))
    T.append(seq.count('T'))
    # 为seq计数
    counts = Counter(seq)
    # 从多到少返回,是个list啊,只返回第一个
    consensusSeq += counts.most_common()[0][0]
    print(counts.most_common())
fh.write(consensusSeq + '\n')
fh.write('\n'.join(['A:\t' + '\t'.join(map(str, A)), 'C:\t' + '\t'.join(map(str, C)),
                    'G:\t' + '\t'.join(map(str, G)), 'T:\t' + '\t'.join(map(str, T))]))
# .join(map(str,A))  把 A=[5, 1, 0, 0, 5, 5, 0, 0] 格式转化成 A:5 1 0 0 5 5 0 0
fh.close()
#style2(未成功)
import numpy as np
import os
from collections import Counter
fhand = open('rosalind_cons.txt')
t = []
for line in fhand:
    if line.startswith('>'):
        continue
    else:
        line = line.rstrip()
        line_list = list(line)  # 变成一个list
    t.append(line_list)  # 再把list 放入list
a = np.array(t)  # 创建一个二维数组
print(a)
L1, L2, L3, L4 = [], [], [], []
comsquence = ''
for i in range(a.shape[1]):  # shape[0],是7 行数,shape[1]是8 项目数
    l = [x[i] for x in a]  # 调出二维数组的每一列
    L1.append(l.count('A'))
    L2.append(l.count('C'))
    L3.append(l.count('T'))
    L4.append(l.count('G'))
    comsquence = comsquence + Counter(l).most_common()[0][0]
print(comsquence)
print('A:', L1, '\n', 'C:', L2, '\n', 'T:', L3, '\n', 'G:', L4)
#style3
def update_cnt(list, char, str):
    res = [a + b for a,b in zip(list, [1 if x == char else 0 for x in str])]
    return(res)
def print_cnt(list, label):
    print(label,end = ": ")
    for x in list:
        print(x, end=" ")
    print()
f = open("rosalind_cons.txt", 'r')
fas = f.readlines()
seq = ''
j = 0
for i in range(len(fas)):
    if fas[i][0] == '>' and j == 0:
        j += 1
        next
    elif fas[i][0] == '>':
        if j == 1:
            A = [0 for x in seq]
            C = A
            G = A
            T = A
        A = update_cnt(A, 'A', seq)
        C = update_cnt(C, 'C', seq)
        G = update_cnt(G, 'G', seq)
        T = update_cnt(T, 'T', seq)
        j += 1
        seq = ''
    else:
        seq += fas[i].strip()
## last seq
A = update_cnt(A, 'A', seq)
C = update_cnt(C, 'C', seq)
G = update_cnt(G, 'G', seq)
T = update_cnt(T, 'T', seq)
consensus = ''
for i in range(len(A)):
    m = max(A[i], C[i], G[i], T[I])
    if A[i] == m:
        consensus += 'A'
    elif C[i] == m:
        consensus += 'C'
    elif G[i] == m:
        consensus += 'G'
    elif T[i] == m:
        consensus += 'T'
print(consensus)
print_cnt(A, 'A')
print_cnt(C, 'C')
print_cnt(G, 'G')
print_cnt(T, 'T')
#style4
import numpy as np #引入numpy
d=open('rosalind_cons.txt','r')
from collections import Counter #问题1里求A/T/C/G出现过多少次时用到的函数
list2=[]
a=1 #起标记作用,看是不是第一次遇到'>'
dline=''
for line in d:
    if line.startswith('>'):
        if a==0: #如果不是第一次遇到'>'
            list2.append(list(dline)) #list是为了创建二维数组
            dline=''
        continue
    else:
        a=0 #如果第一次遇到'>'
        dline+=line.rstrip() #去掉每一行右边的换行
#最后一行
list2.append(list(dline))
bi=np.array(list2)#创建二维数组
LA,LT,LC,LG=[],[],[],[]
consensus =''#共同序列
for i in range(bi.shape[1]):#数组第1行有多少个
    l=bi[:,i] #调出数组每一列
    listl=list(l)
    LA.append(listl.count('A'))#数一个加一个,LA是数值列表,每数一列就加一个在末尾
    LC.append(listl.count('C'))
    LT.append(listl.count('T'))
    LG.append(listl.count('G'))
    most=Counter(listl).most_common() #算出最多的那个
    print(most)
    consensus=consensus+most[0][0]#第一列的(排序)第一的相加
print (consensus)
print("A:"," ".join(str(j) for j in LA))
print("C:"," ".join(str(j) for j in LC))
print("G:"," ".join(str(j) for j in LG))
print("T:"," ".join(str(j) for j in LT))
- 矩阵运算: numpy spicy pandas
- counters包
 
网友评论