Read Fasta File

发布日期: 2025-01-09

更新日期: 2025-01-09

文章字数: 252

阅读时长: 1 分

我们在平时的工作中经常会遇到对生物序列进行提取或修改，不管是基因组DNA序列还是编码蛋白序列，它们都是fasta格式。一般都是将fasta序列存储为dict格式再进行操作，下面就介绍下我常用或遇到的一些处理方式：

1.比较原始的逐行处理

#read a fasta file and return a dictionary, the key is entry id and the value is the sequence in upcase
def readFasta(fastaFile):
    import gzip
    f1 = gzip.open(fastaFile, 'rt') if fastaFile.endswith('gz') else open(fastaFile)
    line = f1.readline()
    sequence = ""
    fasta_dict = {}
    header = ""
    while True:
        if line:
            if line[0] == '>':
                if len(sequence) > 0:
                    fasta_dict[header] = sequence
                    sequence = ""
                header = line.strip().replace('>', '').split()[0]
            else:
                sequence += line.strip()
        else:
            break 
        line = f1.readline()
    f1.close()
    if header and sequence:
        fasta_dict[header] = sequence

    return fasta_dict

2.比较优雅的借助Biopython API

def load_fasta(path, subset_chroms=None):
    from Bio import SeqIO
    import gzip
    with gzip.open(path, "rt") if path.endswith(".gz") else open(path) as handle:
        genome = pd.Series(
            {
                rec.id: str(rec.seq)
                for rec in SeqIO.parse(handle, "fasta")
                if subset_chroms is None or rec.id in subset_chroms
            }
        )
    return genome