python优化代码(文本查找)效率?

 天王2502871933 发布于 2022-11-04 17:23
#!/usr/bin/env python
# -*- coding: utf-8 -*-
infile2 = open('genemark.gff3', 'r')
infile1 = set(line1.strip() for line1 in open('1.txt', 'r'))
for line in infile2:
    line = line.strip().split()
    if line[2] == 'gene':
        chr, start, end = line[0], int(line[3]), int(line[4])
        for line1 in infile1:
            line1 = line1.split()
            chr1, start1, end1 = line1[1], int(line1[2]), int(line1[3])
            if chr1 == chr:
                if start1 < start < end1:
                    print line1[0], line[-1]
                if start1 < end < end1:
                    print line1[0], line[-1]
                if start1 > start and end > end1:
                    print line1[0], line[-1]

genemark.gff3 格式类似下边:

chr1D    GeneMark.hmm    gene    2705930    2711118    .    +    .    ID=1903228_g;Name=1903228_g
chr1D    GeneMark.hmm    mRNA    2705930    2711118    .    +    .    ID=1903228_t;Name=1903228_t;Parent=1903228_g

1.txt:

UN011157 chr1D 2705329 2706342 98.4 95.0 972 30 21 0
UN003843 chr1D 2705681 2721144 61.4 97.4 633 12 5 0

附上原始文件的百度云链接,希望感兴趣的参考
点击下载 密码 enu8

综合楼下各位朋友的答案,现推荐两种
第一种 根据 @ferstar @用筹兮用严 的答案,即并行版

#!/usr/bin/env python
# encoding: utf-8

from collections import defaultdict
from multiprocessing import Pool, cpu_count
from functools import partial


def find_sth(f2, f1=None):
    start, end = int(f2[3]), int(f2[4])
    for uno1, start1, end1 in f1[f2[0]]:
        if (start1 <= start and start <= end1) or (start1 <= end and end <= end1) or (start1 >= start and end >= end1):
            with open("out.txt", "a") as fh:
                fh.write(uno1 + "\t" + f2[-1] + "\n")
                #print(uno1, f2[-1])


def main():
    with open('1.txt', 'r') as f1:
        infile1 = defaultdict(set)
        for uno1, chr1, start1, end1, *others in map(str.split, f1):
            infile1[chr1].add((uno1, int(start1), int(end1)))
    with open('genemark.gff3', 'r') as f2:
        infile2 = [x for x in map(str.split, f2) if x[2] == 'gene']
    pool = Pool(cpu_count())
    pool.map(partial(find_sth, f1=infile1), infile2)
    pool.close()
    pool.join()


if __name__ == "__main__":
    main()

第二种 @citaret 他的版本(单核版),对单核来说,不逊于上述代码。但是两者结果稍有不同,并行版结果更全(这里少了73条,出在判断条件的边界问题,由于对intervaltree熟悉,怎么改还不知道),现在边界问题已修改,两种代码结果完全一样,perfect!
如下

from collections import defaultdict
from intervaltree import Interval, IntervalTree

with open('1.txt') as f:
    d1 = defaultdict(list)
    xs = map(lambda x: x.strip().split(), f)
    for x in xs:
        y = (x[0], int(x[2]), int(x[3]))
        d1[x[1]].append(y)

    for k, v in d1.items():
        d1[k] = IntervalTree(Interval(s, e, u) for u, s, e in v)

with open('genemark.gff3') as f:
    for line in f:
        line = line.strip().split()
        if line[2] == 'gene':
            chr, start, end = line[0], int(line[3]), int(line[4])
            for start1, end1, un1 in d1[chr][start-1:end+1]:
                print(un1, line[-1])
6 个回答
  • 发现一个很有意思的事情, 大家回答很积极, 但是实际结果呢, 我刚好无聊小测试了一下, 过程如下:

    介于题主提供的示例文本才两行, 所以我把1.txtgenemark.gff3分别加倍到4000

    (qiime) [ngs@cluster ~]$ wc -l 1.txt 
    4000 1.txt
    (qiime) [ngs@cluster ~]$ wc -l genemark.gff3 
    4000 genemark.gff3

    按照回复楼层数排序, 如题主的代码是hi.py,然后一楼答主的代码是hi1.py,依次类推

    • 先看题主的

    (qiime) [ngs@cluster ~]$ time python hi.py > hi.txt
    real    0m0.049s
    user    0m0.042s
    sys    0m0.007s
    (qiime) [ngs@cluster ~]$ wc -l hi.txt 
    6000 hi.txt

    感觉是有重复

    • 一楼答主的

    time python hi1.py > hi1.txt
    
    real    0m21.727s
    user    0m21.171s
    sys    0m0.547s
    (qiime) [ngs@cluster ~]$ wc -l hi1.txt 
    8000000 hi1.txt

    重复到姥姥家了

    • 二楼答主的

    (qiime) [ngs@cluster ~]$ time python hi2.py > hi2.txt
    
    real    0m16.326s
    user    0m14.550s
    sys    0m1.044s
    (qiime) [ngs@cluster ~]$ wc -l hi2.txt 
    12000000 hi2.txt
    • 三楼答主的

    (qiime) [ngs@cluster ~]$ time python hi3.py > hi3.txt
    
    real    0m27.079s
    user    0m26.281s
    sys    0m0.786s
    (qiime) [ngs@cluster ~]$ wc -l hi3.txt
    12000000 hi3.txt

    三楼答主的结果跟二楼一样, 但是慢了10秒多

    • 四楼答主的(冤枉四楼同学了,这是py3代码)

    (py3) [ngs@cluster ~]$ time python hi4.py > hi4.txt
    
    real    0m0.074s
    user    0m0.064s
    sys    0m0.010s
    (py3) [ngs@cluster ~]$ wc -l hi4.txt 
    4000 hi4.txt

    果然是有交流才有进步, 目前这个结果才是正确的

    总结

    实际好像是题主的代码结果会有重复, 四楼答主的结果才是正确的

    我的方案--把四楼的代码小改变成并行的

    我写的有问题, @用筹兮用严 更新了正确的并行代码, 我的代码就不改了, 方便后面看到的同学参考

    直接放码(python3)

    from collections import defaultdict
    import multiprocessing
    
    
    def find_sth(x):
        with open('1.txt', 'r') as f1:
            infile1 = defaultdict(set)
            for uno1, chr1, start1, end1, *others in map(str.split, f1):
                infile1[chr1].add((uno1, int(start1), int(end1)))
            chr, start, end, info = x[0], int(x[3]), int(x[4]), x[-1]
            for uno1, start1, end1 in infile1[chr]:
                if start1 < start < end1 or start1 < end < end or (start1 > start and end > end1):
                    print(uno1, info)
    
    
    def main():
        with open('genemark.gff3', 'r') as fh:
            lst = [x for x in map(str.split, fh) if x[2] == 'gene']
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        pool.map(find_sth, lst)
        pool.close()
        pool.join()
    
    if __name__ == "__main__":
        main()

    然后看看运行效率

    (py3) [ngs@cluster ~]$ time python hi_new.py > hi_new.txt
    real    0m3.033s
    user    0m31.952s
    sys    0m0.219s
    
    (py3) [ngs@cluster ~]$ wc -l hi_new.txt 
    4000 hi_new.txt

    时间上貌似慢了很多(4000行数据才几百KB), 题主可以试着用你的真实数据测试下, 处理数据越大, 并行处理的效率优势越明显

    PS: 我估计题主实际处理的数据大小得有MB甚至是GB级别, 这种级别并行处理才是王道

    源数据及结果度娘地址 链接:http://pan.baidu.com/s/1hrSZQuS 密码:u93n

    2022-11-12 01:56 回答
  • from collections import defaultdict
    
    with open('1.txt', 'r') as f1, open('genemark.gff3', 'r') as f2:
        infile1 = defaultdict(set)
        for uno1, chr1, start1, end1, *others in map(str.split, f1):
            infile1[chr1].add((uno1, int(start1), int(end1)))
        infile2 = filter(lambda x: x[2] == 'gene', map(str.split, f2))
        for chr, start, end, info in map(lambda x: (x[0], int(x[3]), int(x[4]), x[-1]), infile2):
            for uno1, start1, end1 in infile1[chr]:
                if start1 < start < end1 or start1 < end < end or (start1 > start and end > end1):
                    print(uno1, info)

    6楼 @ferstar 提出的并行化才是正确的方向,不过代码有点问题……
    改了下:

    #!/usr/bin/env python
    # encoding: utf-8
    
    from collections import defaultdict
    from multiprocessing import Pool, cpu_count
    from functools import partial
    
    
    def find_sth(line, f1=None):
        line = line.split()
        if line[2] != 'gene':
            return
        start, end = int(line[3]), int(line[4])
        for uno1, start1, end1 in f1[line[0]]:
            if start1 < start < end1 or start1 < end < end or (start1 > start and end > end1):
                print(uno1, line[-1])
    
    
    def main():
        pool = Pool(cpu_count())
        with open('1.txt', 'r') as f1, open('genemark.gff3', 'r') as f2:
            infile1 = defaultdict(set)
            for uno1, chr1, start1, end1, *others in map(str.split, f1):
                infile1[chr1].add((uno1, int(start1), int(end1)))
            pool.map(partial(find_sth, f1=infile1), f2)
            #fw.writelines(filter(lambda x: x is not None, map(lambda x: x.get(), [pool.apply_async(func, (line,)) for line in f2])))
        pool.close()
        pool.join()
    
    
    if __name__ == "__main__":
        main()
    2022-11-12 01:56 回答
  • 文件发开后不用关闭吗

    2022-11-12 01:56 回答
  • 用空间换时间,分别构建 genemark.gff3 的列表和 1.txt 的字典,具体实现:

    from collections import defaultdict
    
    with open('genemark.gff3') as f:
        ls = f.readlines()
        xs = map(lambda x: x.strip().split(), ls)
        t2 = (x for x in xs if x[2] == 'gene')
    
    with open('1.txt') as f:
        d1 = defaultdict(list)
        ls = f.readlines()
        xs = map(lambda x: x.strip().split(), ls)
        for x in xs:
            d1[x[1]].append(x)
    
    for line in t2:
        chr, start, end = line[0], int(line[3]), int(line[4])
        if chr in d1:
            for line1 in d1[chr]:
                chr1, start1, end1 = line1[1], int(line1[2]), int(line1[3])
                if start1 < start < end1:
                    print line1[0], line[-1]
                if start1 < end < end1:
                    print line1[0], line[-1]
                if start1 > start and end > end1:
                    print line1[0], line[-1]

    修改版 v2,消除内层循环里的 int(),简化输出。

    from collections import defaultdict
    
    with open('1.txt') as f:
        d1 = defaultdict(list)
        xs = map(lambda x: x.strip().split(), f)
        for x in xs:
            y = (x[0], int(x[2]), int(x[3]))
            d1[x[1]].append(y)
    
    with open('genemark.gff3') as f:
        for line in f:
            line = line.strip().split()
            chr, start, end = line[0], int(line[3]), int(line[4])
            for un1, start1, end1 in d1[chr]:
                if start < end1 and end > start1:
                    print un1, line[-1]

    v3: 仔细研究题意,发现主循环是求出集合中和一个片段相交的所有片段,我们可以先看看这个集合:

    chr1D 7359
    chr2D 9219
    chr2B 9486
    chr2A 8986
    chr6B 7178
    chr6A 6446
    chr6D 6093
    chr4A 7543
    chr4B 7086
    chr4D 6316
    ...
    

    每个集合的片段数量在6000-10000,遍历的话效率低,因此考虑使用 intervaltree,可以快速得出和一个片段相交的所有片段,具体实现为:

    from collections import defaultdict
    from intervaltree import Interval, IntervalTree
    
    with open('1.txt') as f:
        d1 = defaultdict(list)
        xs = map(lambda x: x.strip().split(), f)
        for x in xs:
            y = (x[0], int(x[2]), int(x[3]))
            d1[x[1]].append(y)
    
        for k, v in d1.items():
            d1[k] = IntervalTree(Interval(s, e, u) for u, s, e in v)
    
    with open('genemark.gff3') as f:
        for line in f:
            line = line.strip().split()
            if line[2] != 'gene': continue
            chr, start, end = line[0], int(line[3]), int(line[4])
            for start1, end1, un1 in d1[chr][start:end]:
                print un1, line[-1]

    时间测试结果为:构建 intervaltree 花时 10秒,但是求交集的过程速度有100倍左右的提升。
    intervaltee 参考 https://pypi.python.org/pypi/...

    2022-11-12 01:56 回答
  • 这里给出两个建议:

    1. 代码嵌套太深,在函数中可以通过尽早的 return 来减少嵌套层级,同样的在循环中,可以通过使用 continue 来达到减少嵌套层级的目的。

    2. 关于性能方面

    for line1 in infile1:
        line1 = line1.split()
    

    每次循环都要对对 file1中的行进行 split 操作是非常不明智的

    下面是我修改的代码

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    infile2 = open('genemark.gff3', 'r')
    infile1 = {}
    for line1 in open('1.txt', 'r'):
        line1 = line1.strip().split()
        id, chr1, start1, end1 = line1[0], line1[1], int(line1[2]), int(line1[3])
        if not infile1.has_key(chr1):
            infile1[chr1] = []
        infile1[chr1].append({"start": start1, "end": end1, "id": id})
    
    for line in infile2:
        line = line.strip().split()
    
        if line[2] != 'gene':
            continue
    
        chr, start, end = line[0], int(line[3]), int(line[4])
    
        if not infile1.has_key(chr):
            continue
    
        for i in infile1[chr]:
            if i['start'] < start < i['end']:
                print i['id'], line[-1]
            if i['start'] < end < i['end']:
                print i['id'], line[-1]
            if i['start'] > start and i['end'] > end:
                print i['id'], line[-1]
    
    2022-11-12 01:56 回答
  • 性能应该没有更好的优化余地,不过代码可以稍微调整一下

    with open('1.txt', 'r') as f1, open('2.txt', 'r') as f2:
        lines1 = [_.strip().split() for _ in f1]
        for line2 in f2:
            line2 = line2.strip().split()
            if line2[2] != 'gene':
                continue
    
            chr2, start2, end2 = line2[0], int(line2[3]), int(line2[4])
            for line1 in lines1:
                chr1, start1, end1 = line1[1], int(line1[2]), int(line1[3])
                if chr1 == chr2 and (start1 < start2 < end1 or start1 < end2 < end1 or start1 > start2 and end2 > end1):
                    print line1[0], line2[-1]
    2022-11-12 01:56 回答
撰写答案
今天,你开发时遇到什么问题呢?
立即提问
热门标签
PHP1.CN | 中国最专业的PHP中文社区 | PNG素材下载 | DevBox开发工具箱 | json解析格式化 |PHP资讯 | PHP教程 | 数据库技术 | 服务器技术 | 前端开发技术 | PHP框架 | 开发工具 | 在线工具
Copyright © 1998 - 2020 PHP1.CN. All Rights Reserved 京公网安备 11010802041100号 | 京ICP备19059560号-4 | PHP1.CN 第一PHP社区 版权所有