Get the first X molecules from a large .Mol2 file


Written by D. Lagorce

Put the following lines in a file (for instance count.py) and make it executable (chmod +x count.py):
--------------------------------
#! /usr/bin/env python


from string import *
import sys
import os.path


def mol_counter(fhi,n_mol):
    """Return the text of the first ``n_mol`` molecules read from ``fhi``.

    A molecule starts at a line consisting of ``@<TRIPOS>MOLECULE``.
    Trailing whitespace on that line (including a ``\\r`` from CRLF files,
    or a missing final newline) is ignored when recognising the header.

    Parameters:
        fhi: an open text file object; it is read line by line with
            ``readline()`` starting from its current position.
        n_mol: number of molecules to keep.

    Returns:
        A single string holding every line read before the header of
        molecule ``n_mol + 1``. Lines that precede the first header are
        kept as well. If the file holds fewer than ``n_mol`` molecules,
        everything read up to the end of the file is returned.
    """
    header = "@<TRIPOS>MOLECULE"
    kept = []
    seen = 0

    # iter(readline, "") stops at end of file, where readline() returns "".
    for line in iter(fhi.readline, ""):
        # rstrip() rather than line[:-1]: the slice only removes one
        # character, so "\r\n" endings or a last line without a newline
        # would never match the header.
        if line.rstrip() == header:
            seen += 1

        # The header of molecule n_mol + 1 has been consumed from the file
        # but must not be part of the result.
        if seen == n_mol + 1:
            break

        kept.append(line)

    # Join once at the end instead of growing a string line by line.
    return "".join(kept)

if __name__=='__main__':

    # Usage: <script> <input.mol2> <number_of_molecules>
    # Both arguments are required; fail with a readable message instead of
    # an IndexError when one is missing.
    if len(sys.argv) < 3:
        sys.exit("usage: " + sys.argv[0] + " <input.mol2> <number_of_molecules>")

    file_in = sys.argv[1]
    n = int(sys.argv[2])

    # The output is written to the current working directory and is named
    # after the input file's base name: best_<n>_<basename>.
    file_path = str(os.path.basename(file_in))
    file_out = './best_'+str(n)+'_'+ file_path

    # "with" closes each handle even if reading or writing raises.
    with open(file_in,'r') as fh_in:
        res = mol_counter(fh_in,n)

    # The output file is only created once the input was read successfully.
    with open(file_out,'w') as fh_out:
        fh_out.write(res)


-------------------------------------

Run it like this in a terminal window:
./count.py mybank.mol2 3000

This means that you want to get the first 3000 molecules from your large compound collection.
The output file, written in your working directory, will be: best_3000_mybank.mol2
Note that each molecule in the input file must start with a line containing only @<TRIPOS>MOLECULE.