Friday, April 20, 2012

python script to convert plink bed/bim to qtdt format


import os

sdir = '/hapmap/'
basebedname = 'hapmap3'
ddir = '/erlotinib/'
temponame = 'tempo'
phenoname = 'erlotinib'

os.system('plink --bfile ' + sdir + basebedname + ' --recode12 --out ' + ddir + temponame )

## join tempo.ped with phenodata.txt 
## and generate pheno.ped file

shortfilename = ddir +  "phenodata.txt" ## FID,IID,dad,mom,sex,pheno1,...
longfilename = ddir +  temponame + ".ped" ## FID,IID,dad,mom,sex,pheno,snp1,...
outfilename = ddir + phenoname + ".ped"

index = {}
shortfile= open( shortfilename, "r" )
for row in shortfile:
    part = row.split(None,2)
    iid = part[1] ## IID is the second column in phenofile
    index[iid] = row.rstrip()
    ## rstrip gets rid of \n and other white spaces

shortfile.close()

outfile = open( outfilename, "w")
longfile= open( longfilename, "r" )

for row in longfile:
    part = row.split(None,6)
    iid = part[1]
    restofline = part[6]
    if iid in index:
        pheno = index[iid]
        outfile.write( pheno + ' ' + restofline)

longfile.close()
outfile.close()

## generate pheno.dat file

infilename = ddir + temponame + '.map'
outfilename = ddir + phenoname + '.dat'

infile = open(infilename,'r')
outfile = open(outfilename,'w')


## write covariates and phenotype names
outfile.write('C igrowth\n')
outfile.write('T logAUC\n')

## write one row per snp
for linea in infile:
 part = linea.split(None,2)
 newlinea = 'M ' + part[1] + '\n'
 outfile.write(newlinea)

infile.close()
outfile.close()



No comments:

Contributors

google