> prompt > mget * .
Tuesday, July 24, 2012
Friday, April 20, 2012
python script to convert plink bed/bim to qtdt format
import os
sdir = '/hapmap/'
basebedname = 'hapmap3'
ddir = '/erlotinib/'
temponame = 'tempo'
phenoname = 'erlotinib'
os.system('plink --bfile ' + sdir + basebedname + ' --recode12 --out ' + ddir + temponame )
## join tempo.ped with phenodata.txt
## and generate pheno.ped file
shortfilename = ddir + "phenodata.txt" ## FID,IID,dad,mom,sex,pheno1,...
longfilename = ddir + temponame + ".ped" ## FID,IID,dad,mom,sex,pheno,snp1,...
outfilename = ddir + phenoname + ".ped"
index = {}
shortfile= open( shortfilename, "r" )
for row in shortfile:
part = row.split(None,2)
iid = part[1] ## IID is the second column in phenofile
index[iid] = row.rstrip()
## rstrip gets rid of \n and other white spaces
shortfile.close()
outfile = open( outfilename, "w")
longfile= open( longfilename, "r" )
for row in longfile:
part = row.split(None,6)
iid = part[1]
restofline = part[6]
if iid in index:
pheno = index[iid]
outfile.write( pheno + ' ' + restofline)
longfile.close()
outfile.close()
## generate pheno.dat file
infilename = ddir + temponame + '.map'
outfilename = ddir + phenoname + '.dat'
infile = open(infilename,'r')
outfile = open(outfilename,'w')
## write covariates and phenotype names
outfile.write('C igrowth\n')
outfile.write('T logAUC\n')
## write one row per snp
for linea in infile:
part = linea.split(None,2)
newlinea = 'M ' + part[1] + '\n'
outfile.write(newlinea)
infile.close()
outfile.close()
Friday, April 13, 2012
create new theme for ggplot
from: http://sape.inf.usi.ch/quick-reference/ggplot2/themes
theme_complete_bw <- function(base_size = 12) {
structure(list(
axis.line = theme_blank(),
axis.text.x = theme_text(size = base_size * 0.8 , lineheight = 0.9, colour = "black", vjust = 1),
axis.text.y = theme_text(size = base_size * 0.8, lineheight = 0.9, colour = "black", hjust = 1),
axis.ticks = theme_segment(colour = "black"),
axis.title.x = theme_text(size = base_size, vjust = 0.5),
axis.title.y = theme_text(size = base_size, angle = 90, vjust = 0.5),
axis.ticks.length = unit(0.15, "cm"),
axis.ticks.margin = unit(0.1, "cm"),
legend.background = theme_rect(colour=NA),
legend.key = theme_rect(fill = NA, colour = "black", size = 0.25),
legend.key.size = unit(1.2, "lines"),
legend.text = theme_text(size = base_size * 0.8),
legend.title = theme_text(size = base_size * 0.8, face = "bold", hjust = 0),
legend.position = "right",
panel.background = theme_rect(fill = NA, colour = "black", size = 0.25),
panel.border = theme_blank(),
panel.grid.major = theme_line(colour = "black", size = 0.05),
panel.grid.minor = theme_line(colour = "black", size = 0.05),
panel.margin = unit(0.25, "lines"),
strip.background = theme_rect(fill = NA, colour = NA),
strip.text.x = theme_text(colour = "black", size = base_size * 0.8),
strip.text.y = theme_text(colour = "black", size = base_size * 0.8, angle = -90),
plot.background = theme_rect(colour = NA, fill = "white"),
plot.title = theme_text(size = base_size * 1.2),
plot.margin = unit(c(1, 1, 0.5, 0.5), "lines")
), class = "options")
}
Thursday, April 12, 2012
# cdf: cd's to frontmost window of Finder (by Yarko)
copy the following in ~/.bash_profile
# cdf: cd's to frontmost window of Finder
cdf ()
{
currFolderPath=$( /usr/bin/osascript <<" EOT"
tell application "Finder"
try
set currFolder to (folder of the front window as alias)
on error
set currFolder to (path to desktop folder as alias)
end try
POSIX path of currFolder
end tell
EOT
)
# ignore the cd if it's my desktop
if [ "$currFolderPath" != "$HOME/Desktop/" ]
then
echo "cd to \"$currFolderPath\""
cd "$currFolderPath"
else
echo "no finder path to cd to..."
fi
}
Friday, February 3, 2012
Thursday, February 2, 2012
python script extract variants from large file based on smaller file
#small script to extract variants from a large file based on a smaller file (by Vasa)
largeChromosomes = {}
largePositions = {}
largeFile = open('ref1k.rs.snps.txt','r')
for line in largeFile:
line = line.split('\t')
chrm = line [0]
rs = line[1]
pos = line[2]
largeChromosomes[rs] = chrm
largePositions[rs] = pos
smallFile = open('reck.txt','r')
fileOut = open('reck.1kpositions.txt','w')
for line in smallFile:
line = line.split('\t')
rs = line[1]
largeChrom = largeChromosomes[rs]
largePos = largePositions[rs]
lineOut = ' '.join( [rs, largeChrom, largePos] )
print >> fileOut, lineOut
Below is a more stylish script that does the same:
create a file named select.py, make it excutable and run
> python select.py -s [name|number] selector_file -c [name|number] large_file -o output_file
#!/usr/bin/env python
###
# March 29th, 2011
# Yarko Tymciurak
# yarkot@uchicago.edu
#
# Research Computing Group
# Biostatistics Lab
# University of Chicago
####
'''
Given two files, select (output) lines of the second (data) file which contain one of
the data items in the first (selector) file.
'''
from __future__ import print_function
from collections import namedtuple
from optparse import OptionParser
import sys
if sys.version_info[0] < 3 and sys.version_info[0] == 2 and sys.version_info[1] < 6:
print( 'This program needs a minimum of Python version 2.6 to run. You are using:\n%s' % sys.version, file=sys.stderr)
exit(1)
LINEEND = '\r\n' # for rstrip
# FIELDSEP = None # for split
error_log = sys.stderr
def main():
useage = '''useage:
%prog [-s [name|number] selector_file [-c [name|number] data_file [-o output_file]"
Given two input files, the values of a specified column of
a selector file will be used to select output.
Data in the specified "compare" column of the data_file will
be checked against the selector data.
If a compare data item from the data_file exists in the selector set,
then that data_file row (line) will be output.
You may also specify an output file.
- If a field name is given, the first line of the file
is assumed to contain field names.
- If a field number is given, the n-th field is used (left-to-right,
beginning with 1); all lines are assumed to be data
- Fields are assumed to be TAB separated.
example:
% %prog -s IID relationships.txt -c 2 some.ped -o filtered.ped
This will use the "IID" column from relationships. Any row in "some.ped"
which has column-2 data matching an IID data will be output to "filtered.ped"
This is equivalent:
% %prog -sIID relationships.txt -c2 some.ped -ofiltered.ped
'''
parser = OptionParser(useage)
parser.add_option( "-s", "--select-column", dest="select_column",
help="column from which to build selection matching data")
parser.add_option( "-c", "--compare-column", dest="compare_column",
help="column to compare against selection criteria")
parser.add_option( "-o", "--output", dest="output",
help="output file for selected data")
parser.add_option( "-p", "--separator", dest="FIELDSEP",
help="set field separator (each string will be one separator).\n\
default separator: any successive whitespace")
(options,args) = parser.parse_args()
output = options.output
select_column = proper_type(options.select_column)
compare_column = proper_type(options.compare_column)
compare_asindex = type(compare_column) is int
global FIELDSEP
FIELDSEP = options.FIELDSEP # I depend on this being 'None' when not set
if (select_column is None) or (compare_column is None):
error_log.write("Need to specify select column (-s) and compare column (-c).\nTry -h option for help.\n")
exit(1)
if len(args)<2:
parser.print_help()
exit(1)
fname = args[0]
fp = sys.stdin if fname == '-' else must_open(fname)
fo = sys.stdout if output is None else must_open(output, 'w')
selectors = get_selectors(fp, select_column)
if type(select_column) is int:
select_column -= 1 # adjust for zero-based array indexing
for fname in args[1:]:
global n
n = 0
fp = sys.stdin if fname == '-' else must_open(fname)
line = fp.readline()
# assume first line is a header line, if named data field:
if not compare_asindex:
dat = namedtuple("dat", line.rstrip(LINEEND).split(FIELDSEP))
line = fp.readline()
else:
maxsplit = compare_column
compare_column -= 1 # i.e. 2nd column => row[1]
while not (line == ''):
n+=1
# split the line fields
if compare_asindex:
line_element = line.rstrip(LINEEND).split(FIELDSEP, maxsplit)
else:
drow = dat(*line.rstrip(LINEEND).split(FIELDSEP))
line_element = drow._asdict()
# output if the specified data field is in the selector
try:
if line_element[compare_column] in selectors:
fo.write(line)
except Exception as e:
print( e, "\nMaybe you need to specify an alternate field separator to process your data?",
file=sys.stdout);
return
line = fp.readline()
def get_selectors(fp, column):
'''
using the global option, select a column from the file, and return a dict (hash performance) with all the values
'''
selectors = {}
global FIELDSEP
# Get column option:
# - if numeric, split the fields and use number as an index,
# - else use named.tuples, and use the first line as the header
asindex = type(column) is int
if not asindex:
# get the first line, which we expect to be the column header
line = fp.readline()
hdr = namedtuple("hdr", line.rstrip(LINEEND).split(FIELDSEP))
else: # can only limit splits on numerics
maxsplit = column
column -= 1 # i.e. 2nd column => row[1]
# now go thru the file:
line = fp.readline()
while not (line==''):
# prepare so we index correctly either way:
if asindex:
row = line.rstrip(LINEEND).split(FIELDSEP, maxsplit)
else:
drow = hdr(*line.rstrip(LINEEND).split(FIELDSEP))
row = drow._asdict()
selectors[row[column]]='' # we only care about key, not value
line = fp.readline()
return selectors
def must_open(f, c='r'):
try:
fp = open(f,c)
except IOError as e:
error_log.write('Unable to open %s : %s\n' % (f,e))
exit(2)
return fp
def proper_type(i):
return int(i) if (i and i.isdigit()) else i
if __name__=='__main__':
main()
##----------
Wednesday, January 11, 2012
Friday, January 6, 2012
latexdiff - mark up differences between latex files
Determine and mark up significant differences between latex files. http://www.ctan.org/tex-archive/support/latexdiff This perl script will generate a latex file with changes marked up (similar to Word) from two latex files latexdiff-so fileold.tex filenew.tex > dif.tex
ps2pdf to all ps files in current directory
for x in $(ls *.ps); do echo $x; ps2pdf $x $(echo $x|sed -e 's/ps$/pdf/');done
or more efficiently according to Yarko's comment
for x in *.ps; do
echo $x;
ps2pdf $x ${x/%ps/pdf};
done
Subscribe to:
Comments (Atom)