Tuesday, July 24, 2012

Friday, April 20, 2012

python script to convert plink bed/bim to qtdt format


import os

sdir = '/hapmap/'
basebedname = 'hapmap3'
ddir = '/erlotinib/'
temponame = 'tempo'
phenoname = 'erlotinib'

os.system('plink --bfile ' + sdir + basebedname + ' --recode12 --out ' + ddir + temponame )

## join tempo.ped with phenodata.txt 
## and generate pheno.ped file

shortfilename = ddir +  "phenodata.txt" ## FID,IID,dad,mom,sex,pheno1,...
longfilename = ddir +  temponame + ".ped" ## FID,IID,dad,mom,sex,pheno,snp1,...
outfilename = ddir + phenoname + ".ped"

index = {}
shortfile= open( shortfilename, "r" )
for row in shortfile:
    part = row.split(None,2)
    iid = part[1] ## IID is the second column in phenofile
    index[iid] = row.rstrip()
    ## rstrip gets rid of \n and other white spaces

shortfile.close()

outfile = open( outfilename, "w")
longfile= open( longfilename, "r" )

for row in longfile:
    part = row.split(None,6)
    iid = part[1]
    restofline = part[6]
    if iid in index:
        pheno = index[iid]
        outfile.write( pheno + ' ' + restofline)

longfile.close()
outfile.close()

## generate pheno.dat file

infilename = ddir + temponame + '.map'
outfilename = ddir + phenoname + '.dat'

infile = open(infilename,'r')
outfile = open(outfilename,'w')


## write covariates and phenotype names
outfile.write('C igrowth\n')
outfile.write('T logAUC\n')

## write one row per snp
for linea in infile:
 part = linea.split(None,2)
 newlinea = 'M ' + part[1] + '\n'
 outfile.write(newlinea)

infile.close()
outfile.close()



Friday, April 13, 2012

create new theme for ggplot

from: http://sape.inf.usi.ch/quick-reference/ggplot2/themes

theme_complete_bw <- function(base_size = 12) {
  structure(list(
    axis.line =         theme_blank(),
    axis.text.x =       theme_text(size = base_size * 0.8 , lineheight = 0.9, colour = "black", vjust = 1),
    axis.text.y =       theme_text(size = base_size * 0.8, lineheight = 0.9, colour = "black", hjust = 1),
    axis.ticks =        theme_segment(colour = "black"),
    axis.title.x =      theme_text(size = base_size, vjust = 0.5),
    axis.title.y =      theme_text(size = base_size, angle = 90, vjust = 0.5),
    axis.ticks.length = unit(0.15, "cm"),
    axis.ticks.margin = unit(0.1, "cm"),
 
    legend.background = theme_rect(colour=NA), 
    legend.key =        theme_rect(fill = NA, colour = "black", size = 0.25),
    legend.key.size =   unit(1.2, "lines"),
    legend.text =       theme_text(size = base_size * 0.8),
    legend.title =      theme_text(size = base_size * 0.8, face = "bold", hjust = 0),
    legend.position =   "right",
 
    panel.background =  theme_rect(fill = NA, colour = "black", size = 0.25), 
    panel.border =      theme_blank(),
    panel.grid.major =  theme_line(colour = "black", size = 0.05),
    panel.grid.minor =  theme_line(colour = "black", size = 0.05),
    panel.margin =      unit(0.25, "lines"),
 
    strip.background =  theme_rect(fill = NA, colour = NA), 
    strip.text.x =      theme_text(colour = "black", size = base_size * 0.8),
    strip.text.y =      theme_text(colour = "black", size = base_size * 0.8, angle = -90),
 
    plot.background =   theme_rect(colour = NA, fill = "white"),
    plot.title =        theme_text(size = base_size * 1.2),
    plot.margin =       unit(c(1, 1, 0.5, 0.5), "lines")
  ), class = "options")
}

Thursday, April 12, 2012

# cdf: cd's to frontmost window of Finder (by Yarko)

copy the following in ~/.bash_profile

# cdf: cd's to frontmost window of Finder
cdf ()
{
    currFolderPath=$( /usr/bin/osascript <<"    EOT"
        tell application "Finder"
            try
                set currFolder to (folder of the front window as alias)
            on error
                set currFolder to (path to desktop folder as alias)
            end try
            POSIX path of currFolder
        end tell
    EOT
    )
    # ignore the cd if it's my desktop
    if [ "$currFolderPath" != "$HOME/Desktop/" ]
    then
        echo "cd to \"$currFolderPath\""
        cd "$currFolderPath"
    else
        echo "no finder path to cd to..."
    fi
}

Thursday, February 2, 2012

python script extract variants from large file based on smaller file

#small script to extract variants from a large file based on a smaller file (by Vasa)

largeChromosomes = {}
largePositions = {}
largeFile = open('ref1k.rs.snps.txt','r')
for line in largeFile:
       line = line.split('\t')
       chrm = line [0]
       rs = line[1]
       pos = line[2]
       largeChromosomes[rs] = chrm
       largePositions[rs] = pos

smallFile = open('reck.txt','r')
fileOut = open('reck.1kpositions.txt','w')
for line in smallFile:
       line = line.split('\t')
       rs = line[1]
       largeChrom = largeChromosomes[rs]
       largePos = largePositions[rs]
       lineOut = '     '.join( [rs, largeChrom, largePos] )
       print >> fileOut, lineOut



Below is a more stylish script that does the same:
create a file named select.py, make it excutable and run
> python select.py -s [name|number] selector_file -c [name|number] large_file -o output_file


#!/usr/bin/env python

###
# March 29th, 2011
# Yarko Tymciurak
# yarkot@uchicago.edu
#
# Research Computing Group
# Biostatistics Lab
# University of Chicago
####


'''
Given two files, select (output) lines of the second (data) file which contain one of
the data items in the first (selector) file.
'''
from __future__ import print_function
from collections import namedtuple
from optparse import OptionParser
import sys

if sys.version_info[0] < 3 and sys.version_info[0] == 2 and sys.version_info[1] < 6:
    print( 'This program needs a minimum of Python version 2.6 to run.  You are using:\n%s' % sys.version, file=sys.stderr)
    exit(1)
LINEEND = '\r\n'   # for rstrip
# FIELDSEP = None    # for split

error_log = sys.stderr

def main():
    useage = '''useage:
    %prog [-s [name|number] selector_file [-c [name|number] data_file [-o output_file]"
    
    Given two input files, the values of a specified column of
     a selector file will be used to select output.
    Data in the specified "compare" column of the data_file will
     be checked against the selector data.
    If a compare data item from the data_file exists in the selector set,
     then that data_file row (line) will be output.
    
    You may also specify an output file.
     
    - If a field name is given, the first line of the file
      is assumed to contain field names.
    - If a field number is given, the n-th field is used (left-to-right,
      beginning with 1); all lines are assumed to be data
    - Fields are assumed to be TAB separated.
    
    
    example:
    % %prog -s IID relationships.txt -c 2 some.ped -o filtered.ped
    
    This will use the "IID" column from relationships.  Any row in "some.ped"
    which has column-2 data matching an IID data will be output to "filtered.ped"
    
    This is equivalent:
    % %prog -sIID relationships.txt -c2 some.ped -ofiltered.ped
    
    '''
    parser = OptionParser(useage)

    parser.add_option( "-s", "--select-column", dest="select_column",
                       help="column from which to build selection matching data")
    parser.add_option( "-c", "--compare-column", dest="compare_column",
                       help="column to compare against selection criteria")
    parser.add_option( "-o", "--output", dest="output",
                       help="output file for selected data")
    parser.add_option( "-p", "--separator", dest="FIELDSEP",
                       help="set field separator (each string will be one separator).\n\
                       default separator: any successive whitespace")

    (options,args) = parser.parse_args()
    output = options.output
    select_column = proper_type(options.select_column)
    compare_column = proper_type(options.compare_column)
    compare_asindex = type(compare_column) is int
    
    global FIELDSEP
    
    FIELDSEP = options.FIELDSEP # I depend on this being 'None' when not set


    if (select_column is None) or (compare_column is None):
        error_log.write("Need to specify select column (-s) and compare column (-c).\nTry -h option for help.\n")
        exit(1)
    if len(args)<2:
        parser.print_help()
        exit(1)

    fname = args[0]
    fp = sys.stdin if fname == '-' else must_open(fname)
    fo = sys.stdout if output is None else must_open(output, 'w')
    selectors = get_selectors(fp, select_column)
    if type(select_column) is int:
        select_column -= 1  # adjust for zero-based array indexing

    for fname in args[1:]:
        global n
        n = 0
        fp = sys.stdin if fname == '-' else must_open(fname)
        line = fp.readline()
        # assume first line is a header line, if named data field:
        if not compare_asindex:
            dat = namedtuple("dat", line.rstrip(LINEEND).split(FIELDSEP))
            line = fp.readline()
        else:
            maxsplit = compare_column
            compare_column -= 1   # i.e. 2nd column => row[1]
        while not (line == ''):
            n+=1
            # split the line fields
            if compare_asindex:
                line_element = line.rstrip(LINEEND).split(FIELDSEP, maxsplit)
            else:
                drow = dat(*line.rstrip(LINEEND).split(FIELDSEP))
                line_element = drow._asdict()
            
            # output if the specified data field is in the selector
            try:
                if line_element[compare_column] in selectors:
                    fo.write(line)
            except Exception as e:
                print( e, "\nMaybe you need to specify an alternate field separator to process your data?",
                       file=sys.stdout);
                return
            line = fp.readline()


def get_selectors(fp, column):
    '''
    using the global option, select a column from the file, and return a dict (hash performance) with all the values
    '''
    selectors = {}
    global FIELDSEP
    # Get column option:
    # - if numeric, split the fields and use number as an index,
    # - else use named.tuples, and use the first line as the header
    asindex = type(column) is int
    if not asindex:
        # get the first line, which we expect to be the column header
        line = fp.readline()
        hdr = namedtuple("hdr", line.rstrip(LINEEND).split(FIELDSEP))
    else: # can only limit splits on numerics
        maxsplit = column
        column -= 1   # i.e. 2nd column => row[1]
    # now go thru the file:
    line = fp.readline()
    while not (line==''):
        # prepare so we index correctly either way:
        if asindex:
            row = line.rstrip(LINEEND).split(FIELDSEP, maxsplit)
        else:
            drow = hdr(*line.rstrip(LINEEND).split(FIELDSEP))
            row = drow._asdict()
        selectors[row[column]]=''   # we only care about key, not value
        line = fp.readline()
    return selectors

def must_open(f, c='r'):
    try:
        fp = open(f,c)
    except IOError as e:
        error_log.write('Unable to open %s : %s\n' % (f,e))
        exit(2)
    return fp

def proper_type(i):
    return int(i) if (i and i.isdigit()) else i

if __name__=='__main__':
    main()
##----------

Friday, January 6, 2012

latexdiff - mark up differences between latex files

Determine and mark up significant differences between latex files.

http://www.ctan.org/tex-archive/support/latexdiff

This perl script will generate a latex file with changes marked up (similar to Word) from two latex files

latexdiff-so fileold.tex filenew.tex > dif.tex

ps2pdf to all ps files in current directory

for x in $(ls *.ps); do echo $x; ps2pdf $x $(echo $x|sed -e 's/ps$/pdf/');done

or more efficiently according to Yarko's comment

for x in *.ps; do
echo $x;
ps2pdf $x ${x/%ps/pdf};
done


Contributors

google