Thursday, February 2, 2012

python script extract variants from large file based on smaller file

#small script to extract variants from a large file based on a smaller file (by Vasa)

largeChromosomes = {}
largePositions = {}
largeFile = open('','r')
for line in largeFile:
       line = line.split('\t')
       chrm = line [0]
       rs = line[1]
       pos = line[2]
       largeChromosomes[rs] = chrm
       largePositions[rs] = pos

smallFile = open('reck.txt','r')
fileOut = open('reck.1kpositions.txt','w')
for line in smallFile:
       line = line.split('\t')
       rs = line[1]
       largeChrom = largeChromosomes[rs]
       largePos = largePositions[rs]
       lineOut = '     '.join( [rs, largeChrom, largePos] )
       print >> fileOut, lineOut

Below is a more stylish script that does the same:
create a file named, make it excutable and run
> python -s [name|number] selector_file -c [name|number] large_file -o output_file

#!/usr/bin/env python

# March 29th, 2011
# Yarko Tymciurak
# Research Computing Group
# Biostatistics Lab
# University of Chicago

Given two files, select (output) lines of the second (data) file which contain one of
the data items in the first (selector) file.
from __future__ import print_function
from collections import namedtuple
from optparse import OptionParser
import sys

if sys.version_info[0] < 3 and sys.version_info[0] == 2 and sys.version_info[1] < 6:
    print( 'This program needs a minimum of Python version 2.6 to run.  You are using:\n%s' % sys.version, file=sys.stderr)
LINEEND = '\r\n'   # for rstrip
# FIELDSEP = None    # for split

error_log = sys.stderr

def main():
    useage = '''useage:
    %prog [-s [name|number] selector_file [-c [name|number] data_file [-o output_file]"
    Given two input files, the values of a specified column of
     a selector file will be used to select output.
    Data in the specified "compare" column of the data_file will
     be checked against the selector data.
    If a compare data item from the data_file exists in the selector set,
     then that data_file row (line) will be output.
    You may also specify an output file.
    - If a field name is given, the first line of the file
      is assumed to contain field names.
    - If a field number is given, the n-th field is used (left-to-right,
      beginning with 1); all lines are assumed to be data
    - Fields are assumed to be TAB separated.
    % %prog -s IID relationships.txt -c 2 some.ped -o filtered.ped
    This will use the "IID" column from relationships.  Any row in "some.ped"
    which has column-2 data matching an IID data will be output to "filtered.ped"
    This is equivalent:
    % %prog -sIID relationships.txt -c2 some.ped -ofiltered.ped
    parser = OptionParser(useage)

    parser.add_option( "-s", "--select-column", dest="select_column",
                       help="column from which to build selection matching data")
    parser.add_option( "-c", "--compare-column", dest="compare_column",
                       help="column to compare against selection criteria")
    parser.add_option( "-o", "--output", dest="output",
                       help="output file for selected data")
    parser.add_option( "-p", "--separator", dest="FIELDSEP",
                       help="set field separator (each string will be one separator).\n\
                       default separator: any successive whitespace")

    (options,args) = parser.parse_args()
    output = options.output
    select_column = proper_type(options.select_column)
    compare_column = proper_type(options.compare_column)
    compare_asindex = type(compare_column) is int
    global FIELDSEP
    FIELDSEP = options.FIELDSEP # I depend on this being 'None' when not set

    if (select_column is None) or (compare_column is None):
        error_log.write("Need to specify select column (-s) and compare column (-c).\nTry -h option for help.\n")
    if len(args)<2:

    fname = args[0]
    fp = sys.stdin if fname == '-' else must_open(fname)
    fo = sys.stdout if output is None else must_open(output, 'w')
    selectors = get_selectors(fp, select_column)
    if type(select_column) is int:
        select_column -= 1  # adjust for zero-based array indexing

    for fname in args[1:]:
        global n
        n = 0
        fp = sys.stdin if fname == '-' else must_open(fname)
        line = fp.readline()
        # assume first line is a header line, if named data field:
        if not compare_asindex:
            dat = namedtuple("dat", line.rstrip(LINEEND).split(FIELDSEP))
            line = fp.readline()
            maxsplit = compare_column
            compare_column -= 1   # i.e. 2nd column => row[1]
        while not (line == ''):
            # split the line fields
            if compare_asindex:
                line_element = line.rstrip(LINEEND).split(FIELDSEP, maxsplit)
                drow = dat(*line.rstrip(LINEEND).split(FIELDSEP))
                line_element = drow._asdict()
            # output if the specified data field is in the selector
                if line_element[compare_column] in selectors:
            except Exception as e:
                print( e, "\nMaybe you need to specify an alternate field separator to process your data?",
            line = fp.readline()

def get_selectors(fp, column):
    using the global option, select a column from the file, and return a dict (hash performance) with all the values
    selectors = {}
    global FIELDSEP
    # Get column option:
    # - if numeric, split the fields and use number as an index,
    # - else use named.tuples, and use the first line as the header
    asindex = type(column) is int
    if not asindex:
        # get the first line, which we expect to be the column header
        line = fp.readline()
        hdr = namedtuple("hdr", line.rstrip(LINEEND).split(FIELDSEP))
    else: # can only limit splits on numerics
        maxsplit = column
        column -= 1   # i.e. 2nd column => row[1]
    # now go thru the file:
    line = fp.readline()
    while not (line==''):
        # prepare so we index correctly either way:
        if asindex:
            row = line.rstrip(LINEEND).split(FIELDSEP, maxsplit)
            drow = hdr(*line.rstrip(LINEEND).split(FIELDSEP))
            row = drow._asdict()
        selectors[row[column]]=''   # we only care about key, not value
        line = fp.readline()
    return selectors

def must_open(f, c='r'):
        fp = open(f,c)
    except IOError as e:
        error_log.write('Unable to open %s : %s\n' % (f,e))
    return fp

def proper_type(i):
    return int(i) if (i and i.isdigit()) else i

if __name__=='__main__':


Yarko said...


The "stylish" script I wrote for Haky last year has an important attribute: it uses "file.write()" instead of "print" to output the results. When Haky first gave me sample data, I was amazed at the line length of the input data - on the order of 5MB!

Since the print function scans every character of a line, to prepare platform specific output (such as newline characters, etc.), and character encoding conversions as may be necessary, it turned out this was about 2 orders of magnitude slower than the simple file.write() version I implemented for Haky.

Whatever you do yourself with this kind of data, you will probably be happier using simple file.write's for output.

Yarko said...

In the script, near ~line 98, the second "fp = ..." is redundant and should be removed.

Yarko said...

"small script" appears to have been copied into blogger - whenever using a special character for splitting data, be sure to write it as that special character in your code.

It looks like "small script" is splitting on 4 spaces, when in fact the files which I have seen separate fields by the TAB-character. The proper way to represent the TAB character in a string is as '\t', so that:

line = line.split('\t')

is what you would want in that code, if you were to try to use it.