#small script to extract variants from a large file based on a smaller file (by Vasa) largeChromosomes = {} largePositions = {} largeFile = open('ref1k.rs.snps.txt','r') for line in largeFile: line = line.split('\t') chrm = line [0] rs = line[1] pos = line[2] largeChromosomes[rs] = chrm largePositions[rs] = pos smallFile = open('reck.txt','r') fileOut = open('reck.1kpositions.txt','w') for line in smallFile: line = line.split('\t') rs = line[1] largeChrom = largeChromosomes[rs] largePos = largePositions[rs] lineOut = ' '.join( [rs, largeChrom, largePos] ) print >> fileOut, lineOut Below is a more stylish script that does the same: create a file named select.py, make it excutable and run > python select.py -s [name|number] selector_file -c [name|number] large_file -o output_file #!/usr/bin/env python ### # March 29th, 2011 # Yarko Tymciurak # yarkot@uchicago.edu # # Research Computing Group # Biostatistics Lab # University of Chicago #### ''' Given two files, select (output) lines of the second (data) file which contain one of the data items in the first (selector) file. ''' from __future__ import print_function from collections import namedtuple from optparse import OptionParser import sys if sys.version_info[0] < 3 and sys.version_info[0] == 2 and sys.version_info[1] < 6: print( 'This program needs a minimum of Python version 2.6 to run. You are using:\n%s' % sys.version, file=sys.stderr) exit(1) LINEEND = '\r\n' # for rstrip # FIELDSEP = None # for split error_log = sys.stderr def main(): useage = '''useage: %prog [-s [name|number] selector_file [-c [name|number] data_file [-o output_file]" Given two input files, the values of a specified column of a selector file will be used to select output. Data in the specified "compare" column of the data_file will be checked against the selector data. If a compare data item from the data_file exists in the selector set, then that data_file row (line) will be output. You may also specify an output file. - If a field name is given, the first line of the file is assumed to contain field names. - If a field number is given, the n-th field is used (left-to-right, beginning with 1); all lines are assumed to be data - Fields are assumed to be TAB separated. example: % %prog -s IID relationships.txt -c 2 some.ped -o filtered.ped This will use the "IID" column from relationships. Any row in "some.ped" which has column-2 data matching an IID data will be output to "filtered.ped" This is equivalent: % %prog -sIID relationships.txt -c2 some.ped -ofiltered.ped ''' parser = OptionParser(useage) parser.add_option( "-s", "--select-column", dest="select_column", help="column from which to build selection matching data") parser.add_option( "-c", "--compare-column", dest="compare_column", help="column to compare against selection criteria") parser.add_option( "-o", "--output", dest="output", help="output file for selected data") parser.add_option( "-p", "--separator", dest="FIELDSEP", help="set field separator (each string will be one separator).\n\ default separator: any successive whitespace") (options,args) = parser.parse_args() output = options.output select_column = proper_type(options.select_column) compare_column = proper_type(options.compare_column) compare_asindex = type(compare_column) is int global FIELDSEP FIELDSEP = options.FIELDSEP # I depend on this being 'None' when not set if (select_column is None) or (compare_column is None): error_log.write("Need to specify select column (-s) and compare column (-c).\nTry -h option for help.\n") exit(1) if len(args)<2: parser.print_help() exit(1) fname = args[0] fp = sys.stdin if fname == '-' else must_open(fname) fo = sys.stdout if output is None else must_open(output, 'w') selectors = get_selectors(fp, select_column) if type(select_column) is int: select_column -= 1 # adjust for zero-based array indexing for fname in args[1:]: global n n = 0 fp = sys.stdin if fname == '-' else must_open(fname) line = fp.readline() # assume first line is a header line, if named data field: if not compare_asindex: dat = namedtuple("dat", line.rstrip(LINEEND).split(FIELDSEP)) line = fp.readline() else: maxsplit = compare_column compare_column -= 1 # i.e. 2nd column => row[1] while not (line == ''): n+=1 # split the line fields if compare_asindex: line_element = line.rstrip(LINEEND).split(FIELDSEP, maxsplit) else: drow = dat(*line.rstrip(LINEEND).split(FIELDSEP)) line_element = drow._asdict() # output if the specified data field is in the selector try: if line_element[compare_column] in selectors: fo.write(line) except Exception as e: print( e, "\nMaybe you need to specify an alternate field separator to process your data?", file=sys.stdout); return line = fp.readline() def get_selectors(fp, column): ''' using the global option, select a column from the file, and return a dict (hash performance) with all the values ''' selectors = {} global FIELDSEP # Get column option: # - if numeric, split the fields and use number as an index, # - else use named.tuples, and use the first line as the header asindex = type(column) is int if not asindex: # get the first line, which we expect to be the column header line = fp.readline() hdr = namedtuple("hdr", line.rstrip(LINEEND).split(FIELDSEP)) else: # can only limit splits on numerics maxsplit = column column -= 1 # i.e. 2nd column => row[1] # now go thru the file: line = fp.readline() while not (line==''): # prepare so we index correctly either way: if asindex: row = line.rstrip(LINEEND).split(FIELDSEP, maxsplit) else: drow = hdr(*line.rstrip(LINEEND).split(FIELDSEP)) row = drow._asdict() selectors[row[column]]='' # we only care about key, not value line = fp.readline() return selectors def must_open(f, c='r'): try: fp = open(f,c) except IOError as e: error_log.write('Unable to open %s : %s\n' % (f,e)) exit(2) return fp def proper_type(i): return int(i) if (i and i.isdigit()) else i if __name__=='__main__': main() ##----------
Thursday, February 2, 2012
python script extract variants from large file based on smaller file
Subscribe to:
Post Comments (Atom)
3 comments:
Note:
The "stylish" script I wrote for Haky last year has an important attribute: it uses "file.write()" instead of "print" to output the results. When Haky first gave me sample data, I was amazed at the line length of the input data - on the order of 5MB!
Since the print function scans every character of a line, to prepare platform specific output (such as newline characters, etc.), and character encoding conversions as may be necessary, it turned out this was about 2 orders of magnitude slower than the simple file.write() version I implemented for Haky.
Whatever you do yourself with this kind of data, you will probably be happier using simple file.write's for output.
In the select.py script, near ~line 98, the second "fp = ..." is redundant and should be removed.
"small script" appears to have been copied into blogger - whenever using a special character for splitting data, be sure to write it as that special character in your code.
It looks like "small script" is splitting on 4 spaces, when in fact the files which I have seen separate fields by the TAB-character. The proper way to represent the TAB character in a string is as '\t', so that:
line = line.split('\t')
is what you would want in that code, if you were to try to use it.
Post a Comment