#small script to extract variants from a large file based on a smaller file (by Vasa)
largeChromosomes = {}
largePositions = {}
largeFile = open('ref1k.rs.snps.txt','r')
for line in largeFile:
line = line.split('\t')
chrm = line [0]
rs = line[1]
pos = line[2]
largeChromosomes[rs] = chrm
largePositions[rs] = pos
smallFile = open('reck.txt','r')
fileOut = open('reck.1kpositions.txt','w')
for line in smallFile:
line = line.split('\t')
rs = line[1]
largeChrom = largeChromosomes[rs]
largePos = largePositions[rs]
lineOut = ' '.join( [rs, largeChrom, largePos] )
print >> fileOut, lineOut
Below is a more stylish script that does the same:
create a file named select.py, make it excutable and run
> python select.py -s [name|number] selector_file -c [name|number] large_file -o output_file
#!/usr/bin/env python
###
# March 29th, 2011
# Yarko Tymciurak
# yarkot@uchicago.edu
#
# Research Computing Group
# Biostatistics Lab
# University of Chicago
####
'''
Given two files, select (output) lines of the second (data) file which contain one of
the data items in the first (selector) file.
'''
from __future__ import print_function
from collections import namedtuple
from optparse import OptionParser
import sys
if sys.version_info[0] < 3 and sys.version_info[0] == 2 and sys.version_info[1] < 6:
print( 'This program needs a minimum of Python version 2.6 to run. You are using:\n%s' % sys.version, file=sys.stderr)
exit(1)
LINEEND = '\r\n' # for rstrip
# FIELDSEP = None # for split
error_log = sys.stderr
def main():
useage = '''useage:
%prog [-s [name|number] selector_file [-c [name|number] data_file [-o output_file]"
Given two input files, the values of a specified column of
a selector file will be used to select output.
Data in the specified "compare" column of the data_file will
be checked against the selector data.
If a compare data item from the data_file exists in the selector set,
then that data_file row (line) will be output.
You may also specify an output file.
- If a field name is given, the first line of the file
is assumed to contain field names.
- If a field number is given, the n-th field is used (left-to-right,
beginning with 1); all lines are assumed to be data
- Fields are assumed to be TAB separated.
example:
% %prog -s IID relationships.txt -c 2 some.ped -o filtered.ped
This will use the "IID" column from relationships. Any row in "some.ped"
which has column-2 data matching an IID data will be output to "filtered.ped"
This is equivalent:
% %prog -sIID relationships.txt -c2 some.ped -ofiltered.ped
'''
parser = OptionParser(useage)
parser.add_option( "-s", "--select-column", dest="select_column",
help="column from which to build selection matching data")
parser.add_option( "-c", "--compare-column", dest="compare_column",
help="column to compare against selection criteria")
parser.add_option( "-o", "--output", dest="output",
help="output file for selected data")
parser.add_option( "-p", "--separator", dest="FIELDSEP",
help="set field separator (each string will be one separator).\n\
default separator: any successive whitespace")
(options,args) = parser.parse_args()
output = options.output
select_column = proper_type(options.select_column)
compare_column = proper_type(options.compare_column)
compare_asindex = type(compare_column) is int
global FIELDSEP
FIELDSEP = options.FIELDSEP # I depend on this being 'None' when not set
if (select_column is None) or (compare_column is None):
error_log.write("Need to specify select column (-s) and compare column (-c).\nTry -h option for help.\n")
exit(1)
if len(args)<2:
parser.print_help()
exit(1)
fname = args[0]
fp = sys.stdin if fname == '-' else must_open(fname)
fo = sys.stdout if output is None else must_open(output, 'w')
selectors = get_selectors(fp, select_column)
if type(select_column) is int:
select_column -= 1 # adjust for zero-based array indexing
for fname in args[1:]:
global n
n = 0
fp = sys.stdin if fname == '-' else must_open(fname)
line = fp.readline()
# assume first line is a header line, if named data field:
if not compare_asindex:
dat = namedtuple("dat", line.rstrip(LINEEND).split(FIELDSEP))
line = fp.readline()
else:
maxsplit = compare_column
compare_column -= 1 # i.e. 2nd column => row[1]
while not (line == ''):
n+=1
# split the line fields
if compare_asindex:
line_element = line.rstrip(LINEEND).split(FIELDSEP, maxsplit)
else:
drow = dat(*line.rstrip(LINEEND).split(FIELDSEP))
line_element = drow._asdict()
# output if the specified data field is in the selector
try:
if line_element[compare_column] in selectors:
fo.write(line)
except Exception as e:
print( e, "\nMaybe you need to specify an alternate field separator to process your data?",
file=sys.stdout);
return
line = fp.readline()
def get_selectors(fp, column):
'''
using the global option, select a column from the file, and return a dict (hash performance) with all the values
'''
selectors = {}
global FIELDSEP
# Get column option:
# - if numeric, split the fields and use number as an index,
# - else use named.tuples, and use the first line as the header
asindex = type(column) is int
if not asindex:
# get the first line, which we expect to be the column header
line = fp.readline()
hdr = namedtuple("hdr", line.rstrip(LINEEND).split(FIELDSEP))
else: # can only limit splits on numerics
maxsplit = column
column -= 1 # i.e. 2nd column => row[1]
# now go thru the file:
line = fp.readline()
while not (line==''):
# prepare so we index correctly either way:
if asindex:
row = line.rstrip(LINEEND).split(FIELDSEP, maxsplit)
else:
drow = hdr(*line.rstrip(LINEEND).split(FIELDSEP))
row = drow._asdict()
selectors[row[column]]='' # we only care about key, not value
line = fp.readline()
return selectors
def must_open(f, c='r'):
try:
fp = open(f,c)
except IOError as e:
error_log.write('Unable to open %s : %s\n' % (f,e))
exit(2)
return fp
def proper_type(i):
return int(i) if (i and i.isdigit()) else i
if __name__=='__main__':
main()
##----------
Thursday, February 2, 2012
python script extract variants from large file based on smaller file
Subscribe to:
Post Comments (Atom)
3 comments:
Note:
The "stylish" script I wrote for Haky last year has an important attribute: it uses "file.write()" instead of "print" to output the results. When Haky first gave me sample data, I was amazed at the line length of the input data - on the order of 5MB!
Since the print function scans every character of a line, to prepare platform specific output (such as newline characters, etc.), and character encoding conversions as may be necessary, it turned out this was about 2 orders of magnitude slower than the simple file.write() version I implemented for Haky.
Whatever you do yourself with this kind of data, you will probably be happier using simple file.write's for output.
In the select.py script, near ~line 98, the second "fp = ..." is redundant and should be removed.
"small script" appears to have been copied into blogger - whenever using a special character for splitting data, be sure to write it as that special character in your code.
It looks like "small script" is splitting on 4 spaces, when in fact the files which I have seen separate fields by the TAB-character. The proper way to represent the TAB character in a string is as '\t', so that:
line = line.split('\t')
is what you would want in that code, if you were to try to use it.
Post a Comment