#!/usr/bin/python
# Usage: cat input_data.txt | ./sort-id.py pri1.txt pri2.txt pri3.txt -
import fileinput, re, sys
# Module-level state shared by the functions in this script:
#   pri_files    - priority file names, in the order they were first read
#   id_cache     - record id -> name of the first priority file that listed it
#   save_records - priority file name -> input lines captured for that file
pri_files, id_cache, save_records = [], {}, {}
# Record id pattern: a double quote followed by digits at the start of the line.
# Raw string so that \d reaches the regex engine instead of being parsed as a
# (invalid) string escape.  Precompiling is not necessary but faster.
matcher = re.compile(r'"(\d+)')
def process_priority_file(line):
    """Record which priority file first mentions the id found on *line*.

    Must be called while iterating ``fileinput.input()``: the current file
    name and the first-line flag are read from the ``fileinput`` module.

    The id is the text before the first tab.  A line without a tab is
    treated as a bare id (its line ending is stripped first).
    """
    filename = fileinput.filename()
    if fileinput.isfirstline():
        # New priority file: remember its position in the priority order and
        # start its record capture list.
        pri_files.append(filename)
        save_records[filename] = []
    # Strip the line ending before splitting so a tab-less line does not
    # yield an id with a trailing newline that could never match.
    rec_id = line.rstrip('\r\n').partition('\t')[0]
    # A record is printed only once even if its id shows up in several
    # priority files, so only the first file that lists the id is remembered.
    id_cache.setdefault(rec_id, filename)
def process_stdin(line):
    """Capture an input record if its id was listed in a priority file.

    The id is taken from the start of *line* using the module-level
    ``matcher`` pattern.  If the id is present in ``id_cache``, the unmodified
    line is appended to the capture list of the priority file that claimed it;
    otherwise the line is dropped.

    Exits the program with status 1 when no id can be extracted from *line*.
    """
    matches = matcher.match(line)
    if not matches:
        # Report on stderr: stdout carries the prioritized records and must
        # not be polluted with diagnostics.
        sys.stderr.write('Cannot get id from %s\n' % line.rstrip('\r\n'))
        sys.exit(1)
    pri = id_cache.get(matches.group(1))
    if pri is not None:
        save_records[pri].append(line)
def print_prioritized():
    """Print every captured record, grouped by priority file.

    Groups are emitted in the order the priority files were read
    (``pri_files``); within a group, records keep the order in which they
    were captured.
    """
    for priority in pri_files:
        for record in save_records[priority]:
            # print() appends its own newline, so drop the record's line
            # ending first.  The parenthesized single-argument form behaves
            # the same on Python 2 and Python 3.
            print(record.rstrip('\r\n'))
def read_files():
    """Read the priority files and stdin named on the command line, then print.

    ``fileinput.input()`` with no arguments iterates over the files listed in
    ``sys.argv[1:]``, where ``-`` stands for stdin.  Lines coming from stdin
    are treated as input records; lines from any other file are treated as
    priority-file entries.  Put ``-`` last on the command line so the
    priority files are loaded before the records are matched against them.

    See https://docs.python.org/library/fileinput.html for details.
    """
    for line in fileinput.input():
        if fileinput.isstdin():
            process_stdin(line)
        else:
            process_priority_file(line)
    # Only after all input has been consumed are the captured records printed.
    print_prioritized()
# This test lets you only run something if the module is run from the command line.
# This code won't run if someone does 'import thisfilename'; idiomatically, people will
# sometimes put test code in this stanza so tests can be easily run from the command line.
# You could also put getopts tests for --help, etc. in here.
if __name__ == "__main__":
    # Run only when executed as a script, not when imported as a module.
    read_files()
# Alternative: if you don't want to add a number to the command line and don't like the
# stdin trick used above, use this version of read_files() instead:
def read_files():
    """Read priority files, then the input file, all named on the command line.

    Every argument except the last is a priority file; the last argument is
    the input file holding the records.  Exits with status 2 and a usage
    message on stderr when no arguments are given.
    """
    if len(sys.argv) < 2:
        # Without this check sys.argv[-1] would be the script itself.
        sys.stderr.write(
            'Usage: %s [priority_file ...] input_file\n' % sys.argv[0])
        sys.exit(2)
    priority_files, input_file = sys.argv[1:-1], sys.argv[-1]
    # fileinput.input([]) falls back to reading stdin, so skip the priority
    # pass entirely when no priority files were given.
    if priority_files:
        for line in fileinput.input(priority_files):
            process_priority_file(line)
        # Reset the module-level fileinput state before starting a new pass.
        fileinput.close()
    for line in fileinput.input(input_file):
        process_stdin(line)
    fileinput.close()
    print_prioritized()