#!/usr/bin/env python
"""
findall.py - Find Occurrences of a Regexp
A Python port of "grep -o", with more features

Usage: %prog [options] [<regexp>] [<file>*]

Author: Sean B. Palmer, inamidst.com. GPL 2
Example: findall -t URL 2001-10.log
"""
# NOTE(review): the placeholders in the Usage line above were lost in the copy
# of this file that was reviewed (it read "[] [*]"); "<regexp>" and "<file>"
# are a reconstruction -- confirm against the upstream script.

import re
import sys
from optparse import OptionParser

# Preset regexps selectable with -t/--token; -l/--list prints this table.
tokens = {
    'all': r'.*',
    'URL': r'http://[^\s<>(),;\'"]+',
    'URI': r'[a-z]+:[^\s<>"]+',
    'literal': r'"[^"\\]*(?:\\.[^"\\]*)*"',
    'todo': r'(?sm)@@[ \n].+?(?:\.(?=[ \n<])|\?(?=[ \n<])|.(?=<)|\n\n)',
    'bNode': r'_:[A-Za-z][A-Za-z0-9]*',
    'RDF-URI': r'<[a-z]+:[^\s<>"]+>',
    # NOTE(review): the original 'word' pattern was truncated to r"(?" in the
    # reviewed copy (everything from "<" up to a later ">" was stripped, and
    # at least one further token entry was probably lost with it). The pattern
    # below is a best-guess reconstruction -- confirm against upstream.
    'word': r"(?<![A-Za-z'])[A-Za-z']+",
    'sh-comment': r'^[ \t]*#[^\r\n]*',
    'shebang': r'^#!/\S+',
    'lo-bytes': r'[\x00-\x7f]+',
    'hi-bytes': r'[\x80-\xff]+',
}


def barf(msg):
    """Write msg to stderr and exit the process with status 1."""
    sys.stderr.write('%s\n' % (msg,))
    sys.exit(1)


def fileGenerator(filenames):
    """Lazily open each named file, yielding one file object at a time.

    Files are opened only when the consumer asks for the next item; the
    consumer is responsible for closing each yielded file.
    """
    for fn in filenames:
        yield open(fn)


def findall(args, opt):
    """Print every match of a regexp found in the given files (or stdin).

    args -- positional command-line arguments. When opt.token is set they
            are all filenames; otherwise args[0] is the regexp and the rest
            are filenames. With no filenames, stdin is read.
    opt  -- parsed options; reads .token, .null, .sepchar, .multiline, .grep.

    Each result is written to stdout followed by the separator. Exits via
    barf() if the token is unknown or the regexp does not compile.
    """
    if opt.token is not False:
        try:
            pattern = tokens[opt.token]
        except KeyError:
            barf("Error: %r token unknown. Try --list." % opt.token)
        filenames = args
    else:
        pattern, filenames = args[0], args[1:]

    if opt.null:
        sepchar = '\x00'
    else:
        sepchar = opt.sepchar

    if not filenames:
        files = [sys.stdin]
    else:
        files = fileGenerator(filenames)

    try:
        regexp = re.compile(pattern)
    except re.error:
        # Only a compile failure means "invalid regexp"; anything else
        # (e.g. KeyboardInterrupt) must propagate unchanged.
        barf("Error: %r is an invalid regexp" % pattern)

    write = sys.stdout.write

    # Pick the per-file worker once so the mode tests are not redone per file.
    if (not opt.multiline) and (not opt.grep):
        def doFile(f):
            # Default mode: emit each match, searching line by line.
            for line in f:
                line = line.rstrip('\r\n')
                for result in regexp.findall(line):
                    write(result + sepchar)
    elif not opt.multiline:
        def doFile(f):
            # Grep mode: emit each whole line that contains a match.
            for line in f:
                line = line.rstrip('\r\n')
                if regexp.search(line):
                    write(line + sepchar)
    else:
        def doFile(f):
            # Multiline mode: match against the entire file content at once.
            content = f.read()
            for result in regexp.findall(content):
                write(result + sepchar)

    for f in files:
        try:
            doFile(f)
        finally:
            # Close even if processing fails, but never close stdin.
            if f is not sys.stdin:
                f.close()

    # Terminate the output with a newline when the separator is not one.
    if sepchar != '\n':
        write('\n')


def _usage():
    """Return the usage text taken from the module docstring, or None.

    None (optparse's default usage) is returned when the docstring is
    unavailable, e.g. under ``python -OO``, or lacks a "Usage:" marker.
    """
    doc = __doc__ or ''
    i = doc.find('Usage:')
    if i < 0:
        return None
    # Skip past "Usage: " (7 characters) and keep the rest of the docstring.
    return doc[i + 7:].rstrip()


def main(argv=None):
    """Parse command-line options and dispatch to the requested mode.

    argv -- list of arguments excluding the program name; None makes
            optparse fall back to sys.argv[1:].
    """
    parser = OptionParser(usage=_usage())

    # Mode flags
    parser.add_option("-m", "--multiline", dest="multiline",
                      action="store_true", default=False,
                      help="read in the whole file at once")
    parser.add_option("-l", "--list", dest="list",
                      action="store_true", default=False,
                      help="list the preset token regexps")
    parser.add_option("-g", "--grep", dest="grep",
                      action="store_true", default=False,
                      help="simulate grep")
    parser.add_option("-n", "--null", dest="null",
                      action="store_true", default=False,
                      help="use null separated output")

    # Argument flags
    parser.add_option("-t", "--token", dest="token", default=False,
                      help="use one of the preset regexps (see --list)",
                      metavar="TOK")
    parser.add_option("-s", "--sepchar", dest="sepchar", default='\n',
                      help="set the sepchar; default is \\n", metavar="CH")

    options, args = parser.parse_args(argv)

    if options.multiline and options.grep:
        barf("Error: -mg (multiline and grep) are mutually exclusive")

    if options.list:
        for key in sorted(tokens):
            sys.stdout.write('%s: %r\n' % (key, tokens[key]))
    elif (len(args) > 0) or options.token:
        findall(args, options)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()