import sys
import re
import urllib,urllib2
import string
import datetime
from optparse import OptionParser

def _format_url_error_reason(reason):
    """Return a printable description of a ``urllib2.URLError`` reason.

    The reason may be a plain string or an exception object.  When it
    carries at least two ``args`` (errno-style ``(code, message)``) the
    result is ``'code:message'``; otherwise it is ``str(reason)``.
    """
    details = getattr(reason, 'args', None)
    if details and len(details) >= 2:
        return str(details[0])+':'+str(details[1])
    return str(reason)


def readIndexFile(uri = '',level = 0):
    """Fetch the index page at *uri* and append its tree listing to ``buff``.

    Every anchor found by ``anchor_match`` is inspected: hrefs ending in
    ``/`` are treated as sub-directories, listed and followed recursively
    with ``level + 1``; everything else is treated as a file and listed
    only if it passes the command-line filters.

    Parameters:
        uri   -- URI of the index page; callers pass it with a trailing
                 slash because child hrefs are appended to it verbatim.
        level -- nesting depth, used only to indent the generated rows.

    Module globals used: ``buff`` (output, appended to), ``ommit`` (hrefs
    to skip), ``anchor_match`` (anchor regexp), ``options`` (parsed
    command-line options) and ``search`` (compiled name filter or None).

    HTTP and URL errors are reported on stdout and recorded as an ``ERR``
    row in ``buff``; they do not abort the crawl.

    NOTE(review): child URIs are built by plain concatenation, so absolute
    or root-relative hrefs are not resolved -- confirm whether that matters
    for the servers this is used against.
    NOTE: when ``options.exclude`` is set but no extensions were given, no
    files are listed at all (behaviour kept from the original code).
    """
    global buff,ommit,anchor_match,options,search

    # Last path segment of the URI (trailing slash ignored when searching),
    # used to label the error rows.
    dir_label = uri[uri[:-1].rfind('/')+1:]

    request = urllib2.Request(uri)
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first.
        print('ERR: ('+str(e.code)+') Error occured. Current URI:'+uri)
        buff += '\n|-'+'--'*level+dir_label+' ERR: ('+str(e.code)+')'
        return
    except urllib2.URLError as e:
        # The reason is not always an (errno, message) pair; format it safely.
        reason = _format_url_error_reason(e.reason)
        print('ERR: Failed to reach the URI ('+reason+')')
        buff += '\n|-'+'--'*level+dir_label+' ERR: ('+reason+')'
        return

    # Read the whole page and release the connection before recursing, so a
    # deep tree does not keep one open socket per directory level.
    try:
        page = response.read()
    finally:
        response.close()

    for match in re.finditer(anchor_match,page):
        href = match.group(1)
        link_text = match.group(2)
        # Empty hrefs cannot be classified as file or directory; skip them
        # together with the sort links and the parent-directory link.
        if not href or href in ommit or link_text == 'Parent Directory':
            continue

        decoded_uri = urllib.unquote(href)

        if href.endswith('/'):
            # Directory: list it, then descend into it.
            if level > 0:
                buff += '\n| '+'  '*level+'|-'+decoded_uri
            else:
                buff += '\n|-'+decoded_uri
            readIndexFile(uri+href,level+1)
            continue

        if options.dirs_only:
            continue
        if search is not None and not re.match(search,link_text):
            continue

        if options.extensions is None:
            listed = not options.exclude
        else:
            # Text after the last dot; the whole href when there is no dot.
            extension = href[href.rfind('.')+1:]
            # Without -e keep matching extensions, with -e keep the others.
            listed = (extension in options.extensions) != bool(options.exclude)

        if listed:
            if level > 0:
                buff += '\n| '+'  '*level+'| '+decoded_uri
            else:
                buff += '\n| '+decoded_uri
                                                  

# Matches one HTML anchor: group 1 is the href value, group 2 the link text.
anchor_match = re.compile(r'.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?',re.IGNORECASE)
# hrefs that are never listed or followed: Apache column-sort links and '/'
ommit = ['?C=N;O=D','?C=S;O=A','?C=D;O=A','?C=M;O=A','?N=D','?D=A','?M=A','?S=A','/']
# Accumulated tree listing; readIndexFile() appends to it.
buff = ''
# Compiled -r pattern applied to link texts, or None when no filter is set.
search = None

usage = """usage:\tsPyder URI OUTPUT_FILE [-x m3u,cue] [-e] [-r "regexp"] [-d]
\tsPyder http://somedexx.com/dir1/dir%20with%20space c:/dump [-x m3u,cue] [-e] [-r "regexp"] [-d]"""
parser = OptionParser(usage)
parser.add_option('-x','--extensions',action='store',dest='extensions',\
                  help='dump only files with given extensions (separate multiple with ",")')
parser.add_option('-e','--exclude',action='store_true',dest='exclude',default = False,\
                  help='exclude files with given extension')
parser.add_option('-r','--regexp',action='append',dest='search',type='string',\
                  help='files matchin given regexp')
parser.add_option('-d','--dirs',action='store_true',dest='dirs_only',default = False,\
                  help='dumps only directories')
(options,args) = parser.parse_args()

if len(args)<2:
    print('Invalid syntax. Type sPyder -h for more help.\n'+usage)
    # Non-zero status so calling scripts can detect the usage error.
    sys.exit(1)
try:
    f = open(args[1], 'w')
except (IOError, OSError):
    print('Directory does not exist or file not writable: '+str(args[1]))
    sys.exit(1)
else:
    # Close the output file even if spidering fails half-way through.
    try:
        print('Spidering URI.....')
        # readIndexFile() appends child hrefs directly to the URI, so the
        # starting point must end with a slash.
        root_uri = args[0]
        if not root_uri.endswith('/'):
            root_uri = root_uri+'/'
        if options.search is not None:
            # -r may be repeated, but only the first pattern is used.
            search = re.compile(options.search[0])
        if options.extensions is not None:
            options.extensions = options.extensions.split(',')
        readIndexFile(root_uri)
        print('Writing to file...')
        f.write('sPyder 1.2 by starenka <oggova@starenka.net>')
        f.write('\n@'+datetime.datetime.now().strftime("%A %d.%m.%Y %H:%M:%S")+'\n\n'+root_uri)
        f.write(buff)
        print(buff+'\n')
        print('Done ;)')
    finally:
        f.close()