#! /usr/bin/python
# ecashin@meili:~/script/quotations$ ./splitquot < quotations.html | ssh noserose.net 'cat > public_html/e/quotations.xml'
#
# I just spent about 50 minutes debugging this script only to find
# out that it just doesn't work with Python 2.2.3.  At home, with
# version 2.5.2, it works fine.

import sys, signal, cgi, re, pprint, string, time, sha, anydbm

# One timestamp, taken once at start-up in GMT, is shared by the channel
# header and by every quote that is being seen for the first time.
_now_gmt = time.gmtime()
nowtime = time.strftime("%a, %d %b %Y %H:%M:%S GMT", _now_gmt)

# The guid -> pubDate database lives next to the script itself
# ("<script path>.db"); the 'c' flag creates it when it does not exist yet.
_db_path = sys.argv[0] + '.db'
qdb = anydbm.open(_db_path, 'c')

def title(i):
    """Return a short, HTML-escaped headline for quote number ``i``.

    The lines of ``quotes[i]`` are joined with spaces and then passed
    through an ordered series of regex substitutions that flatten
    whitespace, drop markup, keep at most the first seven words and trim
    trailing punctuation.  The result is escaped with ``cgi.escape``.
    """
    text = string.join(quotes[i])
    # (pattern, replacement, count) -- the order of the steps matters.
    steps = (
        (r'\n', ' ', 0),                     # newlines become spaces
        (r'\s+', ' ', 0),                    # collapse whitespace runs
        (r'^\s+', '', 0),                    # drop leading whitespace
        (r'<.*?>', '', 0),                   # strip tags (non-greedy)
        (r'^[-	 ]*', '', 0),                # drop leading dashes/tabs/spaces
        (r'((\w+\W+){7}).*', r'\1', 1),      # keep only the first seven words
        (r'[-\W]*$', '', 0),                 # trim trailing non-word characters
    )
    for pattern, replacement, count in steps:
        text = re.sub(pattern, replacement, text, count)
    return cgi.escape(text)

def cmt(i):
    """Return the comment (attribution) text that belongs to quote ``i``.

    The lines of ``comments[i]`` are joined with spaces and the first
    ``&#8212;`` entity, together with any spaces right after it, is removed.
    """
    joined = string.join(comments[i])
    return re.sub('&#8212; *', '', joined, 1)

def description(i):
    """Return the escaped HTML used as the RSS description of quote ``i``.

    The markup is the quote wrapped in a ``<blockquote>`` followed by a
    paragraph holding an em dash and the comment from ``cmt(i)``; the whole
    fragment is passed through ``cgi.escape`` so it can sit inside XML.
    """
    pieces = [
        '<blockquote>',
        string.join(quotes[i]),
        '</blockquote>',
        '<p>&mdash;',
        cmt(i),
        '</p>',
    ]
    return cgi.escape(''.join(pieces))

def guid(i):
    """Return the SHA-1 hex digest of the space-joined lines of quote ``i``.

    The digest depends only on the quote text, so an unchanged quote gets
    the same identifier on every run.
    """
    quote_text = string.join(quotes[i])
    digest = sha.new(quote_text)
    return digest.hexdigest()

def item(i):
    """Write the RSS ``<item>`` element for quote number ``i`` to stdout.

    ``qdb`` maps each quote's guid to the publication date it was first
    given.  A quote that is already recorded keeps its stored date; a new
    quote is stamped with ``nowtime`` and that date is stored for later
    runs.
    """
    g = guid(i)
    if qdb.has_key(g):
        date = qdb[g]
    else:
        date = nowtime
        qdb[g] = date
    lines = [
        '      <item>',
        '         <title>' + title(i) + '</title>',
        '         <description>' + description(i) + '</description>',
        '         <pubDate>' + date + '</pubDate>',
        # The guid is a content hash, not a URL.  RSS 2.0 treats a guid as
        # a permalink unless told otherwise, so say so explicitly.
        '         <guid isPermaLink="false">' + g + '</guid>',
        '      </item>',
    ]
    sys.stdout.write('\n'.join(lines) + '\n')

def items():
    """Emit one RSS item per parsed quote, in input order.

    Every quote needs exactly one comment.  When the two lists differ in
    length, "derp!" is written to stderr and the script exits with
    status 1.
    """
    if len(quotes) != len(comments):
        sys.stderr.write("derp!\n")
        sys.exit(1)
    for index, _unused in enumerate(quotes):
        item(index)

def close_db(signo, frame):
    """Signal handler: close the date database and terminate the script.

    ``signo`` and ``frame`` are the standard signal-handler arguments and
    are not used.
    """
    # stdout carries the RSS document, so the notice must go to stderr
    # or it would end up inside the generated feed.
    sys.stderr.write('closing database\n')
    qdb.close()
    # sys.exit is always available; the bare ``exit`` builtin is only
    # installed by the ``site`` module.
    sys.exit(0)

# Close the database cleanly when the script is hung up on or interrupted.
for _signo in (signal.SIGHUP, signal.SIGINT):
    signal.signal(_signo, close_db)

# Markers that delimit the quotes section of the input page.
start = re.compile('### BEGIN QUOTES ###')
end = re.compile('### END QUOTES ###')

# Markup recognised while reading the quotes section.
squot = re.compile('<blockquote>', re.IGNORECASE)
equot = re.compile('</blockquote>', re.IGNORECASE)
hr = re.compile('</?hr>', re.IGNORECASE)
empty_par = re.compile(r'<p>\s*</p>', re.IGNORECASE)
dash = re.compile('&#8212; *')

# Skip everything up to and including the line holding the begin marker.
for line in sys.stdin:
    if start.search(line):
        break

# quotes[n] and comments[n] are lists of the lines collected for the n-th
# quote and for the text that follows it.
quotes = []
comments = []
buf = []
for line in sys.stdin:
    if end.search(line):
        break
    if squot.search(line):
        # Lines gathered since the previous quote closed are its comment.
        if buf:
            comments.append(buf)
        buf = []
        continue
    if equot.search(line):
        # Lines gathered since the opening tag are the quote itself.
        quotes.append(buf)
        buf = []
        continue
    # Ordinary content: drop <hr> tags and empty paragraphs, and ignore
    # lines that are left with nothing but their newline.
    line = empty_par.sub('', hr.sub('', line))
    if line != '\n':
        buf.append(line)
# Whatever followed the last quote is that quote's comment.
comments.append(buf)

#pprint.pprint(comments)
# pprint.pprint(quotes)
#sys.exit(0)

# Channel header of the RSS 2.0 document; nowtime is used both as the
# channel's pubDate and as its lastBuildDate.
preamble = '''<?xml version="1.0"?>
<rss version="2.0">
   <channel>
      <title>Ed Cashin's Intriguing Quotes</title>
      <link>http://noserose.net/e/quotations.html</link>
      <description>Interesting quotations showing various perspectives.</description>
      <language>en-us</language>
      <pubDate>''' + nowtime + '''</pubDate>
      <lastBuildDate>''' + nowtime + '''</lastBuildDate>
      <docs>http://blogs.law.harvard.edu/tech/rss</docs>
      <generator>Custom Stuff by Ed Cashin</generator>
      <managingEditor>ecashin@noserose.net</managingEditor>
      <webMaster>ecashin@noserose.net</webMaster>
'''

# Closing tags of the document.
finish = '''</channel>
</rss>'''

try:
    sys.stdout.write(preamble)
    items()
    sys.stdout.write(finish)
finally:
    # Close the database explicitly so newly recorded publication dates
    # are not left to interpreter shutdown.  The finally clause also runs
    # when items() bails out through sys.exit().
    qdb.close()
