#
# $Source: /home/blais/repos/cvsroot/arubomu/lib/python/arubomu/parsers/descarga.py,v $
# $Id: descarga.py,v 1.23 2005/07/10 21:21:45 blais Exp $
#

"""Parse Descarga HTML."""

__version__ = "$Revision: 1.23 $"
__author__ = "Martin Blais <blais@furius.ca>"


import sys, os
import string, re, textwrap
import StringIO

from arubomu.parsers.base import Fetcher
from arubomu.album import Song, Musician, Album, Disc
import strxtra



__all__ = ['DescargaFetcher']


def create_fetcher():
    """Return a new DescargaFetcher instance."""
    fetcher = DescargaFetcher()
    return fetcher


class DescargaFetcher(Fetcher):
    """Fetcher that builds Descarga album page URLs and parses their HTML."""

    # URL template of an album page; '%s' takes the normalized catalog number.
    url = 'http://www.descarga.com/cgi-bin/db/%s'
    # URL template of a cover image; '%s' takes the normalized catalog number.
    coverurl = 'http://www.descarga.com/db/images/%s.gif'

    def geturl(self, catalog_number):
        """Return the album page URL for the given catalog number."""
        return self.url % normalizeCatalogNo(catalog_number)

    def parse(self, text):
        """Parse the HTML text of an album page.

        Returns a tuple (album, images): album is the object produced by
        parseHTML() and images is a list holding the cover image URL when
        the page carries a catalog number (empty otherwise).  When
        parseHTML() reports an invalid page by returning None, the result
        is (None, []).
        """
        parsed = parseHTML(text)
        if parsed is None:
            # parseHTML() returns None (not a tuple) for the 'sorry' page;
            # unpacking it directly would raise TypeError.
            return (None, [])
        alb, catnum = parsed

        images = []
        if catnum:
            # Append the cover URL itself; the previous code computed it
            # and then appended the bare catalog number instead.
            images.append(self.coverurl % normalizeCatalogNo(catnum))

        return (alb, images)



# Marker that the site emits on its 'page not found' answer.
sorry_re = re.compile("<H3>Sorry, This page doesn't exist!</H3>")

def validateHTML(html_text):
    """Return 1 when html_text is a real page, 0 when it is the 'sorry' page."""

    is_sorry_page = sorry_re.search(html_text) is not None
    if not is_sorry_page:
        return 1
    return 0  # invalid page



# Patterns used by parseHTML() to locate the fields of an album page.
# Patterns containing backslashes are written as raw strings so that
# sequences such as \s, \d and \( are not treated as (invalid) string
# escapes; the compiled patterns are unchanged.

# A line holding only whitespace.
empty_re = re.compile(r'^\s*$')

# Header fields: artist, album title, then '(label), release date'.
artist_re = re.compile('<H3>(.*)</H3>')
title_re = re.compile('<B><I>(.*)</I></B>')
publ_re = re.compile(r'<BR>[^<]*\(([^\)]+)\),([^<]*)<BR>')

# Section markers delimiting the description/review text.
desc_re = re.compile('<b>Description:</b>')
rev_re = re.compile('<b>Reviews:</b>')
edpick_re = re.compile("<B>Editor's Pick:</B><BR>")
comm_re = re.compile('<!-- (.*) -->')
categ_re = re.compile('<B>Category:</B>')
br_re = re.compile('^<BR>', re.MULTILINE)
# Order number, e.g. 'TL-12345.10'.
catalog_re = re.compile(r'<B>ORDER</B> (TL-[\d\.]+)')

# Song list: marker comment, heading line, then 'text<I>text</I><BR>' lines.
songtitles_re = re.compile('<!-- song titles -->')
songtitles_pre = '<b>Song titles include:</b><br>'
song_re = re.compile('(.*)<I>(.*)</I><BR>')
# Splits a 'left - right' song entry into its two parts.
titart_re = re.compile('(.*) - (.*)')

# Musician list: marker comment, heading line, then 'text<I>text</I><BR>' lines.
musicians_re = re.compile('<!-- musicians -->')
mustitles_pre = '<b>Musicians include:</b><br>'
mus_re = re.compile('(.*)<I>(.*)</I><BR>')

def parseHTML(html_text):
    """Parse the HTML text of an album page and return the Album entry.

    html_text is decoded as ISO-8859-1 before any field is extracted.

    Returns None when the text is the site's 'sorry' page.  Otherwise
    returns a tuple (album, catalog_number): album is an Album filled
    with whatever fields were found (title, label, release date, artist,
    reviews, catalog entry, songs, musicians, category) and
    catalog_number is the matched 'TL-...' order number, or None when the
    page has none.
    """

    if sorry_re.search(html_text):
        return None

    # First convert source file to Unicode.
    html_text = unicode(html_text, 'iso-8859-1')

    a = Album()

    # spos ends up as the start offset of the description/review text: it
    # is pushed past every header field found and up to the first section
    # marker.
    spos = 0

    # title
    title_mo = title_re.search(html_text)
    if title_mo:
        a.title = title_mo.group(1)
        spos = max(spos, title_mo.end())

        # label and release date, which must start right after the title
        mo = publ_re.match(html_text, title_mo.end())
        if mo:
            a.label = mo.group(1)
            a.reldate = strxtra.simplifyWhitespace(mo.group(2))
            # drop a trailing ';' from the release date
            if a.reldate and a.reldate[-1] == ';':
                a.reldate = a.reldate[:-1]

            spos = max(spos, mo.end())

    # artist
    artist_mo = artist_re.search(html_text)
    if artist_mo:
        a.artist = artist_mo.group(1)
        spos = max(spos, artist_mo.end())

    # description + reviews: start at the first marker that is present,
    # tried in the order description, reviews, editor's pick
    modesc = desc_re.search(html_text)
    morev = rev_re.search(html_text)
    moedpick = edpick_re.search(html_text)
    if modesc:
        spos = max(spos, modesc.start())
    elif morev:
        spos = max(spos, morev.start())
    elif moedpick:
        spos = max(spos, moedpick.start())

    mocomm = comm_re.search(html_text, spos)
    mocat = categ_re.search(html_text, spos)

    # The review text stops at the next HTML comment, else at the category
    # marker, else at the end of the page.
    epos = len(html_text)
    if mocomm:
        epos = mocomm.start()
    elif mocat:
        # mocomm is None in this branch, so the bound has to come from the
        # category match (the previous code dereferenced mocomm here).
        epos = mocat.start()

    a.reviews = [formatDescription(html_text[spos:epos])]

    # catalog/order number
    mo = catalog_re.search(html_text)
    if mo:
        catalog_number = mo.group(1)
        a.catalogs.append(('descarga', catalog_number))
    else:
        catalog_number = None

    # songs
    mo = songtitles_re.search(html_text)
    if mo:
        # Consume the lines after the marker that are blank, the heading
        # prefix, or a song entry; stop at the first line that is none of
        # these.
        titles = {}
        for l in html_text[mo.end():].splitlines():
            l = l.strip()
            if l == '' or l == songtitles_pre:
                continue

            song_mo = song_re.match(l)
            if not song_mo:
                break

            # On compilations an entry may read 'title - artist'.
            if a.artist == 'Various Artists':
                tamo = titart_re.match(song_mo.group(1).strip())
            else:
                tamo = None

            songno = len(titles) + 1
            s = Song(songno)
            if tamo:
                s.title = tamo.group(1).strip()
                s.artist = tamo.group(2).strip()
            else:
                s.title = song_mo.group(1).strip()
            s.duration = song_mo.group(2).strip()
            titles[songno] = s

        # All songs are stored on a single disc numbered 1.
        disc = Disc()
        disc.no = 1
        a.discs[1] = disc
        disc.songs = titles

    # musicians
    mo = musicians_re.search(html_text)
    if mo:
        # Same scan as for the songs, with the musician heading and pattern.
        mus = []
        for l in html_text[mo.end():].splitlines():
            l = l.strip()
            if l == '' or l == mustitles_pre:
                continue

            mus_mo = mus_re.match(l)
            if not mus_mo:
                break

            m = Musician()
            m.name = mus_mo.group(1).strip()
            m.instrument = mus_mo.group(2).strip()
            mus.append(m)

        a.musicians = mus

    # category: the text between the marker and the next line that starts
    # with <BR>, with tags removed
    if mocat:
        mo = br_re.search(html_text, mocat.end())
        if mo:
            text = html_text[mocat.end():mo.start()]
            notags = strxtra.stripHTMLTags(text)
            a.category = strxtra.simplifyWhitespace(notags)

    return a, catalog_number


def formatDescription(desc):
    """Strip the HTML tags from desc and return it re-wrapped at 70 columns."""

    return textwrap.fill(strxtra.stripHTMLTags(desc), 70)



def normalizeCatalogNo(tlno):
    """Return tlno without a leading 'TL-' and ending in a two-digit suffix.

    When the number does not already end in '.NN' (a dot and two digits),
    '.10' is appended.
    """
    prefix = 'TL-'
    number = tlno
    if number.startswith(prefix):
        number = number[len(prefix):]

    has_suffix = re.match('.*\\.\\d\\d$', number) is not None
    if has_suffix:
        return number
    return number + '.10'



def main():
    """Parse a fixed list of local HTML files and print each resulting album.

    Debugging entry point: for every file the parsed album and its XML
    form (asXML()) are printed.
    """

    filenames = ['/tmp/b.html']
    for fn in filenames:
        f = open(fn, 'r')
        try:
            html = f.read()
        finally:
            # close the handle even when reading fails
            f.close()

        parsed = parseHTML(html)
        if parsed is None:
            # parseHTML() returns None for the 'sorry' page
            print('%s: not a valid album page' % fn)
            continue

        # parseHTML() returns (album, catalog_number); only the album has
        # asXML(), so the tuple must be unpacked before use.
        alb, catalog_number = parsed
        print(alb)
        print(alb.asXML())

# Run main if loaded as a script
if __name__ == "__main__":
    main()

