#
# $Source: /home/blais/repos/cvsroot/arubomu/lib/python/arubomu/parsers/amazon.py,v $
# $Id: amazon.py,v 1.19 2004/02/13 05:49:00 blais Exp $
#

"""Parse Amazon HTML.

"""

__version__ = "$Revision: 1.19 $"
__author__ = "Martin Blais <blais@furius.ca>"


import sys, os
import string, re
import StringIO

import htmllib, formatter
import strxtra

from arubomu.parsers.base import Fetcher
from arubomu.album import Song, Musician, Album, Disc


__all__ = ['AmazonFetcher']


def create_fetcher():
    """Build and return a new AmazonFetcher instance."""
    fetcher = AmazonFetcher()
    return fetcher


class AmazonFetcher(Fetcher):
    """Fetcher that builds Amazon product URLs and parses the pages."""

    # URL template; the single %s slot receives the catalog number (ASIN).
    urlt = 'http://www.amazon.com/exec/obidos/ASIN/%s'

    def geturl(self, catalog_number):
        """Return the product page URL for `catalog_number`."""
        url = self.urlt % catalog_number
        return url

    def parse(self, text):
        """Parse the HTML `text` of a product page.

        Returns a 2-tuple: the album built by the parser, and a
        one-element list holding the parser's largest image URL (the
        element is None when the parser found no matching image).
        """
        html_parser = AmazonMusicParser()
        html_parser.feed(text)
        html_parser.close()

        album = html_parser.get_results()
        cover_urls = [html_parser.getLargestImage()]
        return (album, cover_urls)
    


def attmap(attlist):
    """Convert an attributes list into an attributes map.

    `attlist` is a sequence of (name, value) pairs. When a name occurs
    more than once, the value of its last occurrence is kept.
    """
    return dict(attlist)


class AmazonMusicParser(htmllib.HTMLParser):
    """Customized HTML parser that scrapes album data from an Amazon page.

    The parser is a small state machine driven by `self.saving`: tag
    handlers start/stop the base class' text capture (`save_bgn()` /
    `save_end()`) and use `self.saving` to remember what the captured
    text is for.  Scalar results are accumulated in `self.dmap`, tracks
    in `self.discs`, and candidate cover image URLs in `self.images`.
    """

    class Results: pass

    # Patterns matched against the captured text of <b> elements to
    # decide which field follows.
    tempre = re.compile(r'(-?\d+) / (-?\d+)')
    reldate = re.compile('.*release date', re.I)
    audiocd = re.compile('audio cd', re.I)
    asin = re.compile('asin.*', re.I)
    # Any run of four digits; used to pull years out of date text.
    datere = re.compile('[0-9][0-9][0-9][0-9]')
    label = re.compile('label.*', re.I)
    # Heading text that announces the track listing table.
    tracks = re.compile(r'\s*(listen to samples|track listings)\s*', re.I)
    # Table cell text that must not be taken for a track.
    trktrej = re.compile('^(listen|music|listenmusic)$', re.I)
    # "<number>. <title>" track line.
    trktpos = re.compile(r'^\s*(\d+)\s*\.\s*(.*)\s*$', re.I)
    discre = re.compile(r'^Disc: (\d+)$')

    # Prefix of cover image URLs; the %s slot receives the guessed ASIN.
    gather_img_pfx = 'http://images.amazon.com/images/P/%s'
    # Image URL patterns, tried in this order by getLargestImage().
    img_re = [re.compile('.*LZZ*.jpg$'),
              re.compile('.*SCMZZ*_.jpg$'),
              re.compile('.*THUMBZ*_.jpg$')]

    # Known bracketed title suffixes.  This list is not used to build
    # `care` below (which strips any bracketed text); it is kept as a
    # class attribute.
    title_additions = ['IMPORT',
                       'LIVE',
                       'GOLD CD',
                       'SOUNDTRACK',
                       'EXTRA TRACKS',
                       'BONUS TRACK',
                       'LIMITED EDITION',
                       'ENHANCED',
                       'EXPLICIT LYRICS',
                       'Bonus Tracks',
                       'ORIGINAL RECORDING REMASTERED']
    # Strips bracketed text from album titles.  The match is greedy: it
    # spans from the first '[' to the last ']'.
    care = re.compile(r'\[(.*)\]')

    def __init__(self, verbose=0):
        """Initialize the parser with a null formatter and empty results."""
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter(), verbose)
        self.nofill = 0
        # Name of the field currently being captured, or None.
        self.saving = None

        # Scalar results; keys written by the handlers are 'albumtitle',
        # 'artist', 'reldate', 'audiocd', 'label' and 'asin'.
        self.dmap = {}
        # Disc number -> Disc object.
        self.discs = {}

        # ASIN taken from the page's 'bookmark-url' input, if seen.
        self.asin_guess = None
        # Image URLs that start with the image prefix for asin_guess.
        self.images = []

        # Disc that subsequently parsed tracks are filed under.
        self.curdisc = 1

    def start_title(self, att):
        """Begin capturing the page title unless a capture is in progress."""
        if not self.saving:
            self.save_bgn()
            self.saving = 'title'

    def end_title(self):
        """Check that the page title identifies an Amazon music page.

        Raises ParseError when the title does not start with
        'Amazon.com: Music'.
        """
        if self.saving == 'title':
            self.saving = None
            title = self.save_end()
            if not title.startswith('Amazon.com: Music'):
                # NOTE(review): ParseError is not defined or imported in
                # this module as shown; confirm where it comes from.
                raise ParseError('Error: not an amazon music page.')

    def start_b(self, att):
        """Begin capturing the text of a <b> element and classify it.

        With no capture in progress, the first <b class="sans"> is taken
        as the album title and <b class="h1"> as a section heading; every
        other <b> is a generic label examined in end_b().
        """
        if not self.saving:
            if 'albumtitle' not in self.dmap and ('class', 'sans') in att:
                self.save_bgn()
                self.saving = 'albumtitle'
            elif ('class', 'h1') in att:
                self.save_bgn()
                self.saving = 'b-h1'
            else:
                self.save_bgn()
                self.saving = 'b'
        else:
            # A capture is already in progress: it is restarted and
            # relabelled as a generic <b>.
            self.save_bgn()
            self.saving = 'b'

    def end_b(self):
        """Act on the captured <b> text according to the current state."""
        if self.saving == 'albumtitle':
            albtitle = self.save_end()
            albtitle = self.care.sub('', albtitle)
            self.dmap['albumtitle'] = albtitle.strip()
            # The text that follows, up to the next </a>, is stored as
            # the artist by end_a().
            self.save_bgn()
            self.saving = 'artist'
        elif self.saving == 'b-h1':
            bh1text = self.save_end()
            self.saving = None
            if self.tracks.match(bh1text):
                # The next <table> holds the track listing.
                self.saving = 'nexttable'
        elif self.saving == 'b':
            btext = self.save_end()
            # The label text decides which field the following text is
            # captured into.
            if self.reldate.match(btext):
                self.save_bgn()
                self.saving = 'reldate'
            elif self.audiocd.match(btext):
                self.save_bgn()
                self.saving = 'audiocd'
            elif self.label.match(btext):
                self.save_bgn()
                self.saving = 'label'
            elif self.asin.match(btext):
                self.save_bgn()
                self.saving = 'asin'
            else:
                mo = self.discre.match(btext)
                if mo:
                    # "Disc: N" switches the disc that tracks are filed
                    # under; self.saving is left as 'b' here.
                    self.curdisc = int(mo.group(1))
                else:
                    self.saving = None

    def start_a(self, att):
        """Record the link target if it is a cover image for the ASIN."""
        # Without a guessed ASIN the image prefix cannot be built; this
        # mirrors the guard in start_img().
        if not self.asin_guess:
            return
        hrefs = [value for name, value in att if name == 'href']
        if len(hrefs) == 1:
            target = hrefs[0]
            if target.startswith(self.gather_img_pfx % self.asin_guess):
                if target not in self.images:
                    self.images.append(target)

    def end_a(self):
        """Store the captured text as the artist when one is expected."""
        if self.saving == 'artist':
            self.dmap['artist'] = self.save_end()
            self.saving = None

    def start_table(self, att):
        """Enter the track listing table if one was announced."""
        if self.saving == 'nexttable':
            self.saving = 'intable'

    def end_table(self):
        """Reset the state at the end of any table."""
        self.saving = None

    def start_td(self, att):
        """Begin capturing the text of any table cell as a track candidate."""
        self.save_bgn()
        self.saving = 'trk'

    def end_td(self):
        """Turn the captured cell text into a Song if it looks like a track.

        A cell is accepted when it matches "<number>. <title>" and is not
        one of the rejected link texts.  The song is filed under the
        current disc, creating the Disc on first use.
        """
        if self.saving == 'trk':
            trkt = self.save_end().strip()
            mo = self.trktpos.match(trkt)
            if not self.trktrej.match(trkt) and mo:
                no, songname = mo.groups()
                no = int(no)
                if self.curdisc not in self.discs:
                    d = Disc()
                    d.no = self.curdisc
                    self.discs[self.curdisc] = d
                s = Song()
                s.no = no
                s.title = songname
                self.discs[self.curdisc].songs[no] = s
            self.saving = 'intable'

    def start_br(self, att):
        """Finish a 'reldate' or 'audiocd' capture at a line break.

        All four-digit runs found in the captured text are stored,
        joined with ', ', under the key named by the current state.
        """
        if self.saving in ('reldate', 'audiocd'):
            datetext = self.save_end()
            self.dmap[self.saving] = ', '.join(self.datere.findall(datetext))
            self.saving = None

    def start_li(self, att):
        """Finish a 'label' or 'asin' capture at the next list item."""
        if self.saving == 'label':
            self.dmap['label'] = self.save_end().strip()
            self.saving = None
        elif self.saving == 'asin':
            self.dmap['asin'] = self.save_end()
            self.saving = None

    def end_ul(self):
        """Finish an 'asin' capture when the list ends first."""
        if self.saving == 'asin':
            self.dmap['asin'] = self.save_end()
            self.saving = None

    def get_results(self):
        """Build and return an Album from the data gathered so far.

        Missing fields default to ''.  The release date falls back to the
        years captured after the "Audio CD" label when no release date
        was found.
        """
        a = Album()
        a.title = self.dmap.get('albumtitle', '')
        a.artist = self.dmap.get('artist', '')
        a.reldate = self.dmap.get('reldate', '')
        if not a.reldate:
            a.reldate = self.dmap.get('audiocd', '')
        a.label = self.dmap.get('label', '')
        a.catalogs.append(('amazon', self.dmap.get('asin', '')))
        a.discs = self.discs

        return a

    def start_input(self, att):
        """Guess the ASIN from the 'bookmark-url' input element.

        The value is expected to look like 'ASIN/<asin>...'; the second
        '/'-separated component is taken as the ASIN.  Values without a
        '/' are ignored.
        """
        attm = attmap(att)
        try:
            if attm['name'] == 'bookmark-url':
                val = attm['value']
                parts = val.split('/')
                if val.startswith('ASIN') and len(parts) > 1:
                    self.asin_guess = parts[1]
        except KeyError:
            # Inputs without a name or value attribute are ignored.
            pass

    def start_img(self, att):
        """Record the image source if it is a cover image for the ASIN."""
        if self.asin_guess:
            attm = attmap(att)
            try:
                src = attm['src']
                if src.startswith(self.gather_img_pfx % self.asin_guess):
                    if src not in self.images:
                        self.images.append(src)
            except KeyError:
                # Images without a src attribute are ignored.
                pass

    def getLargestImage(self):
        """Return the first gathered image URL matching the best pattern.

        The patterns in `img_re` are tried in order and the first
        gathered URL matching the first successful pattern is returned.
        Returns None when no gathered URL matches any pattern.
        """
        for ir in self.img_re:
            fimages = [url for url in self.images if ir.match(url)]
            if fimages:
                return fimages[0]
        return None



def getCoverURL(asin, text=None, cache=None):
    """Return the URL of the largest cover image for `asin`, or None.

    asin  -- catalog number used to build the product page URL.
    text  -- HTML of the product page; fetched when empty or None.
    cache -- optional object whose fetch(asin, url) returns the page
             text; when absent the page is downloaded with urllib.

    Returns None when the parser finds no matching image, or when
    parsing raises ParseError.
    """
    if not text:
        # Same URL template that AmazonFetcher.geturl() uses.
        url = AmazonFetcher.urlt % asin
        if cache:
            text = cache.fetch(asin, url)
        else:
            # fetch text ourselves to find the url of the largest image
            import urllib
            htfile = urllib.urlopen(url)
            try:
                text = htfile.read()
            finally:
                htfile.close()

    try:
        parser = AmazonMusicParser()
        parser.feed(text)
        parser.close()

        url = parser.getLargestImage()
    # NOTE(review): ParseError is not defined or imported in this module
    # as shown; confirm where it comes from.
    except ParseError:
        url = None

    return url
