#!/usr/bin/env python
# -*- coding: utf-8 -*-

# https://groups.google.com/forum/#!msg/foresite/3vS3_ZZ8Aj0/8tr_SgjbTAUJ

# Some more info than in the README.
#http://code.google.com/p/foresite-toolkit/source/browse/foresite-python/trunk/foresite/README.txt?r=85

# From that URL:

#Parsing existing Resource Maps.
#The argument to ReMDocument can be a filename or a URL.
#
#>>> remdoc = ReMDocument("http://www.openarchives.org/ore/1.0/atom-examples/atom_arXiv_maxi.atom")
#>>> ap = AtomParser()
#>>> rem = ap.parse(remdoc)
#>>> aggr = rem.aggregation
#
#Or an RDF Parser, which requires format to be set on the rem document:
#
#>>> rdfp = RdfLibParser()
#>>> remdoc2.format = 'rdfa' # done by the serializer by default
#>>> rdfp.parse(remdoc2)
#<foresite.ore.ResourceMap object ...
#
#Possible values for format: xml, trix, n3, nt, rdfa
#
#And then re-serialise in a different form:
#
#>>> rdfxml = RdfLibSerializer('xml')
#>>> rem2 = aggr.register_serialization(rdfxml, 'my-rdf-rem-uri')
#>>> remdoc3 = rem2.get_serialization()
#
#Creating arbitrary triples:
#
#>>> something = ArbitraryResource('uri-random')
#>>> a.add_triple(something)
#
#And then treat them like any object
#
#>>> something.title = "Random Title"
#>>> something._rdf.type = URIRef('http://somewhere.org/class/something')



# Stdlib.
import codecs
import datetime
import hashlib
import os
import sys
import StringIO
import pprint

# 3rd party.
import pyxb

# D1.
import d1_common.types.generated.dataoneTypes as dataoneTypes
import d1_common.const
import d1_client.data_package
import d1_client.mnclient

# Foresite / rdflib

from foresite import *
import foresite
from rdflib import URIRef, Namespace, Graph


def dump(obj):
  '''Return a printable representation of an object for debugging.

  Objects that have a ``__dict__`` are converted to a new dict mapping each
  attribute name to the dumped form of its value. Objects whose ``str()`` is
  the default "<... object at 0x...>" form additionally get a ``__type__``
  entry holding that string, unless an attribute with that name already exists.
  Anything without a ``__dict__`` is returned unchanged.

  The object being dumped is never modified: the result is built from a copy
  of its attributes.
  '''
  return _dump(obj, set())


def _dump(obj, ancestor_ids):
  '''Recursive worker for dump(); ``ancestor_ids`` holds the id() of every
  object currently being expanded on the path from the root.'''
  if not hasattr(obj, '__dict__'):
    return obj
  obj_id = id(obj)
  if obj_id in ancestor_ids:
    # The object refers back to one of its own ancestors. Expanding it again
    # would recurse without end, so emit a marker string instead.
    return '<cyclic reference to %s>' % str(obj)
  # Work on a copy so that dumping does not overwrite the live attributes.
  attributes = dict(vars(obj))
  if ' object at ' in str(obj) and '__type__' not in attributes:
    attributes['__type__'] = str(obj)
  ancestor_ids.add(obj_id)
  try:
    for name, value in list(attributes.items()):
      attributes[name] = _dump(value, ancestor_ids)
  finally:
    # Only guard the current path: an object that is merely shared between
    # two siblings is still expanded in both places.
    ancestor_ids.discard(obj_id)
  return attributes


def main():
  '''Print the identifiers referenced by the resource map in "file:ori.xml".'''
  # Parenthesized single-argument form: prints the same on Python 2 and is
  # also valid syntax on Python 3.
  print(get_identifiers_referenced_by_package('file:ori.xml'))


def get_identifiers_referenced_by_package(rdf_xml_doc_uri):
  '''Collect the dcterms:identifier values found in an RDF/XML resource map.

  :rdf_xml_doc_uri: Location of the RDF/XML document. Can be either a file
  URI, like "file:ori.xml", or an HTTP URL.

  Returns a list with the str() of the object of every triple whose predicate
  is http://purl.org/dc/terms/identifier, taken from the graph of each
  resource in the resource map's aggregation.
  '''
  identifier_predicate = 'http://purl.org/dc/terms/identifier'
  rem_doc = ReMDocument(rdf_xml_doc_uri)
  # The RDF parser needs the format to be set on the document before parsing.
  rem_doc.format = 'xml'
  resource_map = foresite.RdfLibParser().parse(rem_doc)
  # The example at
  # http://code.google.com/p/foresite-toolkit/source/browse/foresite-python/trunk/foresite/README.txt?r=85
  # simply references the aggregation directly. Ugly, but I guess that's the
  # way it's meant to be used.
  identifiers = []
  # Iterating the aggregation is expected to yield
  # foresite.ore.AggregatedResource (-> OREResource) objects.
  for resource in resource_map.aggregation:
    # Each triple is (subject, predicate, object). Subject and predicate are
    # rdflib URIRefs; the object is an rdflib Literal or URIRef.
    identifiers.extend(
      str(obj)
      for _subj, pred, obj in resource.graph
      if str(pred) == identifier_predicate
    )
  return identifiers


if __name__ == '__main__':
  main()
