# Source code for rdflib.plugins.serializers.hext

"""
HextuplesSerializer RDF graph serializer for RDFLib.
See <https://github.com/ontola/hextuples> for details about the format.
"""

from __future__ import annotations

import json
import warnings
from typing import IO, Any, Callable, List, Optional, Type, Union, cast

from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, ConjunctiveGraph, Dataset, Graph
from rdflib.namespace import RDF, XSD
from rdflib.serializer import Serializer
from rdflib.term import BNode, IdentifiedNode, Literal, URIRef

try:
    import orjson

    _HAS_ORJSON = True
except ImportError:
    orjson = None  # type: ignore[assignment, unused-ignore]
    _HAS_ORJSON = False

__all__ = ["HextuplesSerializer"]


class HextuplesSerializer(Serializer):
    """
    Serializes RDF graphs to the Hextuples format.

    Each statement is written as one line holding a JSON array of six
    entries: ``[subject, predicate, value, datatype, language, graph]``.
    When ``orjson`` could be imported it is used to encode the lines
    (producing ``bytes``); otherwise the standard-library ``json`` module is
    used and each line is encoded before being written.
    """

    contexts: List[Union[Graph, IdentifiedNode]]
    dumps: Callable

    # Pre-built JSON tokens, (re)assigned on the class by ``__new__``.
    str_local_id: Union[str, Any]
    str_global_id: Union[str, Any]
    empty: Union[str, Any]
    lang_str: Union[str, Any]
    xsd_string: Union[str, Any]

    def __new__(cls, store: Union[Graph, Dataset, ConjunctiveGraph]):
        """
        Create the instance and set the constant JSON tokens on ``cls``.

        With ``orjson`` the tokens are ``orjson.Fragment`` objects wrapping
        the already-quoted bytes; without it they are plain strings that
        ``json.dumps`` quotes itself.

        :param store: the graph to serialize (not used here; it is consumed
            by ``__init__``).
        """

        def token(text: str) -> Union[str, Any]:
            # The texts passed below are fixed identifiers/IRIs, so wrapping
            # them in quotes is enough to obtain the JSON string.
            if _HAS_ORJSON:
                return orjson.Fragment(b'"' + text.encode("utf-8") + b'"')
            return text

        cls.str_local_id = token("localId")
        cls.str_global_id = token("globalId")
        cls.empty = token("")
        cls.lang_str = token(f"{RDF.langString}")
        cls.xsd_string = token(f"{XSD.string}")
        # Zero-argument super() starts the lookup after this class.  The
        # previous ``super(cls, cls)`` started after ``cls``, which for a
        # subclass resolves back to this very method (and calls it without
        # ``store``), so subclasses could not be instantiated.
        return super().__new__(cls)

    def __init__(self, store: Union[Graph, Dataset, ConjunctiveGraph]):
        """
        Record which contexts have to be written for ``store``.

        :param store: a ``Graph``, ``Dataset`` or ``ConjunctiveGraph``.
            For a plain ``Graph`` the graph itself is the only context.
        """
        self.default_context: Optional[Union[Graph, IdentifiedNode]]
        self.graph_type: Union[Type[Graph], Type[Dataset], Type[ConjunctiveGraph]]
        if isinstance(store, (Dataset, ConjunctiveGraph)):
            self.graph_type = (
                Dataset if isinstance(store, Dataset) else ConjunctiveGraph
            )
            self.contexts = list(store.contexts())
            # NOTE(review): if ``store.contexts()`` already yields the default
            # context, appending it here lists it twice -- confirm against
            # the Dataset/ConjunctiveGraph implementation.
            if store.default_context:
                self.default_context = store.default_context
                self.contexts.append(store.default_context)
            else:
                self.default_context = None
        else:
            self.graph_type = Graph
            self.contexts = [store]
            self.default_context = None

        super().__init__(store)

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = "utf-8",
        **kwargs,
    ) -> None:
        """
        Write every serializable triple of every context to ``stream``.

        :param stream: binary stream that receives one line per statement.
        :param base: ignored; a warning is issued when it is not ``None``.
        :param encoding: ignored; a warning is issued for anything other
            than ``None`` or ``"utf-8"`` and utf-8 is used regardless.
        :param kwargs: accepted and ignored.
        :raises Exception: if the store reports ``formula_aware is True``.
        """
        if base is not None:
            warnings.warn(
                "base has no meaning for Hextuples serialization. "
                "I will ignore this value"
            )

        if encoding not in [None, "utf-8"]:
            warnings.warn(
                f"Hextuples files are always utf-8 encoded. "
                f"I was passed: {encoding}, "
                "but I'm still going to use utf-8 anyway!"
            )

        if self.store.formula_aware is True:
            raise Exception(
                "Hextuple serialization can't (yet) handle formula-aware stores"
            )

        context: Union[Graph, IdentifiedNode]
        for context in self.contexts:
            # The graph column is the same for every triple of a context, so
            # build it once per context instead of once per triple.
            context_str = self._context_token(context)
            for triple in context:
                hl = self._hex_line(triple, context_str)
                if hl is not None:
                    stream.write(hl if _HAS_ORJSON else hl.encode())

    def _context_token(self, context: Union[Graph, IdentifiedNode]):
        """Return the value placed in the graph column for ``context``."""
        if self.graph_type is Graph:
            return self.empty
        name = self._context_str(context)
        if _HAS_ORJSON:
            # Let orjson quote *and escape* the name; concatenating raw
            # quotes would emit invalid JSON for a name containing ``"`` or
            # ``\``.
            return orjson.Fragment(orjson.dumps(name))
        return name

    def _hex_line(
        self, triple, context_str: Union[bytes, str]
    ) -> Optional[Union[str, bytes]]:
        """
        Encode one triple as a Hextuples line.

        :param triple: a ``(subject, predicate, object)`` sequence.
        :param context_str: pre-built value for the graph column.
        :return: the newline-terminated line (``bytes`` with orjson, ``str``
            otherwise), or ``None`` when the subject is not a ``URIRef`` or
            ``BNode`` or the object is not a ``URIRef``, ``BNode`` or
            ``Literal`` (e.g. a QuotedGraph).
        """
        subject = triple[0]
        if not isinstance(subject, (URIRef, BNode)):
            # exclude QuotedGraph and other objects used as subjects
            return None

        obj = triple[2]
        language = self.empty
        if isinstance(obj, URIRef):
            value = self._iri_or_bn(obj)
            datatype = self.str_global_id
        elif isinstance(obj, BNode):
            value = self._iri_or_bn(obj)
            datatype = self.str_local_id
        elif isinstance(obj, Literal):
            value = obj
            if obj.datatype is not None:
                datatype = f"{obj.datatype}"
            elif obj.language is not None:
                datatype = self.lang_str
            else:
                datatype = self.xsd_string
            if obj.language is not None:
                language = f"{obj.language}"
        else:
            # can't handle non URI, BN or Literal Object (QuotedGraph)
            return None

        line_list = [
            self._iri_or_bn(subject),
            triple[1],
            value,
            datatype,
            language,
            context_str,
        ]
        if _HAS_ORJSON:
            return orjson.dumps(line_list, option=orjson.OPT_APPEND_NEWLINE)
        return json.dumps(line_list) + "\n"

    def _iri_or_bn(self, i_):
        """
        Return the string form of a ``URIRef`` (its text) or ``BNode`` (its
        ``n3()`` form); ``None`` for anything else.
        """
        if isinstance(i_, URIRef):
            return f"{i_}"
        elif isinstance(i_, BNode):
            return f"{i_.n3()}"
        else:
            return None

    def _context_str(self, context: Union[Graph, IdentifiedNode]) -> str:
        """
        Return the graph name to emit for ``context``.

        The empty string is returned for the dataset default graph, for the
        recorded default context, and whenever a plain ``Graph`` is being
        serialized.  Otherwise the identifier's text (``URIRef``) or its
        ``n3()`` form is returned.
        """
        context_identifier: IdentifiedNode = (
            context.identifier if isinstance(context, Graph) else context
        )
        if context_identifier == DATASET_DEFAULT_GRAPH_ID:
            return ""

        default = self.default_context
        if default is not None:
            is_default_node = (
                isinstance(default, IdentifiedNode) and context_identifier == default
            )
            is_default_graph = (
                isinstance(default, Graph)
                and context_identifier == default.identifier
            )
            if is_default_node or is_default_graph:
                return ""

        if self.graph_type is Graph:
            # Only emit a context name when serializing a Dataset or ConjunctiveGraph
            return ""

        return (
            f"{context_identifier}"
            if isinstance(context_identifier, URIRef)
            else context_identifier.n3()
        )