# earthsnake/earthsnake/document/es4.py

"""Document class for the es.4 format"""

from datetime import datetime, timezone
from hashlib import sha256
from typing import Any, Dict, Mapping, Optional, Tuple, TypedDict

import fastjsonschema

from ..base32 import base32_bytes_to_string
from ..exc import ValidationError
from ..identity import Identity
from ..path import Path
from ..share import Share
from ..types import B32_CHAR
from . import Document


class RawDocument(TypedDict, total=False):
    """A raw Document object

    This is the plain (JSON-style) dictionary form of an es.4 document.  All keys are optional
    at the type level (``total=False``); which ones are actually required is decided by the
    document schema, not by this type.
    """

    # Format identifier; the es.4 schema only accepts the value "es.4"
    format: str
    # Address of the author of the document
    author: str
    # Signature of the document
    signature: str
    # The content of the document
    content: str
    # Hash of ``content``
    contentHash: str
    # Expiry time of an ephemeral document as an integer timestamp, or None if the document
    # does not expire
    deleteAfter: Optional[int]
    # Path of the document
    path: str
    # Address of the share (workspace) the document belongs to
    workspace: str
    # Integer timestamp of the document
    timestamp: int


class Es4Document(Document):  # pylint: disable=too-many-instance-attributes
    """An es.4 format document

    See https://earthstar-project.org/specs/data-spec

    On the instance, ``timestamp`` and ``delete_after`` are ``datetime`` objects; in the raw
    (JSON) form and in the document hash they are integer microseconds since the Unix epoch.
    """

    # Tolerance for accepting messages from the future (because of clock skew between peers)
    FUTURE_CUTOFF_MICROSECONDS = 600000000  # 10 minutes

    # Allowed valid range of timestamps (in microseconds, not milliseconds); these are taken from
    # the reference implementation, but they are not specced.
    MIN_TIMESTAMP = 10000000000000  # 1983-01-02T06:57:53Z
    MAX_TIMESTAMP = 9007199254740990  # 2255-06-05T23:47:34.740990Z

    # 4 million bytes = 4 megabytes (measured as bytes, not as utf-8 characters)
    MAX_CONTENT_LENGTH = 4000000

    CORE_SCHEMA = {
        'title': 'Earthstar document format es.4',
        'description': 'This schema describes a valid Earthstar document in the es.4 format',
        'type': 'object',
        'properties': {
            'format': {
                'description': 'The format identifier. Must be "es.4"',
                'enum': ['es.4'],
            },
            'author': {
                'type': 'string',
                'pattern': Identity.PATTERN,
            },
            'content': {
                'type': 'string',
                'maxLength': MAX_CONTENT_LENGTH,
            },
            'contentHash': {
                'type': 'string',
                'pattern': f'^b[{B32_CHAR}]{{52}}$',
            },
            'deleteAfter': {
                'oneOf': [
                    {
                        'type': 'integer',
                        'minimum': MIN_TIMESTAMP,
                        'maximum': MAX_TIMESTAMP,
                    },
                    {
                        'type': 'null',
                    },
                ],
            },
            'path': {
                'type': 'string',
                'pattern': Path.PATTERN,
                'minLength': 2,
                'maxLength': 512,
            },
            'signature': {
                'type': 'string',
                'pattern': f'^b[{B32_CHAR}]{{103}}$',
            },
            'timestamp': {
                'type': 'integer',
                'minimum': MIN_TIMESTAMP,
                'maximum': MAX_TIMESTAMP,
            },
            'workspace': {
                'type': 'string',
                'pattern': Share.PATTERN,
            },
        },
        'required': [
            'format',
            'timestamp',
            'deleteAfter',
            'author',
            'path',
            'workspace',
            'signature',
            'contentHash',
            'content',
        ],
        'additionalProperties': False,
    }

    __document_format__ = 'es.4'

    _validator = staticmethod(fastjsonschema.compile(CORE_SCHEMA))

    # Reference point for converting between ``datetime`` objects and integer microseconds
    _EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)

    def __init__(  # pylint: disable=too-many-arguments
        self,
        author: Identity,
        share: Share,
        path: Path,
        timestamp: Optional[datetime] = None,
        content: Optional[str] = None,
        content_hash: Optional[str] = None,
        signature: Optional[str] = None,
        delete_after: Optional[datetime] = None,
    ):
        """Create a document; nothing is validated or signed here

        :param author: the identity that authored the document
        :param share: the share the document belongs to
        :param path: the path of the document
        :param timestamp: time of the document; defaults to the current time (timezone-aware, UTC)
        :param content: the content of the document; ``None`` is stored as an empty string
        :param content_hash: a precalculated hash of ``content``; calculated on demand if omitted
        :param signature: the signature of the document, if it is already known
        :param delete_after: expiry time for ephemeral documents, ``None`` otherwise
        """

        self.author = author
        self.path = path
        self.signature = signature
        self._content = content or ''
        self._content_hash = content_hash
        self.delete_after = delete_after
        self.share = share
        # Must be an aware datetime: a naive ``utcnow()`` value would later be interpreted as
        # local time when it gets converted to microseconds
        self.timestamp = datetime.now(timezone.utc) if timestamp is None else timestamp

    @classmethod
    def _datetime_to_microseconds(cls, moment: datetime) -> int:
        """Convert a datetime to integer microseconds since the Unix epoch

        Only integer arithmetic is used.  Going through ``datetime.timestamp()`` involves a float
        that cannot always hold a microsecond-precision value exactly, and truncating that float
        can end up one microsecond off.

        Naive datetimes are interpreted as local time, the same way ``datetime.timestamp()``
        treats them.
        """

        elapsed = moment.astimezone(timezone.utc) - cls._EPOCH

        return (elapsed.days * 86400 + elapsed.seconds) * 1000000 + elapsed.microseconds

    @staticmethod
    def _microseconds_to_datetime(microseconds: int) -> datetime:
        """Convert integer microseconds since the Unix epoch to a timezone-aware UTC datetime"""

        # Split into whole seconds and the sub-second part so that no float is involved
        seconds, remainder = divmod(microseconds, 1000000)

        return datetime.fromtimestamp(seconds, tz=timezone.utc).replace(microsecond=remainder)

    @classmethod
    def from_json(cls, raw_document: Dict[str, Any]) -> 'Es4Document':
        """Validate raw_document as an es.4 document and create an ``Es4Document`` from it

        Checks that the document is schema-compliant, that its timestamps are acceptable, that
        the author may write to the path, that the content matches its hash and that the
        signature is valid.

        :param raw_document: the raw (JSON-style) document
        :returns: a new ``Es4Document``
        :raises ValidationError: if anything is wrong
        """

        # do this first to ensure we have all the right datatypes in the right fields
        cls._check_basic_document_validity(raw_document)

        # this is the most likely to fail under regular conditions, so do it next (because of
        # clock skew and expired ephemeral documents)
        cls._check_timestamp_is_ok(raw_document['timestamp'], raw_document['deleteAfter'])

        timestamp = cls._microseconds_to_datetime(raw_document['timestamp'])

        raw_delete_after = raw_document['deleteAfter']
        delete_after = (
            None if raw_delete_after is None else cls._microseconds_to_datetime(raw_delete_after)
        )

        author = Identity.from_address(raw_document['author'])
        share = Share.from_address(raw_document['workspace'])
        path = Path(raw_document['path'])

        if not path.can_write(author):
            raise ValidationError(f'Author {author} cannot write to path {path}')

        # do this last since it might be slow on a large document
        cls._check_content_matches_hash(raw_document['content'], raw_document['contentHash'])

        document = cls(
            author,
            share,
            path,
            timestamp=timestamp,
            content=raw_document['content'],
            content_hash=raw_document['contentHash'],
            signature=raw_document['signature'],
            delete_after=delete_after,
        )

        document.validate_signature()

        return document

    @property
    def content_hash(self) -> str:
        """The hash of the document’s content

        Calculated from the current content on first access if it is not known yet.
        """

        if not self._content_hash:
            self._content_hash = self.hash_value(self.content)

        return self._content_hash

    @staticmethod
    def hash_value(value: str) -> str:
        """Calculate the SHA-256 digest of the value and return its Base32 representation"""

        hasher = sha256()
        hasher.update(value.encode('utf-8'))

        return base32_bytes_to_string(hasher.digest())

    @property
    def content(self) -> str:
        """The content of the document

        Setting it also recalculates the content hash.
        """

        return self._content

    @content.setter
    def content(self, value: str) -> None:
        self._content = value
        self._content_hash = self.hash_value(value)

    @property
    def content_length(self) -> int:
        """The length of the document’s content, in characters (not in UTF-8 bytes)"""

        return len(self._content)

    def generate_hash(self) -> str:
        """Calculate the hash of the document

        This hash will be signed with the author’s key to populate the signature property.

        :returns: the Base32 representation of the SHA-256 digest of the hashed fields
        """

        hash_keys: Dict[str, Any] = {
            'author': self.author,
            'contentHash': self.content_hash,
            'deleteAfter': (
                None
                if self.delete_after is None
                else self._datetime_to_microseconds(self.delete_after)
            ),
            'format': self.__document_format__,
            'path': self.path,
            'timestamp': self._datetime_to_microseconds(self.timestamp),
            'workspace': self.share,
        }

        hasher = sha256()

        # Fields are hashed in the order of their names
        for key in sorted(hash_keys):
            value = hash_keys[key]

            # Skip null fields
            if value is None:
                continue

            # Otherwise, append the fieldname and value.
            # Tab and newline are our field separators.
            # Convert integers to strings here.
            # (The newline is included on the last field.)
            hasher.update(f'{key}\t{value}\n'.encode('utf-8'))

        # Binary digest, not hex digest string!  Then convert bytes to Earthstar b32 format with
        # leading 'b'.
        return base32_bytes_to_string(hasher.digest())

    def sign(self, identity: Optional[Identity] = None) -> None:
        """Sign the document and store the signature into the document (mutating it)

        :param identity: the identity to sign with; if given, it must be equal to the document’s
            author and it replaces the stored author object.  If omitted, the stored author is
            used for signing.
        :raises ValidationError: if ``identity`` does not match the document’s author
        """

        if identity and identity != self.author:
            raise ValidationError(
                "when signing a document, keypair address must match document author"
            )

        if identity:
            self.author = identity

        self.signature = self.author.sign(self.generate_hash())

    @classmethod
    def remove_extra_fields(cls, doc: Dict[str, Any]) -> Tuple[RawDocument, Mapping[str, Any]]:
        """Return a copy of the doc without extra fields, plus the extra fields as a separate object

        This should be run before the document is validated.  The output doc will be more likely
        to be valid once the extra fields have been removed.

        :param doc: the raw document, possibly holding fields the schema does not know about
        :returns: the document restricted to the fields of the schema, and the extra fields
        :raises ValidationError: if the input is not a plain JSON object, or if the name of an
            extra field does not start with an underscore
        """

        if not isinstance(doc, dict):
            raise ValidationError('Document is not a plain JSON object')

        valid_keys = set(cls.CORE_SCHEMA['properties'])  # type: ignore

        doc2: RawDocument = {}
        extras: Dict[str, Any] = {}

        for key, val in doc.items():
            if key in valid_keys:
                doc2[key] = val  # type: ignore
                continue

            # A non-string key cannot start with an underscore either; report it the same way
            # instead of failing on the missing ``startswith`` method
            if not isinstance(key, str) or not key.startswith('_'):
                raise ValidationError(
                    "extra document fields must have names starting with an underscore"
                )

            extras[key] = val

        return (doc2, extras)

    # These are broken out for easier unit testing.  They will not normally be used directly;
    # from_json runs all of them.

    @classmethod
    def _check_basic_document_validity(cls, doc: Dict[str, Any]) -> None:
        """Check for correct fields and datatypes

        :raises ValidationError: if the schema validator rejects the document
        """

        try:
            cls._validator(doc)
        except Exception as exc:  # pylint: disable=broad-except
            # Exception instead of BaseException, so KeyboardInterrupt and SystemExit are not
            # turned into validation errors
            raise ValidationError(exc) from None

        # TODO: is there more to check?

    @classmethod
    def _check_timestamp_is_ok(cls, timestamp: int, delete_after: Optional[int]) -> None:
        """Check for valid timestamp, and expired ephemeral documents

        timestamp and deleteAfter are already verified as good numbers by the schema checker:
        - in the right range of min and max allowed timestamps
        - integers, and not NaN or infinity

        Timestamp must not be too far in the future (but allow some clock skew).

        :param timestamp: document timestamp in microseconds since the Unix epoch
        :param delete_after: expiry time in microseconds since the Unix epoch, or ``None``
        :raises ValidationError: if the timestamp is too far in the future, or if the document
            is ephemeral and has already expired or expires no later than it was created
        """

        # The current time has to be an aware UTC value; a naive ``utcnow()`` would be taken as
        # local time during the conversion and shift the result by the local UTC offset
        now = cls._datetime_to_microseconds(datetime.now(timezone.utc))

        if timestamp > now + cls.FUTURE_CUTOFF_MICROSECONDS:
            raise ValidationError("timestamp too far in the future")

        # Ephemeral documents
        if delete_after is not None:
            # Only valid if expiration date is in the future
            if now > delete_after:
                raise ValidationError("ephemeral doc has expired")

            # Can't expire before it was created, that makes no sense
            if delete_after <= timestamp:
                raise ValidationError("ephemeral doc expired before it was created")

    def validate_signature(self) -> None:
        """Check if the signature is good

        :raises ValidationError: if the document hash cannot be calculated, if there is no
            signature, or if the signature does not verify against the author
        """

        try:
            doc_hash = self.generate_hash()
        except Exception as exc:  # pylint: disable=broad-except
            raise ValidationError(f'Cannot check signature: {exc}') from None

        if not self.signature:
            raise ValidationError('document has no signature assigned')

        if not self.author.verify(doc_hash, self.signature):
            raise ValidationError('signature is invalid')

    @staticmethod
    def _check_content_matches_hash(content: str, content_hash: str) -> None:
        """Ensure the contentHash matches the actual content

        :raises ValidationError: if the hash calculated from ``content`` differs from
            ``content_hash``
        """

        if Es4Document.hash_value(content) != content_hash:
            raise ValidationError("content does not match contentHash")

    @staticmethod
    def compare_newest_first(doc_a, doc_b):
        """Compare two documents based on their time stamp

        Ties on the time stamp are broken by comparing the signatures.

        :returns: ``Cmp.LT`` if ``doc_a`` has the smaller time stamp (or, on a tie, the smaller
            signature), ``Cmp.GT`` if it has the larger one, ``Cmp.EQ`` if both are equal
        """

        # NOTE(review): ``Cmp`` is not imported anywhere in the visible part of this module; it
        # has to be brought into scope for this method to work.
        # NOTE(review): despite the name, the document with the *smaller* time stamp compares as
        # ``LT`` here; confirm with the callers that this is the intended direction.

        if doc_a.timestamp < doc_b.timestamp:
            return Cmp.LT

        if doc_a.timestamp > doc_b.timestamp:
            return Cmp.GT

        if doc_a.signature < doc_b.signature:
            return Cmp.LT

        if doc_a.signature > doc_b.signature:
            return Cmp.GT

        return Cmp.EQ