From 280652d5a7b8c07c69e7ff20eca5f605a0391a76 Mon Sep 17 00:00:00 2001 From: Gergely Polonkai Date: Wed, 4 May 2022 14:31:25 +0200 Subject: [PATCH] Create the es.4 document format --- earthsnake/document/__init__.py | 2 +- earthsnake/document/es4.py | 356 ++++++++++++++++++++++++++++++++ earthsnake/path.py | 2 +- tests/test_document_es4.py | 274 ++++++++++++++++++++++++ 4 files changed, 632 insertions(+), 2 deletions(-) create mode 100644 earthsnake/document/es4.py create mode 100644 tests/test_document_es4.py diff --git a/earthsnake/document/__init__.py b/earthsnake/document/__init__.py index 918967e..5189859 100644 --- a/earthsnake/document/__init__.py +++ b/earthsnake/document/__init__.py @@ -17,7 +17,7 @@ class Document(ABC): # pragma: no cover pylint: disable=too-few-public-methods timestamp: datetime - signature: str + signature: Optional[str] @abstractmethod def sign(self, identity: Optional[Identity] = None) -> None: diff --git a/earthsnake/document/es4.py b/earthsnake/document/es4.py new file mode 100644 index 0000000..c03d231 --- /dev/null +++ b/earthsnake/document/es4.py @@ -0,0 +1,356 @@ +"""Format validator for raw (JSON) documents in the es.4 format""" + +from datetime import datetime, timezone +from hashlib import sha256 +from typing import Any, Dict, Mapping, Optional, Tuple, TypedDict + +import fastjsonschema + +from ..base32 import base32_bytes_to_string +from ..exc import ValidationError +from ..identity import Identity +from ..path import Path +from ..share import Share +from ..types import B32_CHAR +from . import Document + + +class RawDocument(TypedDict, total=False): + """A raw Document object""" + + author: str + signature: str + content: str + contentHash: str + deleteAfter: Optional[int] + path: str + workspace: str + timestamp: int + + +class Es4Document(Document): # pylint: disable=too-many-instance-attributes + """Validator for the 'es.4' format + + Checks if documents are spec-compliant before ingesting, and signs them according to spec. + + See https://earthstar-project.org/specs/data-spec + """ + + # Tolerance for accepting messages from the future (because of clock skew between peers) + FUTURE_CUTOFF_MICROSECONDS = 600000000 # 10 minutes + + # Allowed valid range of timestamps (in microseconds, not milliseconds); these are taken from + # the reference implementation, but they are not specced. + MIN_TIMESTAMP = 10000000000000 # 1983-01-02T06:57:53Z + MAX_TIMESTAMP = 9007199254740990 # 2255-06-05T23:47:34.740990Z + + # 4 million bytes = 4 megabytes (measured as bytes, not as utf-8 characters) + MAX_CONTENT_LENGTH = 4000000 + + CORE_SCHEMA = { + 'title': 'Earthstar document format es.4', + 'description': 'This schema describes a valid Earthstar document in the es.4 format', + 'type': 'object', + 'properties': { + 'format': { + 'description': 'The format identifier. Must be "es.4"', + 'enum': ['es.4'], + }, + 'author': { + 'type': 'string', + 'pattern': Identity.PATTERN, + }, + 'content': { + 'type': 'string', + 'maxLength': MAX_CONTENT_LENGTH, + }, + 'contentHash': { + 'type': 'string', + 'pattern': f'^b[{B32_CHAR}]{{52}}$', + }, + 'deleteAfter': { + 'oneOf': [ + { + 'type': 'integer', + 'minimum': MIN_TIMESTAMP, + 'maximum': MAX_TIMESTAMP, + }, + { + 'type': 'null', + }, + ], + }, + 'path': { + 'type': 'string', + 'pattern': Path.PATTERN, + 'minLength': 2, + 'maxLength': 512, + }, + 'signature': { + 'type': 'string', + 'pattern': f'^b[{B32_CHAR}]{{103}}$', + }, + 'timestamp': { + 'type': 'integer', + 'minimum': MIN_TIMESTAMP, + 'maximum': MAX_TIMESTAMP, + }, + 'workspace': { + 'type': 'string', + 'pattern': Share.PATTERN, + }, + }, + 'required': [ + 'format', + 'timestamp', + 'deleteAfter', + 'author', + 'path', + 'workspace', + 'signature', + 'contentHash', + 'content', + ], + 'additionalProperties': False, + } + + __document_format__ = 'es.4' + + _validator = staticmethod(fastjsonschema.compile(CORE_SCHEMA)) + + def __init__( # pylint: disable=too-many-arguments + self, + author: Identity, + share: Share, + path: Path, + timestamp: Optional[datetime] = None, + content: Optional[str] = None, + content_hash: Optional[str] = None, + signature: Optional[str] = None, + delete_after: Optional[datetime] = None, + ): + self.author: Identity = author + self.path = path + self.signature = signature + self._content = content or '' + self._content_hash = content_hash + self.delete_after = delete_after + self.share = share + self.timestamp = timestamp or datetime.utcnow() + + @classmethod + def from_json(cls, raw_document: Dict[str, Any]) -> 'Es4Document': + """Validate raw_document as an es.4 document and create an ``Es4Document`` from it + + :returns: a new ``Es4Document`` + :raises ValidationError: if anything is wrong + """ + + # do this first to ensure we have all the right datatypes in the right fields + cls._check_basic_document_validity(raw_document) + + # this is the most likely to fail under regular conditions, so do it next (because of + # clock skew and expired ephemeral documents) + cls._check_timestamp_is_ok( + raw_document['timestamp'], + raw_document['deleteAfter'], + ) + + timestamp = datetime.utcfromtimestamp(raw_document['timestamp'] / 1000000).replace( + tzinfo=timezone.utc + ) + + if raw_document['deleteAfter']: + delete_after = datetime.utcfromtimestamp(raw_document['deleteAfter'] / 1000000).replace( + tzinfo=timezone.utc + ) + else: + delete_after = None + + author = Identity.from_address(raw_document['author']) + share = Share.from_address(raw_document['workspace']) + path = Path(raw_document['path']) + + if not path.can_write(author): + raise ValidationError(f'Author {author} cannot write to path {path}') + + # do this last since it might be slow on a large document + cls._check_content_matches_hash(raw_document['content'], raw_document['contentHash']) + + document = cls( + author, + share, + path, + timestamp=timestamp, + content=raw_document['content'], + content_hash=raw_document['contentHash'], + signature=raw_document['signature'], + delete_after=delete_after, + ) + + document.validate_signature() + + return document + + @property + def content_hash(self) -> str: + """The hash of the document’s content""" + + if not self._content_hash: + self._content_hash = self.hash_value(self.content) + + return self._content_hash + + @staticmethod + def hash_value(value: str) -> str: + """Calculate the SHA-256 digest of the value and return its Base32 representation""" + + hasher = sha256() + hasher.update(value.encode('utf-8')) + + return base32_bytes_to_string(hasher.digest()) + + @property + def content(self) -> str: + """The content of the document""" + + return self._content + + @content.setter + def content(self, value: str) -> None: + self._content = value + self._content_hash = self.hash_value(value) + + def generate_hash(self) -> str: + """Calculate the hash of the document + + This hash will be signed with the author’s key to propagate the signature property.""" + + hash_keys: Dict[str, Any] = { + 'author': self.author, + 'contentHash': self.content_hash, + 'deleteAfter': ( + int(self.delete_after.timestamp() * 1000000) if self.delete_after else None + ), + 'format': self.__document_format__, + 'path': self.path, + 'timestamp': int(self.timestamp.timestamp() * 1000000), + 'workspace': self.share, + } + + hasher = sha256() + + for key, value in sorted(hash_keys.items(), key=lambda elem: elem[0]): + if value is None: + continue + + hasher.update(f'{key}\t{value}\n'.encode('utf-8')) + + return base32_bytes_to_string(hasher.digest()) + + def sign(self, identity: Optional[Identity] = None) -> None: + if identity and identity != self.author: + raise ValidationError( + "when signing a document, keypair address must match document author" + ) + + if identity: + self.author = identity + + self.signature = self.author.sign(self.generate_hash()) + + @classmethod + def remove_extra_fields(cls, doc: Dict[str, Any]) -> Tuple[RawDocument, Mapping[str, Any]]: + """Return a copy of the doc without extra fields, plus the extra fields as a separate object + + This should be run before check_document_is_valid. The output doc will be more likely to + be valid once the extra fields have been removed. + + :raises ValidationError: if the inpuc is not a plain JSON object + """ + + if not isinstance(doc, dict): + raise ValidationError('Document is not a plain JSON object') + + valid_keys = set(cls.CORE_SCHEMA['properties'].keys()) # type: ignore + + doc2: RawDocument = {} + extras: Mapping[str, Any] = {} + + for (key, val) in doc.items(): + if key in valid_keys: + doc2[key] = val # type: ignore + else: + if not key.startswith('_'): + raise ValidationError( + "extra document fields must have names starting with an underscore" + ) + + extras[key] = val # type: ignore + + return (doc2, extras) + + # These are broken out for easier unit testing. They will not normally be used directly; use + # the main assertDocumentIsValid instead. + + @classmethod + def _check_basic_document_validity(cls, doc: Dict[str, Any]) -> None: + """Check for correct fields and datatypes""" + + try: + cls._validator(doc) + except BaseException as exc: + raise ValidationError(exc) from None + + # TODO: is there more to check? + + @classmethod + def _check_timestamp_is_ok(cls, timestamp: int, delete_after: Optional[int]) -> None: + """Check for valid timestamp, and expired ephemeral documents + + timestamp and deleteAfter are already verified as good numbers by the schema checker: + - in the right range of min and max allowed timestamps + - integers, and not NaN or infinity + + Timestamp must not be too far in the future (but allow some clock skew). + """ + + now = int(datetime.utcnow().timestamp() * 1000000) + + if timestamp > now + cls.FUTURE_CUTOFF_MICROSECONDS: + raise ValidationError("timestamp too far in the future") + + # Ephemeral documents + if delete_after is not None: + # Only valid if expiration date is in the future + if now > delete_after: + raise ValidationError("ephemeral doc has expired") + + # Can't expire before it was created, that makes no sense + if delete_after <= timestamp: + raise ValidationError("ephemeral doc expired before it was created") + + def validate_signature(self) -> None: + """Check if the signature is good""" + + try: + doc_hash = self.generate_hash() + except BaseException as exc: + raise ValidationError(f'Cannot check signature: {exc}') from None + + if not self.signature: + raise ValidationError('document has no signature assigned') + + if not self.author.verify(doc_hash, self.signature): + raise ValidationError('signature is invalid') + + @staticmethod + def _check_content_matches_hash(content: str, content_hash: str) -> None: + """Ensure the contentHash matches the actual content""" + + hasher = sha256() + hasher.update(content.encode('utf-8')) + calculated_hash = base32_bytes_to_string(hasher.digest()) + + if content is not None and calculated_hash != content_hash: + raise ValidationError("content does not match contentHash") diff --git a/earthsnake/path.py b/earthsnake/path.py index b616fde..1d7adc1 100644 --- a/earthsnake/path.py +++ b/earthsnake/path.py @@ -12,7 +12,7 @@ class Path: """A document path""" _SEGMENT_PATTERN = f'/[{PATH_CHARACTER}]+'.replace('$', '\\$') - _PATTERN = f'^({_SEGMENT_PATTERN})+$' + PATTERN = f'^({_SEGMENT_PATTERN})+$' def __init__(self, path: str) -> None: self.validate(path, allow_ephemeral=True) diff --git a/tests/test_document_es4.py b/tests/test_document_es4.py new file mode 100644 index 0000000..2b64400 --- /dev/null +++ b/tests/test_document_es4.py @@ -0,0 +1,274 @@ +"""Tests for the es.4 document validator""" + +from datetime import datetime, timedelta +from typing import Any, Dict + +from ed25519 import SigningKey +import pytest +from pytest_mock.plugin import MockerFixture + +from earthsnake.exc import ValidationError +from earthsnake.document.es4 import Es4Document +from earthsnake.identity import Identity +from earthsnake.path import Path +from earthsnake.share import Share + + +VALID_DOCUMENT: Dict[str, Any] = { + 'format': 'es.4', + 'timestamp': 1651738871668993, + 'deleteAfter': None, + 'author': '@test.bcz76z52y5dlpohtkmpuj3jsdcvfmebzpcgfmtmhu4u7hlexzreya', + 'path': '/test.txt', + 'workspace': '+test.suffix', + 'signature': ( + 'bughac3ooy5qfect4bxdke2zkun2wqufhpivt57vj2mzq52mhqzesjvhyywttxm7qqjyhoskmiqd2lw72qy2u766r' + 'fqvnbwab4qp3gbi' + ), + 'content': 'test', + 'contentHash': 'bt6dnbamijr6wlgrp5kqmkwwqcwr36ty3fmfyelgrlvwblmhqbiea', +} + + +@pytest.mark.parametrize( + 'missing', + Es4Document.CORE_SCHEMA['required'], # type: ignore +) +def test_validate_missing_keys(missing: str) -> None: + """Test if validation fails when a required key is missing""" + + raw_document = VALID_DOCUMENT.copy() + del raw_document[missing] + + with pytest.raises(ValidationError) as ctx: + Es4Document.from_json(raw_document) + + required_field_names = ', '.join( + repr(x) for x in Es4Document.CORE_SCHEMA['required'] # type: ignore + ) + assert str(ctx.value) == f"data must contain [{required_field_names}] properties" + + +def test_validate_future_document() -> None: + """Test if validation fails when the document is created far in the future""" + + raw_document = VALID_DOCUMENT.copy() + raw_document['timestamp'] = int( + (datetime.utcnow() + timedelta(minutes=20)).timestamp() * 1000000 + ) + + with pytest.raises(ValidationError) as ctx: + Es4Document.from_json(raw_document) + + assert str(ctx.value) == 'timestamp too far in the future' + + +def test_validate_expiry_before_creation() -> None: + """Test if validation fails when the document is set to expire before it has been created""" + + raw_document = VALID_DOCUMENT.copy() + raw_document['timestamp'] = int( + (datetime.utcnow() + timedelta(minutes=5)).timestamp() * 1000000 + ) + raw_document['deleteAfter'] = raw_document['timestamp'] - 1 + + with pytest.raises(ValidationError) as ctx: + Es4Document.from_json(raw_document) + + assert str(ctx.value) == 'ephemeral doc expired before it was created' + + +def test_validate_deleteafter_in_past() -> None: + """Test if validation fails if deleteAfter is in the pas""" + + raw_document = VALID_DOCUMENT.copy() + yesterday = datetime.utcnow() - timedelta(days=1) + raw_document['deleteAfter'] = int(yesterday.timestamp() * 1000000) + + with pytest.raises(ValidationError) as ctx: + Es4Document.from_json(raw_document) + + assert str(ctx.value) == 'ephemeral doc has expired' + + +def test_validate_not_writable() -> None: + """Test if validation fails when the author is not allowed to write at the given path""" + + raw_document = VALID_DOCUMENT.copy() + raw_document['path'] = '/test/~@some.' + ('a' * 52) + + with pytest.raises(ValidationError) as ctx: + Es4Document.from_json(raw_document) + + assert ( + str(ctx.value) + == f'Author {raw_document["author"]} cannot write to path {raw_document["path"]}' + ) + + +def test_validate() -> None: + """Test if validation succeeds on a valid document""" + + document = Es4Document.from_json(VALID_DOCUMENT) + + assert isinstance(document, Es4Document) + + +def test_validate_ephemeral() -> None: + """Test if validation succeeds with a valid deleteAfter value""" + + raw_document = VALID_DOCUMENT.copy() + raw_document['deleteAfter'] = 9007199254740990 + raw_document['signature'] = ( + 'bw7zyncx7u2wrk2jz4ucfjbbc6b5t5rm7ma42vkaaius2yczvqdbvfyfbc6mh345flsaeqizw44gncot5huaskdmk' + 'npkty7mgtzbssdy' + ) + + document = Es4Document.from_json(raw_document) + assert isinstance(document, Es4Document) + + +def test_signature_check_error(mocker: MockerFixture) -> None: + """Test if validation fails if something goes wrong during signature checking""" + + mocker.patch( + 'earthsnake.document.es4.Es4Document.generate_hash', + side_effect=ValueError('test error'), + ) + + with pytest.raises(ValidationError) as ctx: + Es4Document.from_json(VALID_DOCUMENT) + + assert str(ctx.value) == 'Cannot check signature: test error' + + +def test_signature_check_bad_signature() -> None: + """Test if validation fails if the document’s signature doesn’t match the document itself""" + + raw_document = VALID_DOCUMENT.copy() + raw_document['signature'] = 'b' * 104 + + with pytest.raises(ValidationError) as ctx: + Es4Document.from_json(raw_document) + + assert str(ctx.value) == 'signature is invalid' + + +def test_remove_extra_fields_no_underscore() -> None: + """Test if remove_extra_fields() fails on non-specced fields not starting with an underscore""" + + raw_document = VALID_DOCUMENT.copy() + raw_document.update({'test': 'yes'}) + + with pytest.raises(ValidationError) as ctx: + Es4Document.remove_extra_fields(raw_document) + + assert str(ctx.value) == 'extra document fields must have names starting with an underscore' + + +def test_remove_extra_fields_non_object() -> None: + """Test if remove_extra_fields() fails if the validated document is not a JSON object""" + + with pytest.raises(ValidationError) as ctx: + Es4Document.remove_extra_fields([]) # type: ignore + + assert str(ctx.value) == 'Document is not a plain JSON object' + + +def test_remove_extra_fields() -> None: + """Test if remove_extra_fields() correctly extracts field names starting with an underscore""" + + raw_document = VALID_DOCUMENT.copy() + raw_document.update({'_test': True}) + + cleaned_doc, removed = Es4Document.remove_extra_fields(raw_document) + + assert cleaned_doc == VALID_DOCUMENT + assert removed == {'_test': True} + + +def test_check_content_hash() -> None: + """Test if validation fails if contentHash is not the hash of content""" + + raw_document = VALID_DOCUMENT.copy() + raw_document['content'] = 'other' + + with pytest.raises(ValidationError) as ctx: + Es4Document.from_json(raw_document) + + assert str(ctx.value) == 'content does not match contentHash' + + +def test_sign_different_author(identity: Identity) -> None: + """Test if sign() fails if the signing identity is not the same as the document author""" + + document = Es4Document.from_json(VALID_DOCUMENT) + + with pytest.raises(ValidationError) as ctx: + document.sign(identity=identity) + + assert str(ctx.value) == 'when signing a document, keypair address must match document author' + + +def test_sign_new_identity() -> None: + """Test if signing a document with a different entity also sets the author""" + + key_seed = ( + b'`_\x8dm\x18\xeem\xe3\\\xeb_\x1aw)\xcd\xb7\xd8\xd9\xdd\xad\x86\x9a#wQ"F\x95\xa1\x178r' + ) + key = SigningKey(key_seed) + identity = Identity('name', sign_key=key) + + document = Es4Document.from_json(VALID_DOCUMENT) + document.author = identity + document.sign() + + assert ( + document.signature + == 'bk76oxkhbydy3itajeeqtryyj7ej5y7hqjffae6ilf4kpyklh2w32hx3ndg6cb3zvlphj46zmuxbetk4cj2fh6' + '5bhxdtzflvf7oamcbi' + ) + assert document.author == identity + + +@pytest.mark.id_name('test') +def test_sign(identity: Identity) -> None: + """Test if the sign() method updates the document with a correct signature""" + + document = Es4Document.from_json(VALID_DOCUMENT) + document.sign(identity=identity) + + assert document.signature == ( + 'bughac3ooy5qfect4bxdke2zkun2wqufhpivt57vj2mzq52mhqzesjvhyywttxm7qqjyhoskmiqd2lw72qy2u766r' + 'fqvnbwab4qp3gbi' + ) + + +def test_content_setter() -> None: + """Check if the content setter recalculates content_hash, too""" + + document = Es4Document.from_json(VALID_DOCUMENT) + document.content = 'new content' + assert document.content_hash != VALID_DOCUMENT['contentHash'] + assert document.content_hash == 'b7yzgbde66w3m67r7srsiajj765xsj5hmaz4phuhqp6mejs77syaq' + + +def test_content_hash_getter() -> None: + """Test if the content_hash getter recalculates the content hash if it’s not already set""" + + document = Es4Document.from_json(VALID_DOCUMENT) + document._content = 'new content' # pylint: disable=protected-access + document._content_hash = None # pylint: disable=protected-access + + assert document.content_hash == 'b7yzgbde66w3m67r7srsiajj765xsj5hmaz4phuhqp6mejs77syaq' + + +def test_validate_unsigned_document(identity: Identity) -> None: + """Test if signature validation fails if there is no signature yet""" + + document = Es4Document(identity, Share.from_address('+test.share'), Path('/test')) + + with pytest.raises(ValidationError) as ctx: + document.validate_signature() + + assert str(ctx.value) == 'document has no signature assigned'