|
|
|
@ -0,0 +1,356 @@
|
|
|
|
|
"""Format validator for raw (JSON) documents in the es.4 format"""
|
|
|
|
|
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
from hashlib import sha256
|
|
|
|
|
from typing import Any, Dict, Mapping, Optional, Tuple, TypedDict
|
|
|
|
|
|
|
|
|
|
import fastjsonschema
|
|
|
|
|
|
|
|
|
|
from ..base32 import base32_bytes_to_string
|
|
|
|
|
from ..exc import ValidationError
|
|
|
|
|
from ..identity import Identity
|
|
|
|
|
from ..path import Path
|
|
|
|
|
from ..share import Share
|
|
|
|
|
from ..types import B32_CHAR
|
|
|
|
|
from . import Document
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RawDocument(TypedDict, total=False):
    """A raw Document object

    This is the plain-JSON shape of an es.4 document, as described by the ``properties`` of
    ``Es4Document.CORE_SCHEMA``. ``total=False`` makes every key optional for the type checker,
    so a partially filled dict can be built up key by key.
    """

    # Format identifier; the schema only admits the literal string 'es.4'
    format: str
    # Author address (must match Identity.PATTERN in the schema)
    author: str
    # Base32 signature over the document hash
    signature: str
    # The document payload
    content: str
    # Base32 SHA-256 digest of the content
    contentHash: str
    # Expiry time for ephemeral documents as an integer timestamp, or None
    deleteAfter: Optional[int]
    # Document path (must match Path.PATTERN in the schema)
    path: str
    # Share address (must match Share.PATTERN in the schema)
    workspace: str
    # Creation time as an integer timestamp
    timestamp: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Es4Document(Document):  # pylint: disable=too-many-instance-attributes
    """Validator for the 'es.4' format

    Checks if documents are spec-compliant before ingesting, and signs them according to spec.

    See https://earthstar-project.org/specs/data-spec
    """

    # Tolerance for accepting messages from the future (because of clock skew between peers)
    FUTURE_CUTOFF_MICROSECONDS = 600000000  # 10 minutes

    # Allowed valid range of timestamps (in microseconds, not milliseconds); these are taken from
    # the reference implementation, but they are not specced.
    MIN_TIMESTAMP = 10000000000000  # 1970-04-26T17:46:40Z
    MAX_TIMESTAMP = 9007199254740990  # 2255-06-05T23:47:34.740990Z

    # 4 million bytes = 4 megabytes (measured as bytes, not as utf-8 characters)
    MAX_CONTENT_LENGTH = 4000000

    CORE_SCHEMA = {
        'title': 'Earthstar document format es.4',
        'description': 'This schema describes a valid Earthstar document in the es.4 format',
        'type': 'object',
        'properties': {
            'format': {
                'description': 'The format identifier. Must be "es.4"',
                'enum': ['es.4'],
            },
            'author': {
                'type': 'string',
                'pattern': Identity.PATTERN,
            },
            'content': {
                'type': 'string',
                # NOTE: JSON Schema counts characters here, not bytes; the byte limit is
                # enforced separately in _check_basic_document_validity()
                'maxLength': MAX_CONTENT_LENGTH,
            },
            'contentHash': {
                'type': 'string',
                'pattern': f'^b[{B32_CHAR}]{{52}}$',
            },
            'deleteAfter': {
                'oneOf': [
                    {
                        'type': 'integer',
                        'minimum': MIN_TIMESTAMP,
                        'maximum': MAX_TIMESTAMP,
                    },
                    {
                        'type': 'null',
                    },
                ],
            },
            'path': {
                'type': 'string',
                'pattern': Path.PATTERN,
                'minLength': 2,
                'maxLength': 512,
            },
            'signature': {
                'type': 'string',
                'pattern': f'^b[{B32_CHAR}]{{103}}$',
            },
            'timestamp': {
                'type': 'integer',
                'minimum': MIN_TIMESTAMP,
                'maximum': MAX_TIMESTAMP,
            },
            'workspace': {
                'type': 'string',
                'pattern': Share.PATTERN,
            },
        },
        'required': [
            'format',
            'timestamp',
            'deleteAfter',
            'author',
            'path',
            'workspace',
            'signature',
            'contentHash',
            'content',
        ],
        'additionalProperties': False,
    }

    __document_format__ = 'es.4'

    # Compiled once at class creation; wrapped in staticmethod so that calling it through
    # ``cls``/``self`` does not bind the class or instance as first argument.
    _validator = staticmethod(fastjsonschema.compile(CORE_SCHEMA))

    def __init__(  # pylint: disable=too-many-arguments
        self,
        author: Identity,
        share: Share,
        path: Path,
        timestamp: Optional[datetime] = None,
        content: Optional[str] = None,
        content_hash: Optional[str] = None,
        signature: Optional[str] = None,
        delete_after: Optional[datetime] = None,
    ):
        """Create a document; no validation is performed here

        :param author: the identity the document is attributed to
        :param share: the share the document belongs to
        :param path: the path of the document
        :param timestamp: creation time; defaults to the current time (timezone-aware, UTC)
        :param content: the document content; ``None`` is stored as an empty string
        :param content_hash: a precomputed content hash; calculated lazily if omitted
        :param signature: an existing signature, if any
        :param delete_after: expiry time for ephemeral documents, ``None`` otherwise
        """

        self.author: Identity = author
        self.path = path
        self.signature = signature
        self._content = content or ''
        self._content_hash = content_hash
        self.delete_after = delete_after
        self.share = share

        # The default must be timezone-aware: a naive utcnow() value would later be
        # interpreted as *local* time when it is converted to a Unix timestamp.
        self.timestamp = timestamp if timestamp is not None else datetime.now(timezone.utc)

    @staticmethod
    def _to_microseconds(moment: datetime) -> int:
        """Convert a datetime to integer microseconds since the Unix epoch

        Only integer arithmetic is used, so no microsecond can be lost to float rounding or
        truncation. A naive datetime is treated as local time, which is what
        ``datetime.timestamp()`` does as well.
        """

        epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
        elapsed = moment.astimezone(timezone.utc) - epoch

        return (elapsed.days * 86400 + elapsed.seconds) * 1000000 + elapsed.microseconds

    @staticmethod
    def _from_microseconds(value: int) -> datetime:
        """Convert integer microseconds since the Unix epoch to a timezone-aware UTC datetime"""

        # Split with integer math instead of dividing by a float to keep the exact value
        seconds, microseconds = divmod(value, 1000000)

        return datetime.fromtimestamp(seconds, timezone.utc).replace(microsecond=microseconds)

    @classmethod
    def from_json(cls, raw_document: Dict[str, Any]) -> 'Es4Document':
        """Validate raw_document as an es.4 document and create an ``Es4Document`` from it

        :param raw_document: the decoded JSON object
        :returns: a new ``Es4Document``
        :raises ValidationError: if anything is wrong
        """

        # do this first to ensure we have all the right datatypes in the right fields
        cls._check_basic_document_validity(raw_document)

        # this is the most likely to fail under regular conditions, so do it next (because of
        # clock skew and expired ephemeral documents)
        cls._check_timestamp_is_ok(
            raw_document['timestamp'],
            raw_document['deleteAfter'],
        )

        timestamp = cls._from_microseconds(raw_document['timestamp'])

        if raw_document['deleteAfter'] is not None:
            delete_after: Optional[datetime] = cls._from_microseconds(
                raw_document['deleteAfter']
            )
        else:
            delete_after = None

        author = Identity.from_address(raw_document['author'])
        share = Share.from_address(raw_document['workspace'])
        path = Path(raw_document['path'])

        if not path.can_write(author):
            raise ValidationError(f'Author {author} cannot write to path {path}')

        # do this last since it might be slow on a large document
        cls._check_content_matches_hash(raw_document['content'], raw_document['contentHash'])

        document = cls(
            author,
            share,
            path,
            timestamp=timestamp,
            content=raw_document['content'],
            content_hash=raw_document['contentHash'],
            signature=raw_document['signature'],
            delete_after=delete_after,
        )

        document.validate_signature()

        return document

    @property
    def content_hash(self) -> str:
        """The hash of the document’s content

        Calculated from the content on first access if no hash is stored yet.
        """

        if not self._content_hash:
            self._content_hash = self.hash_value(self.content)

        return self._content_hash

    @staticmethod
    def hash_value(value: str) -> str:
        """Calculate the SHA-256 digest of the value and return its Base32 representation"""

        return base32_bytes_to_string(sha256(value.encode('utf-8')).digest())

    @property
    def content(self) -> str:
        """The content of the document"""

        return self._content

    @content.setter
    def content(self, value: str) -> None:
        self._content = value
        # keep the stored hash in sync with the new content
        self._content_hash = self.hash_value(value)

    def generate_hash(self) -> str:
        """Calculate the hash of the document

        This hash will be signed with the author’s key to populate the signature property.

        The hash input is one ``key<TAB>value<NEWLINE>`` record per field, in ascending key
        order; fields whose value is ``None`` are left out.
        """

        hash_keys: Dict[str, Any] = {
            'author': self.author,
            'contentHash': self.content_hash,
            'deleteAfter': (
                self._to_microseconds(self.delete_after)
                if self.delete_after is not None
                else None
            ),
            'format': self.__document_format__,
            'path': self.path,
            'timestamp': self._to_microseconds(self.timestamp),
            'workspace': self.share,
        }

        hasher = sha256()

        for key in sorted(hash_keys):
            value = hash_keys[key]

            if value is None:
                continue

            hasher.update(f'{key}\t{value}\n'.encode('utf-8'))

        return base32_bytes_to_string(hasher.digest())

    def sign(self, identity: Optional[Identity] = None) -> None:
        """Sign the document and store the result in ``signature``

        :param identity: the identity to sign with; when given, it must be equal to the
            document’s author and replaces the stored author object. When omitted, the
            stored author is used for signing.
        :raises ValidationError: if ``identity`` does not match the document author
        """

        if identity and identity != self.author:
            raise ValidationError(
                "when signing a document, keypair address must match document author"
            )

        if identity:
            self.author = identity

        self.signature = self.author.sign(self.generate_hash())

    @classmethod
    def remove_extra_fields(cls, doc: Dict[str, Any]) -> Tuple[RawDocument, Mapping[str, Any]]:
        """Return a copy of the doc without extra fields, plus the extra fields as a separate object

        This should be run before validating the document with ``from_json``. The output doc
        will be more likely to be valid once the extra fields have been removed.

        :param doc: the decoded JSON object
        :returns: a tuple of (document with schema fields only, extra fields)
        :raises ValidationError: if the input is not a plain JSON object, or if an extra field
            has a name that does not start with an underscore
        """

        if not isinstance(doc, dict):
            raise ValidationError('Document is not a plain JSON object')

        valid_keys = set(cls.CORE_SCHEMA['properties'])  # type: ignore

        doc2: RawDocument = {}
        extras: Dict[str, Any] = {}

        for key, val in doc.items():
            if key in valid_keys:
                doc2[key] = val  # type: ignore
            elif isinstance(key, str) and key.startswith('_'):
                extras[key] = val
            else:
                # also covers non-string keys, which cannot come from JSON
                raise ValidationError(
                    "extra document fields must have names starting with an underscore"
                )

        return (doc2, extras)

    # These are broken out for easier unit testing. They will not normally be used directly; use
    # the main from_json instead.

    @classmethod
    def _check_basic_document_validity(cls, doc: Dict[str, Any]) -> None:
        """Check for correct fields and datatypes

        :raises ValidationError: if the document does not match ``CORE_SCHEMA``, or if its
            content cannot be encoded as UTF-8 or is longer than ``MAX_CONTENT_LENGTH`` bytes
        """

        try:
            cls._validator(doc)
        except Exception as exc:  # pylint: disable=broad-except
            # Exception, not BaseException: KeyboardInterrupt/SystemExit must pass through
            raise ValidationError(exc) from None

        # The schema limits the number of characters; the limit is defined in bytes, though.
        try:
            content_size = len(doc['content'].encode('utf-8'))
        except UnicodeEncodeError:
            # e.g. lone surrogates, which JSON escapes can smuggle into a string
            raise ValidationError('content cannot be encoded as UTF-8') from None

        if content_size > cls.MAX_CONTENT_LENGTH:
            raise ValidationError(
                f'content is longer than {cls.MAX_CONTENT_LENGTH} bytes'
            )

    @classmethod
    def _check_timestamp_is_ok(cls, timestamp: int, delete_after: Optional[int]) -> None:
        """Check for valid timestamp, and expired ephemeral documents

        timestamp and deleteAfter are already verified as good numbers by the schema checker:
        - in the right range of min and max allowed timestamps
        - integers, and not NaN or infinity

        Timestamp must not be too far in the future (but allow some clock skew).

        :param timestamp: document timestamp in microseconds
        :param delete_after: expiry in microseconds, or ``None`` for non-ephemeral documents
        :raises ValidationError: if the timestamp is too far in the future, or the document
            has expired, or it expires at or before its own timestamp
        """

        # Must start from an aware datetime: a naive utcnow() value would be read as local time
        now = cls._to_microseconds(datetime.now(timezone.utc))

        if timestamp > now + cls.FUTURE_CUTOFF_MICROSECONDS:
            raise ValidationError("timestamp too far in the future")

        # Ephemeral documents
        if delete_after is not None:
            # Only valid if expiration date is in the future
            if now > delete_after:
                raise ValidationError("ephemeral doc has expired")

            # Can't expire before it was created, that makes no sense
            if delete_after <= timestamp:
                raise ValidationError("ephemeral doc expired before it was created")

    def validate_signature(self) -> None:
        """Check if the signature is good

        :raises ValidationError: if the document hash cannot be calculated, if there is no
            signature, or if the signature does not verify against the author
        """

        try:
            doc_hash = self.generate_hash()
        except Exception as exc:  # pylint: disable=broad-except
            # Exception, not BaseException: KeyboardInterrupt/SystemExit must pass through
            raise ValidationError(f'Cannot check signature: {exc}') from None

        if not self.signature:
            raise ValidationError('document has no signature assigned')

        if not self.author.verify(doc_hash, self.signature):
            raise ValidationError('signature is invalid')

    @staticmethod
    def _check_content_matches_hash(content: Optional[str], content_hash: str) -> None:
        """Ensure the contentHash matches the actual content

        Nothing is checked if ``content`` is ``None``.

        :raises ValidationError: if the hash of ``content`` differs from ``content_hash``
        """

        # The guard has to come before hashing; None has no encode() method
        if content is None:
            return

        calculated_hash = base32_bytes_to_string(sha256(content.encode('utf-8')).digest())

        if calculated_hash != content_hash:
            raise ValidationError("content does not match contentHash")
|