Create the es.4 document format
This commit is contained in:
356
earthsnake/document/es4.py
Normal file
356
earthsnake/document/es4.py
Normal file
@@ -0,0 +1,356 @@
|
||||
"""Format validator for raw (JSON) documents in the es.4 format"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from hashlib import sha256
|
||||
from typing import Any, Dict, Mapping, Optional, Tuple, TypedDict
|
||||
|
||||
import fastjsonschema
|
||||
|
||||
from ..base32 import base32_bytes_to_string
|
||||
from ..exc import ValidationError
|
||||
from ..identity import Identity
|
||||
from ..path import Path
|
||||
from ..share import Share
|
||||
from ..types import B32_CHAR
|
||||
from . import Document
|
||||
|
||||
|
||||
class RawDocument(TypedDict, total=False):
    """A raw (JSON-level) es.4 document

    The keys mirror the properties of ``Es4Document.CORE_SCHEMA``.  The type is declared with
    ``total=False`` so partially filled dictionaries (e.g. while copying fields one by one) still
    type-check.
    """

    # Format identifier; the schema only accepts the value "es.4"
    format: str
    # Address of the author identity
    author: str
    # Base32 encoded signature of the document hash
    signature: str
    # The document content
    content: str
    # Base32 encoded SHA-256 digest of the content
    contentHash: str
    # Expiry time in microseconds since the Unix epoch, or None for non-ephemeral documents
    deleteAfter: Optional[int]
    # Path of the document within the share
    path: str
    # Address of the share (called "workspace" in the es.4 format)
    workspace: str
    # Creation time in microseconds since the Unix epoch
    timestamp: int
|
||||
|
||||
|
||||
class Es4Document(Document):  # pylint: disable=too-many-instance-attributes
    """Validator for the 'es.4' format

    Checks if documents are spec-compliant before ingesting, and signs them according to spec.

    Timestamps are exchanged as integer microseconds since the Unix epoch.  Conversions between
    those integers and ``datetime`` objects are done with integer arithmetic so that a document
    parsed with ``from_json`` hashes to exactly the value that was signed.

    See https://earthstar-project.org/specs/data-spec
    """

    # Tolerance for accepting messages from the future (because of clock skew between peers)
    FUTURE_CUTOFF_MICROSECONDS = 600000000  # 10 minutes

    # Allowed valid range of timestamps (in microseconds, not milliseconds); these are taken from
    # the reference implementation, but they are not specced.
    MIN_TIMESTAMP = 10000000000000  # 10^13 µs = 1970-04-26T17:46:40Z
    MAX_TIMESTAMP = 9007199254740990  # 2255-06-05T23:47:34.740990Z

    # 4 million bytes = 4 megabytes (measured as bytes, not as utf-8 characters)
    MAX_CONTENT_LENGTH = 4000000

    CORE_SCHEMA = {
        'title': 'Earthstar document format es.4',
        'description': 'This schema describes a valid Earthstar document in the es.4 format',
        'type': 'object',
        'properties': {
            'format': {
                'description': 'The format identifier. Must be "es.4"',
                'enum': ['es.4'],
            },
            'author': {
                'type': 'string',
                'pattern': Identity.PATTERN,
            },
            'content': {
                'type': 'string',
                # NOTE: JSON Schema counts characters here, not bytes; the byte limit is enforced
                # separately in _check_basic_document_validity
                'maxLength': MAX_CONTENT_LENGTH,
            },
            'contentHash': {
                'type': 'string',
                'pattern': f'^b[{B32_CHAR}]{{52}}$',
            },
            'deleteAfter': {
                'oneOf': [
                    {
                        'type': 'integer',
                        'minimum': MIN_TIMESTAMP,
                        'maximum': MAX_TIMESTAMP,
                    },
                    {
                        'type': 'null',
                    },
                ],
            },
            'path': {
                'type': 'string',
                'pattern': Path.PATTERN,
                'minLength': 2,
                'maxLength': 512,
            },
            'signature': {
                'type': 'string',
                'pattern': f'^b[{B32_CHAR}]{{103}}$',
            },
            'timestamp': {
                'type': 'integer',
                'minimum': MIN_TIMESTAMP,
                'maximum': MAX_TIMESTAMP,
            },
            'workspace': {
                'type': 'string',
                'pattern': Share.PATTERN,
            },
        },
        'required': [
            'format',
            'timestamp',
            'deleteAfter',
            'author',
            'path',
            'workspace',
            'signature',
            'contentHash',
            'content',
        ],
        'additionalProperties': False,
    }

    __document_format__ = 'es.4'

    # Compiled once at class creation; wrapped in staticmethod so it is not bound as a method
    _validator = staticmethod(fastjsonschema.compile(CORE_SCHEMA))

    # Reference point for converting datetimes to integer microseconds
    _EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)

    def __init__(  # pylint: disable=too-many-arguments
        self,
        author: Identity,
        share: Share,
        path: Path,
        timestamp: Optional[datetime] = None,
        content: Optional[str] = None,
        content_hash: Optional[str] = None,
        signature: Optional[str] = None,
        delete_after: Optional[datetime] = None,
    ):
        """Create a new document object; no validation is performed here

        :param author: the identity the document is attributed to
        :param share: the share the document belongs to
        :param path: the path of the document
        :param timestamp: creation time; defaults to the current time (timezone-aware, UTC)
        :param content: the document content; ``None`` is stored as an empty string
        :param content_hash: a precomputed content hash; calculated lazily when omitted
        :param signature: an existing signature, if any
        :param delete_after: expiry time for ephemeral documents
        """

        self.author: Identity = author
        self.path = path
        self.signature = signature
        self._content = content or ''
        self._content_hash = content_hash
        self.delete_after = delete_after
        self.share = share
        # datetime.utcnow() would yield a naive object, which .timestamp() and friends interpret
        # as local time; an aware UTC value is unambiguous.
        self.timestamp = timestamp if timestamp is not None else datetime.now(timezone.utc)

    @classmethod
    def _to_microseconds(cls, moment: datetime) -> int:
        """Convert a datetime to integer microseconds since the Unix epoch

        Naive datetimes are taken to be UTC.  The calculation uses integer arithmetic only;
        ``int(moment.timestamp() * 1000000)`` goes through a float and can end up one microsecond
        low, which would change the document hash.
        """

        if moment.utcoffset() is None:
            moment = moment.replace(tzinfo=timezone.utc)

        delta = moment - cls._EPOCH

        return (delta.days * 86400 + delta.seconds) * 1000000 + delta.microseconds

    @staticmethod
    def _from_microseconds(value: int) -> datetime:
        """Convert integer microseconds since the Unix epoch to a timezone-aware UTC datetime"""

        # int() because a JSON number such as 1.6e15 may arrive as an integral float
        seconds, microseconds = divmod(int(value), 1000000)

        return datetime.fromtimestamp(seconds, tz=timezone.utc).replace(microsecond=microseconds)

    @classmethod
    def from_json(cls, raw_document: Dict[str, Any]) -> 'Es4Document':
        """Validate raw_document as an es.4 document and create an ``Es4Document`` from it

        :param raw_document: the parsed JSON object
        :returns: a new ``Es4Document``
        :raises ValidationError: if anything is wrong
        """

        # do this first to ensure we have all the right datatypes in the right fields
        cls._check_basic_document_validity(raw_document)

        # this is the most likely to fail under regular conditions, so do it next (because of
        # clock skew and expired ephemeral documents)
        cls._check_timestamp_is_ok(
            raw_document['timestamp'],
            raw_document['deleteAfter'],
        )

        timestamp = cls._from_microseconds(raw_document['timestamp'])

        if raw_document['deleteAfter'] is not None:
            delete_after: Optional[datetime] = cls._from_microseconds(raw_document['deleteAfter'])
        else:
            delete_after = None

        author = Identity.from_address(raw_document['author'])
        share = Share.from_address(raw_document['workspace'])
        path = Path(raw_document['path'])

        if not path.can_write(author):
            raise ValidationError(f'Author {author} cannot write to path {path}')

        # do this last since it might be slow on a large document
        cls._check_content_matches_hash(raw_document['content'], raw_document['contentHash'])

        document = cls(
            author,
            share,
            path,
            timestamp=timestamp,
            content=raw_document['content'],
            content_hash=raw_document['contentHash'],
            signature=raw_document['signature'],
            delete_after=delete_after,
        )

        document.validate_signature()

        return document

    @property
    def content_hash(self) -> str:
        """The hash of the document’s content; calculated on first access if not yet known"""

        if not self._content_hash:
            self._content_hash = self.hash_value(self.content)

        return self._content_hash

    @staticmethod
    def hash_value(value: str) -> str:
        """Calculate the SHA-256 digest of the value and return its Base32 representation"""

        return base32_bytes_to_string(sha256(value.encode('utf-8')).digest())

    @property
    def content(self) -> str:
        """The content of the document"""

        return self._content

    @content.setter
    def content(self, value: str) -> None:
        # keep the cached hash in sync with the content
        self._content = value
        self._content_hash = self.hash_value(value)

    def generate_hash(self) -> str:
        """Calculate the hash of the document

        The fields are fed to SHA-256 in alphabetical key order, one ``key<TAB>value<LF>`` line
        per field; fields whose value is ``None`` are skipped.  This hash will be signed with the
        author’s key to populate the signature property.

        :returns: the Base32 representation of the digest
        """

        hash_keys: Dict[str, Any] = {
            'author': self.author,
            'contentHash': self.content_hash,
            'deleteAfter': (
                self._to_microseconds(self.delete_after) if self.delete_after else None
            ),
            'format': self.__document_format__,
            'path': self.path,
            'timestamp': self._to_microseconds(self.timestamp),
            'workspace': self.share,
        }

        hasher = sha256()

        for key in sorted(hash_keys):
            value = hash_keys[key]

            if value is None:
                continue

            hasher.update(f'{key}\t{value}\n'.encode('utf-8'))

        return base32_bytes_to_string(hasher.digest())

    def sign(self, identity: Optional[Identity] = None) -> None:
        """Sign the document hash and store the result in ``signature``

        :param identity: the identity to sign with; it must compare equal to the document’s
            author and replaces ``self.author`` when given.  If omitted, ``self.author`` signs.
        :raises ValidationError: if ``identity`` does not match the document author
        """

        if identity:
            if identity != self.author:
                raise ValidationError(
                    "when signing a document, keypair address must match document author"
                )

            self.author = identity

        self.signature = self.author.sign(self.generate_hash())

    @classmethod
    def remove_extra_fields(cls, doc: Dict[str, Any]) -> Tuple[RawDocument, Mapping[str, Any]]:
        """Return a copy of the doc without extra fields, plus the extra fields as a separate object

        This should be run before validating the document (see ``from_json``). The output doc
        will be more likely to be valid once the extra fields have been removed.

        :param doc: the parsed JSON object
        :returns: a tuple of the document restricted to the schema’s properties, and a mapping of
            the remaining (underscore-prefixed) fields
        :raises ValidationError: if the input is not a plain JSON object, or if an extra field’s
            name does not start with an underscore
        """

        if not isinstance(doc, dict):
            raise ValidationError('Document is not a plain JSON object')

        valid_keys = set(cls.CORE_SCHEMA['properties'])  # type: ignore

        doc2: RawDocument = {}
        extras: Dict[str, Any] = {}

        for key, val in doc.items():
            if key in valid_keys:
                doc2[key] = val  # type: ignore
                continue

            # JSON object keys are always strings, but a hand-built dict may contain anything
            if not isinstance(key, str) or not key.startswith('_'):
                raise ValidationError(
                    "extra document fields must have names starting with an underscore"
                )

            extras[key] = val

        return (doc2, extras)

    # These are broken out for easier unit testing. They will not normally be used directly; use
    # from_json instead.

    @classmethod
    def _check_basic_document_validity(cls, doc: Dict[str, Any]) -> None:
        """Check for correct fields and datatypes

        :raises ValidationError: if the document does not match ``CORE_SCHEMA`` or its content is
            longer than ``MAX_CONTENT_LENGTH`` bytes
        """

        try:
            cls._validator(doc)
        except fastjsonschema.JsonSchemaException as exc:
            raise ValidationError(exc) from exc

        content = doc['content']

        # The schema’s maxLength counts characters, while the limit is defined in bytes. A UTF-8
        # character takes at most 4 bytes, so short content does not need to be encoded.
        if len(content) * 4 > cls.MAX_CONTENT_LENGTH:
            try:
                byte_length = len(content.encode('utf-8'))
            except UnicodeEncodeError as exc:
                raise ValidationError(f'content cannot be encoded as UTF-8: {exc}') from exc

            if byte_length > cls.MAX_CONTENT_LENGTH:
                raise ValidationError('content is longer than the allowed maximum')

    @classmethod
    def _check_timestamp_is_ok(cls, timestamp: int, delete_after: Optional[int]) -> None:
        """Check for valid timestamp, and expired ephemeral documents

        timestamp and deleteAfter are already verified as good numbers by the schema checker:
          - in the right range of min and max allowed timestamps
          - integers, and not NaN or infinity

        Timestamp must not be too far in the future (but allow some clock skew).

        :param timestamp: creation time in microseconds since the Unix epoch
        :param delete_after: expiry time in microseconds since the Unix epoch, if any
        :raises ValidationError: if the timestamp is too far in the future, or the document has
            expired, or it expires no later than it was created
        """

        # Must be an aware datetime: a naive utcnow() would be read as local time
        now = cls._to_microseconds(datetime.now(timezone.utc))

        if timestamp > now + cls.FUTURE_CUTOFF_MICROSECONDS:
            raise ValidationError("timestamp too far in the future")

        # Ephemeral documents
        if delete_after is not None:
            # Only valid if expiration date is in the future
            if now > delete_after:
                raise ValidationError("ephemeral doc has expired")

            # Can't expire before it was created, that makes no sense
            if delete_after <= timestamp:
                raise ValidationError("ephemeral doc expired before it was created")

    def validate_signature(self) -> None:
        """Check if the signature is good

        :raises ValidationError: if the document hash cannot be calculated, the document has no
            signature, or the signature does not verify against the author
        """

        try:
            doc_hash = self.generate_hash()
        except Exception as exc:  # pylint: disable=broad-except
            raise ValidationError(f'Cannot check signature: {exc}') from exc

        if not self.signature:
            raise ValidationError('document has no signature assigned')

        if not self.author.verify(doc_hash, self.signature):
            raise ValidationError('signature is invalid')

    @classmethod
    def _check_content_matches_hash(cls, content: str, content_hash: str) -> None:
        """Ensure the contentHash matches the actual content

        :raises ValidationError: if the content cannot be encoded as UTF-8 (e.g. it contains a
            lone surrogate) or if its hash differs from ``content_hash``
        """

        try:
            calculated_hash = cls.hash_value(content)
        except UnicodeEncodeError as exc:
            raise ValidationError(f'content cannot be encoded as UTF-8: {exc}') from exc

        if calculated_hash != content_hash:
            raise ValidationError("content does not match contentHash")
|
Reference in New Issue
Block a user