The SSH host key has changed on 8 April, 2022 to this one: SHA256:573uTBSeh74kvOo0HJXi5ijdzRm8me27suzNEDlGyrQ
Python implementation of Earthstar
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

389 lines
12 KiB

8 months ago
"""Document class for the es.4 format"""
from datetime import datetime, timezone
from hashlib import sha256
from typing import Any, Dict, Mapping, Optional, Tuple, TypedDict
import fastjsonschema
from ..base32 import base32_bytes_to_string
from ..exc import ValidationError
from ..identity import Identity
from ..path import Path
from ..share import Share
from ..types import B32_CHAR
from . import Document
class RawDocument(TypedDict, total=False):
    """A raw Document object

    The plain dictionary form of an es.4 document. ``total=False`` makes every key optional as
    far as type checkers are concerned.
    """

    # Author address; parsed with ``Identity.from_address`` when loading a document
    author: str
    # Signature of the document, as a string
    signature: str
    # The document content itself
    content: str
    # Hash of ``content``; checked against the actual content when loading a document
    contentHash: str
    # Expiry time in microseconds since the Unix epoch, or ``None`` for non-ephemeral documents
    deleteAfter: Optional[int]
    # Document path; wrapped in ``Path`` when loading a document
    path: str
    # Share address; parsed with ``Share.from_address`` when loading a document
    workspace: str
    # Creation time in microseconds since the Unix epoch
    timestamp: int
class Es4Document(Document): # pylint: disable=too-many-instance-attributes
8 months ago
"""An es.4 format document
# Tolerance for accepting messages from the future (because of clock skew between peers)
FUTURE_CUTOFF_MICROSECONDS = 600000000 # 10 minutes
# Allowed valid range of timestamps (in microseconds, not milliseconds); these are taken from
# the reference implementation, but they are not specced.
MIN_TIMESTAMP = 10000000000000 # 1983-01-02T06:57:53Z
MAX_TIMESTAMP = 9007199254740990 # 2255-06-05T23:47:34.740990Z
# 4 million bytes = 4 megabytes (measured as bytes, not as utf-8 characters)
'title': 'Earthstar document format es.4',
'description': 'This schema describes a valid Earthstar document in the es.4 format',
'type': 'object',
'properties': {
'format': {
'description': 'The format identifier. Must be "es.4"',
'enum': ['es.4'],
'author': {
'type': 'string',
'pattern': Identity.PATTERN,
'content': {
'type': 'string',
'contentHash': {
'type': 'string',
'pattern': f'^b[{B32_CHAR}]{{52}}$',
'deleteAfter': {
'oneOf': [
'type': 'integer',
'minimum': MIN_TIMESTAMP,
'maximum': MAX_TIMESTAMP,
'type': 'null',
'path': {
'type': 'string',
'pattern': Path.PATTERN,
'minLength': 2,
'maxLength': 512,
'signature': {
'type': 'string',
'pattern': f'^b[{B32_CHAR}]{{103}}$',
'timestamp': {
'type': 'integer',
'minimum': MIN_TIMESTAMP,
'maximum': MAX_TIMESTAMP,
'workspace': {
'type': 'string',
'pattern': Share.PATTERN,
'required': [
'additionalProperties': False,
__document_format__ = 'es.4'
_validator = staticmethod(fastjsonschema.compile(CORE_SCHEMA))
def __init__( # pylint: disable=too-many-arguments
author: Identity,
share: Share,
path: Path,
timestamp: Optional[datetime] = None,
content: Optional[str] = None,
content_hash: Optional[str] = None,
signature: Optional[str] = None,
delete_after: Optional[datetime] = None,
8 months ago = author
self.path = path
self.signature = signature
self._content = content or ''
self._content_hash = content_hash
self.delete_after = delete_after
self.share = share
self.timestamp = timestamp or datetime.utcnow()
def from_json(cls, raw_document: Dict[str, Any]) -> 'Es4Document':
"""Validate raw_document as an es.4 document and create an ``Es4Document`` from it
8 months ago
Checks if documents are spec-compliant before ingesting, and signs them according to spec.
:returns: a new ``Es4Document``
:raises ValidationError: if anything is wrong
# do this first to ensure we have all the right datatypes in the right fields
# this is the most likely to fail under regular conditions, so do it next (because of
# clock skew and expired ephemeral documents)
timestamp = datetime.utcfromtimestamp(raw_document['timestamp'] / 1000000).replace(
if raw_document['deleteAfter']:
delete_after = datetime.utcfromtimestamp(raw_document['deleteAfter'] / 1000000).replace(
delete_after = None
author = Identity.from_address(raw_document['author'])
share = Share.from_address(raw_document['workspace'])
path = Path(raw_document['path'])
if not path.can_write(author):
raise ValidationError(f'Author {author} cannot write to path {path}')
# do this last since it might be slow on a large document
cls._check_content_matches_hash(raw_document['content'], raw_document['contentHash'])
document = cls(
return document
def content_hash(self) -> str:
    """The hash of the document’s content

    The value is calculated with ``hash_value`` the first time it is needed and cached on the
    instance afterwards.
    """

    cached = self._content_hash

    if cached:
        return cached

    self._content_hash = self.hash_value(self.content)

    return self._content_hash
def hash_value(value: str) -> str:
"""Calculate the SHA-256 digest of the value and return its Base32 representation"""
hasher = sha256()
return base32_bytes_to_string(hasher.digest())
def content(self) -> str:
    """The content of the document

    :returns: the string currently stored in ``_content``
    """
    return self._content
def content(self, value: str) -> None:
    """Set the content of the document and recalculate the cached content hash

    :param value: the new content
    """
    self._content = value
    # Keep the cached hash in sync with the new content
    self._content_hash = self.hash_value(value)
def content_length(self) -> int:
    """The length of the document’s content, as reported by ``len()`` on the content string"""
    # NOTE(review): ``len()`` on a ``str`` counts characters, not UTF-8 encoded bytes; confirm
    # this is the intended unit wherever this value is compared against a byte-based limit.
    return len(self._content)
def generate_hash(self) -> str:
"""Calculate the hash of the document
This hash will be signed with the author’s key to populate the signature property."""
hash_keys: Dict[str, Any] = {
'contentHash': self.content_hash,
'deleteAfter': (
int(self.delete_after.timestamp() * 1000000) if self.delete_after else None
'format': self.__document_format__,
'path': self.path,
'timestamp': int(self.timestamp.timestamp() * 1000000),
'workspace': self.share,
hasher = sha256()
for key, value in sorted(hash_keys.items(), key=lambda elem: elem[0]):
8 months ago
# Skip null fields
if value is None:
8 months ago
# Otherwise, append the fieldname and value.
# Tab and newline are our field separators.
# Convert integers to strings here.
# (The newline is included on the last field.)
8 months ago
# Binary digest, not hex digest string! Then convert bytes to Earthstar b32 format with
# leading 'b'.
return base32_bytes_to_string(hasher.digest())
def sign(self, identity: Optional[Identity] = None) -> None:
8 months ago
"""Sign the document and store the signature into the document (mutating it)
if identity and identity !=
raise ValidationError(
"when signing a document, keypair address must match document author"
if identity: = identity
self.signature =
def remove_extra_fields(cls, doc: Dict[str, Any]) -> Tuple[RawDocument, Mapping[str, Any]]:
"""Return a copy of the doc without extra fields, plus the extra fields as a separate object
This should be run before check_document_is_valid. The output doc will be more likely to
be valid once the extra fields have been removed.
:raises ValidationError: if the input is not a plain JSON object
if not isinstance(doc, dict):
raise ValidationError('Document is not a plain JSON object')
valid_keys = set(cls.CORE_SCHEMA['properties'].keys()) # type: ignore
doc2: RawDocument = {}
extras: Mapping[str, Any] = {}
for (key, val) in doc.items():
if key in valid_keys:
doc2[key] = val # type: ignore
if not key.startswith('_'):
raise ValidationError(
"extra document fields must have names starting with an underscore"
extras[key] = val # type: ignore
return (doc2, extras)
# These are broken out for easier unit testing. They will not normally be used directly; use
# the main assertDocumentIsValid instead.
def _check_basic_document_validity(cls, doc: Dict[str, Any]) -> None:
"""Check for correct fields and datatypes"""
except BaseException as exc:
raise ValidationError(exc) from None
# TODO: is there more to check?
def _check_timestamp_is_ok(cls, timestamp: int, delete_after: Optional[int]) -> None:
    """Check for valid timestamp, and expired ephemeral documents

    timestamp and deleteAfter are already verified as good numbers by the schema checker:

    - in the right range of min and max allowed timestamps
    - integers, and not NaN or infinity

    Timestamp must not be too far in the future (but allow some clock skew).

    :param timestamp: the document’s timestamp, in microseconds since the Unix epoch
    :param delete_after: the expiry time in microseconds since the Unix epoch, or ``None`` if
        the document is not ephemeral
    :raises ValidationError: if the timestamp is further in the future than
        ``FUTURE_CUTOFF_MICROSECONDS`` allows, if the document has already expired, or if it
        would expire at or before its own timestamp
    """

    # Use a timezone-aware UTC datetime here.  ``datetime.utcnow()`` returns a naive object and
    # ``.timestamp()`` interprets naive objects as local time, which would shift ``now`` by the
    # local UTC offset on any machine that is not running in UTC.
    now = int(datetime.now(timezone.utc).timestamp() * 1000000)

    if timestamp > now + cls.FUTURE_CUTOFF_MICROSECONDS:
        raise ValidationError("timestamp too far in the future")

    # Ephemeral documents
    if delete_after is not None:
        # Only valid if expiration date is in the future
        if now > delete_after:
            raise ValidationError("ephemeral doc has expired")

        # Can't expire before it was created, that makes no sense
        if delete_after <= timestamp:
            raise ValidationError("ephemeral doc expired before it was created")
def validate_signature(self) -> None:
"""Check if the signature is good"""
doc_hash = self.generate_hash()
except BaseException as exc:
raise ValidationError(f'Cannot check signature: {exc}') from None
if not self.signature:
raise ValidationError('document has no signature assigned')
if not, self.signature):
raise ValidationError('signature is invalid')
def _check_content_matches_hash(content: str, content_hash: str) -> None:
"""Ensure the contentHash matches the actual content"""
hasher = sha256()
calculated_hash = base32_bytes_to_string(hasher.digest())
if content is not None and calculated_hash != content_hash:
raise ValidationError("content does not match contentHash")
8 months ago
def compare_newest_first(doc_a, doc_b):
    """Compare two documents based on their time stamp

    Documents with equal timestamps are ordered by their signature.

    :param doc_a: the first document
    :param doc_b: the second document
    :returns: ``Cmp.LT`` if ``doc_a`` has the smaller timestamp (or, for equal timestamps, the
        smaller signature), ``Cmp.GT`` if it has the larger one, ``Cmp.EQ`` if both are equal
    """

    # NOTE(review): a smaller (older) timestamp yields ``Cmp.LT``; confirm that callers
    # interpret this the way the "newest first" name suggests.
    if doc_a.timestamp < doc_b.timestamp:
        return Cmp.LT

    # This used to compare ``doc_b.timestamp`` with itself, which is never true, so documents
    # with a newer ``doc_a`` fell through to the signature comparison.
    if doc_a.timestamp > doc_b.timestamp:
        return Cmp.GT

    # Equal timestamps: break the tie using the signature
    if doc_a.signature < doc_b.signature:
        return Cmp.LT

    if doc_a.signature > doc_b.signature:
        return Cmp.GT

    return Cmp.EQ