from abc import ABC, abstractmethod
from datetime import datetime
from enum import Enum, auto
from functools import cmp_to_key
import json
import logging
from typing import List, Literal, Mapping, Optional, Tuple, Union
#import { Lock, Superbus } from "../../deps.ts";
from .types import Cmp
#import {
# LocalIndex,
# ShareAddress,
#} from "../util/doc-types.ts";
from .document import Document, IncompleteDocument
from .identity import Identity
from .path import Path
from .share import Share
from .query import HistoryMode, Query
#import {
# IngestEvent,
# IReplica,
# IReplicaDriver,
# ReplicaBusChannel,
# ReplicaId,
#} from "./replica-types.ts";
#import { IFormatValidator } from "../format-validators/format-validator-types.ts";
from .format_validator import FormatValidatorBase
from .exc import EarthsnakeError, ReplicaIsClosedError, ValidationError
#import { randomId } from "../util/misc.ts";
from .util import microsecond_now, random_id
#import { compareArrays } from "./compare.ts";
from .compare import compare_arrays
#import { checkShareIsValid } from "../core-validators/addresses.ts";
#
#import { Crypto } from "../crypto/crypto.ts";
#
# --------------------------------------------------
#
#import { Logger } from "../util/log.ts";
J = json.dumps

logger = logging.getLogger(__name__)

#
# ================================================================================

class IngestEventKind(Enum):
failure = auto()
nothing_happened = auto()
success = auto()
class IngestEventBase:
kind: IngestEventKind
max_local_index: int
class IngestEventFailure(IngestEventBase):
reason: Literal['write_error', 'invalid_document']
err: Optional[Exception]
class IngestEventNothingHappened(IngestEventBase):
reason: Literal['obsolete_from_same_author', 'already_had_it']
doc: Document
class IngestEventSuccess(IngestEventBase):
doc: Document
doc_is_latest: bool
prev_doc_from_same_author: Optional[Document]
prev_latest_doc: Optional[Document]
IngestEvent = Union[IngestEventFailure, IngestEventNothingHappened, IngestEventSuccess]
def doc_compare_newest_first(a: Document, b: Document) -> Cmp:
# Sorts by timestamp DESC (newest first) and breaks ties using the signature ASC.
return compare_arrays(
[a.timestamp, a.signature],
[b.timestamp, b.signature],
["DESC", "ASC"],
)
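
# For example (a sketch): a list of documents can be sorted newest-first by
# adapting this comparator with functools.cmp_to_key, as `ingest` does below:
#
#     docs.sort(key=cmp_to_key(doc_compare_newest_first))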
class Replica: # (IReplica):
"""A replica of a share's data, used to read, write, and synchronise data with other replicas

Should be closed using the `close` method when no longer being used.

.. code-block:: python

replica = Replica('+a.a123', FormatValidator.ES4, MemoryReplicaDriver())
"""
replica_id: str # todo: save it to the driver too, and reload it when starting up
#: The address of the share this replica belongs to
share: Share
#: The validator used to validate ingested documents
format_validator: FormatValidatorBase
replica_driver: ReplicaDriverBase
bus: Superbus[ReplicaBusChannel]
_is_closed = False
_ingest_lock: Lock[IngestEvent]
def __init__(
self,
share: Union[str, Share],
validator: FormatValidatorBase,
driver: ReplicaDriverBase,
) -> None:
if isinstance(share, str):
share = Share.from_string(share)
logger.debug(
'constructor. driver = %s',
driver.__class__.__name__,
)
# If we got a class instead of an actual driver object, let's instantiate the driver
if isinstance(driver, type):
driver = driver(share)
self.replica_id = 'replica-' + random_id()
self.share = share
self.format_validator = validator
self.replica_driver = driver
self.bus = Superbus('|')
self._ingest_lock = Lock()
# --------------------------------------------------
# LIFECYCLE

def is_closed(self) -> bool:
"""Returns whether the replica is closed or not"""
return self._is_closed

async def close(self, erase: bool = False) -> None:
"""Closes the replica, preventing new documents from being ingested or events being emitted

Any method called after closing will raise a `ReplicaIsClosedError`.

:param erase: Erase the contents of the replica. Defaults to `False`.
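
A sketch; closing with `erase=True` also asks the driver to wipe its stored data:

.. code-block:: python

await replica.close(erase=True)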
"""
logger.debug('closing...')
if self._is_closed:
raise ReplicaIsClosedError()
# TODO: do this all in a lock?
logger.debug(' sending willClose blockingly...')
await self.bus.send_and_wait("willClose")
logger.debug(' marking self as closed...')
self._is_closed = True
logger.debug(' closing ReplicaDriver (erase = %s)...', erase)
await self.replica_driver.close(erase)
logger.debug(' sending didClose nonblockingly...')
await self.bus.send_and_wait('didClose')
logger.debug('...closing done')

# --------------------------------------------------
# CONFIG

async def get_config(self, key: str) -> Optional[str]:
"""Get a specific config value"""
if self._is_closed:
raise ReplicaIsClosedError()
return await self.replica_driver.get_config(key)

async def set_config(self, key: str, value: str) -> None:
"""Set a specific configuration value"""
if self._is_closed:
raise ReplicaIsClosedError()
return await self.replica_driver.set_config(key, value)
async def list_config_keys(self) -> List[str]:
"""List all available configuration keys"""
if self._is_closed:
raise ReplicaIsClosedError()
return await self.replica_driver.list_config_keys()
async def delete_config(self, key: str) -> bool:
"""Delete a key from the configuration"""
if self._is_closed:
raise ReplicaIsClosedError()
return await self.replica_driver.delete_config(key)

# --------------------------------------------------
# GET

def get_max_local_index(self) -> int:
"""Returns the max local index of all stored documents"""
if self._is_closed:
raise ReplicaIsClosedError()
return self.replica_driver.get_max_local_index()

async def get_docs_after_local_index(
self,
history_mode: HistoryMode,
start_after: int,
limit: Optional[int] = None,
) -> List[Document]:
"""Get all documents after a specific local index
"""
logger.debug(
'get_docs_after_local_index(%s, %s, %d)',
history_mode,
start_after,
limit or -1,
)
if self._is_closed:
raise ReplicaIsClosedError()
query: Query = {
'history_mode': history_mode,
'order_by': 'localIndex ASC',
'start_after': {
'local_index': start_after,
},
'limit': limit,
}
return await self.replica_driver.query_docs(query)
async def get_all_docs(self) -> List[Document]:
"""Returns all documents, including historical versions of documents by other identities
"""
logger.debug('get_all_docs()')
if self._is_closed:
raise ReplicaIsClosedError()
return await self.replica_driver.query_docs(
history_mode=HistoryMode.ALL,
order_by='path ASC',
)

async def get_latest_docs(self) -> List[Document]:
"""Returns the latest document from every path"""
logger.debug('get_latest_docs()')
if self._is_closed:
raise ReplicaIsClosedError()
return await self.replica_driver.query_docs(
history_mode=HistoryMode.LATEST,
order_by='path ASC',
)

async def get_all_docs_at_path(self, path: Path) -> List[Document]:
"""Returns all versions of a document by different authors from a specific path"""
logger.debug('get_all_docs_at_path("%s")', path)
if self._is_closed:
raise ReplicaIsClosedError()
return await self.replica_driver.query_docs(
history_mode=HistoryMode.ALL,
order_by='path ASC',
filter={'path': path},
)

async def get_latest_doc_at_path(self, path: Path) -> Optional[Document]:
"""Returns the most recently written version of a document at a path"""
logger.debug('get_latest_doc_at_path("%s")', path)
if self._is_closed:
raise ReplicaIsClosedError()
docs = await self.replica_driver.query_docs(
history_mode=HistoryMode.LATEST,
order_by='path ASC',
filter={'path': path},
)
if not docs:
return None
return docs[0]
async def query_docs(self, query: Optional[Query] = None) -> List[Document]:
"""Returns a list of docs matching a given query

.. code-block:: python

my_query = {
'filter': {
'path_ends_with': '.txt',
},
'limit': 5,
}
first_five_text_docs = await my_replica.query_docs(my_query)
"""
logger.debug('query_docs %s', query)
if self._is_closed:
raise ReplicaIsClosedError()
return await self.replica_driver.query_docs(query)

# def query_paths(query: Optional[Query]) -> List[Path]: pass
# def query_authors(query: Optional[Query]) -> List[AuthorAddress]: pass

# --------------------------------------------------
# SET

async def set(
self,
keypair: Identity,
doc_to_set: IncompleteDocument,
) -> IngestEvent:
"""Adds a new document to the replica

If a document signed by the same identity exists at the same path, it will be overwritten.
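
A minimal sketch (assuming `keypair` is an `Identity` and `incomplete_doc`
is an `IncompleteDocument` for the target path):

.. code-block:: python

event = await replica.set(keypair, incomplete_doc)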
"""
logger.debug('set %s', doc_to_set)
if self._is_closed:
raise ReplicaIsClosedError()
logger.debug(
'...deciding timestamp: getting latest doc at the same path (from any author)',
)
timestamp: int
if isinstance(doc_to_set.timestamp, int):
timestamp = doc_to_set.timestamp
logger.debug('...docToSet already has a timestamp; not changing it from %d', timestamp)
else:
# bump timestamp if needed to win over existing latest doc at same path
latest_doc_same_path = await self.get_latest_doc_at_path(doc_to_set.path)

if latest_doc_same_path is None:
timestamp = microsecond_now()
logger.debug(
'...no existing latest doc, setting timestamp to now() = %s', timestamp
)
else:
timestamp = max(microsecond_now(), latest_doc_same_path.timestamp + 1)
logger.debug(
'...existing latest doc found, bumping timestamp to win if needed = %s',
timestamp,
)

doc = Document(
format=ValidationFormat.ES4,
author=keypair.address,
content=doc_to_set.content,
content_hash=await Crypto.sha256base32(doc_to_set.content),
delete_after=doc_to_set.delete_after or None,
path=doc_to_set.path,
timestamp=timestamp,
workspace=self.share,
signature='?', # signature will be added in just a moment
# _localIndex will be added during upsert. it's not needed for the signature.
)

logger.debug('...signing doc')

try:
signed_doc = await self.format_validator.sign_document(keypair, doc)
except EarthsnakeError as exc:
return {
'kind': 'failure',
'reason': 'invalid_document',
'err': exc,
'max_local_index': self.replica_driver.get_max_local_index(),
}
logger.debug('...signature = %s', signed_doc.signature)
logger.debug('...ingesting')
logger.debug('-----------------------')
ingest_event = await self.ingest(signed_doc)
logger.debug('-----------------------')
logger.debug('...done ingesting')
logger.debug('...set is done.')
return ingest_event
async def ingest(self, doc_to_ingest: Document) -> IngestEvent:
"""Ingest an existing signed document to the replica
"""
logger.debug('ingest %s', doc_to_ingest)
if self._is_closed:
raise ReplicaIsClosedError()
logger.debug('...removing extra fields')

try:
remove_results_or_err = self.format_validator.remove_extra_fields(doc_to_ingest)
except EarthsnakeError as exc:
return {
'kind': "failure",
'reason': "invalid_document",
'err': exc,
'max_local_index': self.replica_driver.get_max_local_index(),
}
doc_to_ingest = remove_results_or_err.doc # a copy of doc without extra fields
extra_fields = remove_results_or_err.extras # any extra fields starting with underscores

if extra_fields:
logger.debug('...extra fields found: %s', J(extra_fields))

try:
# now actually check doc validity against core schema
self.format_validator.check_document_is_valid(doc_to_ingest)
except EarthsnakeError as exc:
return {
'kind': "failure",
'reason': "invalid_document",
'err': exc,
'max_local_index': self.replica_driver.get_max_local_index(),
}

async def write_to_driver_with_lock() -> IngestEvent:
# get other docs at the same path
logger.debug(' >> ingest: start of protected region')
logger.debug(' > getting other history docs at the same path by any author')
existing_docs_same_path = await self.get_all_docs_at_path(doc_to_ingest.path)
logger.debug(' > ...got %d', len(existing_docs_same_path))
logger.debug(' > getting prevLatest and prevSameAuthor')
prev_latest: Optional[Document] = existing_docs_same_path[0] if existing_docs_same_path else None
prev_same_author: Optional[Document] = next(
(
document
for document in existing_docs_same_path
if document.author == doc_to_ingest.author
),
None,
)
logger.debug(' > checking if new doc is latest at this path')
existing_docs_same_path.append(doc_to_ingest)
existing_docs_same_path.sort(key=cmp_to_key(doc_compare_newest_first))
is_latest = existing_docs_same_path[0] == doc_to_ingest
logger.debug(' > ...isLatest: %s', is_latest)
if not is_latest and prev_same_author is not None:
logger.debug(
' > new doc is not latest and there is another one from the same author...'
)
# check if this is obsolete or redundant from the same author
doc_comp = doc_compare_newest_first(doc_to_ingest, prev_same_author)
if doc_comp == Cmp.GT:
logger.debug(' > new doc is GT prevSameAuthor, so it is obsolete')
return {
'kind': "nothing_happened",
'reason': "obsolete_from_same_author",
'doc': doc_to_ingest,
'max_local_index': self.replica_driver.get_max_local_index(),
}
if doc_comp == Cmp.EQ:
logger.debug(
' > new doc is EQ prevSameAuthor, so it is redundant (already_had_it)',
)
return {
'kind': "nothing_happened",
'reason': "already_had_it",
'doc': doc_to_ingest,
'max_local_index': self.replica_driver.get_max_local_index(),
}
# save it
logger.debug(" > upserting into ReplicaDriver...")
# TODO: pass existing_docs_same_path to save another lookup
doc_as_written = await self.replica_driver.upsert(doc_to_ingest)
logger.debug(" > ...done upserting into ReplicaDriver")
logger.debug(" > ...getting ReplicaDriver maxLocalIndex...")
max_local_index = self.replica_driver.get_max_local_index()
logger.debug(
' >> ingest: end of protected region, returning a WriteEvent from the lock'
)
return {
'kind': "success",
'max_local_index': max_local_index,
'doc': doc_as_written, # with updated extra properties like _localIndex
'doc_is_latest': is_latest,
'prev_doc_from_same_author': prev_same_author,
'prev_latest_doc': prev_latest,
}
logger.debug(" >> ingest: running protected region...")
ingest_event: IngestEvent = await self._ingest_lock.run(
write_to_driver_with_lock,
)
logger.debug(" >> ingest: ...done running protected region")
logger.debug("...send ingest event after releasing the lock")
logger.debug("...ingest event: %s", ingest_event)
await self.bus.send_and_wait(
f'ingest|{doc_to_ingest.path}',
ingest_event,
) # include the path in the channel even on failures
return ingest_event
async def overwrite_all_docs_by_author(self, keypair: Identity) -> Union[int, EarthsnakeError]:
"""Overwrite every document from this author, including history versions, with an empty doc

:returns: the number of documents changed, or an error describing what went wrong.
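
A sketch of the common use case, retracting everything a keypair has written
to this share:

.. code-block:: python

result = await replica.overwrite_all_docs_by_author(keypair)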
"""
logger.debug('overwriteAllDocsByAuthor("%s")', keypair.address)
if self._is_closed:
raise ReplicaIsClosedError()
# TODO: do this in batches
query = Query(
flt={'author': keypair.address},
history_mode=HistoryMode.ALL,
)
docs_to_overwrite = await self.query_docs(query)
logger.debug(' ...found %d docs to overwrite', len(docs_to_overwrite))
num_overwritten = 0
num_already_empty = 0
for doc in docs_to_overwrite:
if not doc.content:
num_already_empty += 1
continue
# remove extra fields
cleaned_result = self.format_validator.remove_extra_fields(doc)
cleaned_doc = cleaned_result.doc
# make new doc which is empty and just barely newer than the original
empty_doc = Document(
cleaned_doc,
content='',
content_hash=await Crypto.sha256base32(''),
timestamp=doc.timestamp + 1,
signature='?',
)
try:
# sign and ingest it
signed_doc = await self.format_validator.sign_document(keypair, empty_doc)
except EarthsnakeError as exc:
return exc
ingest_event = await self.ingest(signed_doc)
if ingest_event['kind'] == 'failure':
return ValidationError(
'ingestion error during overwrite_all_docs_by_author: '
f"{ingest_event['reason']}: {ingest_event['err']}",
)
if ingest_event['kind'] == 'nothing_happened':
return ValidationError(
f"ingestion did nothing during overwrite_all_docs_by_author: {ingest_event['reason']}",
)
# success
num_overwritten += 1
logger.debug(
' ...done; %d overwritten to be empty; %d were already empty; out of total %d docs',
num_overwritten,
num_already_empty,
len(docs_to_overwrite)
)
return num_overwritten