diff options
Diffstat (limited to 'groupthink/stringtree.py')
-rw-r--r-- | groupthink/stringtree.py | 555 |
1 files changed, 555 insertions, 0 deletions
diff --git a/groupthink/stringtree.py b/groupthink/stringtree.py new file mode 100644 index 0000000..5701636 --- /dev/null +++ b/groupthink/stringtree.py @@ -0,0 +1,555 @@ +import random +random.seed() #Works around some mysterious bug in the Journal or Rainbow that + #causes the random number generator to be seeded with the same + #constant value if a Journal entry is re-launched from the + #Journal. +from collections import defaultdict +import dbus # We do dbus (de)serialization in this file to minimize abstraction breakage +import logging +import aatree + +inf = float('inf') + +class Change: + """Each Change represents a chanage to the StringTree. + """ + def __init__(self, unique_id, parent, edit): + """unique_id is a unique identifier for this Change + parent is the unique_id that is affected by this + Change. Parent unique_id are always associated with an Insertion edit. + edit is an Insertion, Deletion, or Removal. It's what is to be done to + the parent""" + self.unique_id = unique_id + self.parent = parent + self.edit = edit + + def __repr__(self): + return "Change(%d, %d, %s)" % (self.unique_id, self.parent, str(self.edit)) + +class Insertion: + """Represents the action of inserting a particular bit of text + """ + def __init__(self, position, text): + """position is the point at which to insert text, in the unmodified + coordinates of the parent node (other insertions and deletions + to that node do not affect these coordinates) + text is the string to insert""" + self.position = position + self.text = text + + def __repr__(self): + return "Insertion(%d, %s)" % (self.position, self.text) + +class Deletion: + """Represents the deletion of only those characters present in the parent in + this range. Characters that have been inserted into the parent by + an Insertion are not affected. + """ + def __init__(self, position, length): + """position is the point, in the unmodified coordinates of the parent + node, at which to start deletion + length is an integer greater than 0 representing the number of + characters to delete""" + self.position = position + self.length = length + + def __repr__(self): + return "Deletion(%d, %d)" % (self.position, self.length) + +class Removal: + """Represents the deletion of all characters ever inserted in this range. + Insertions at the endpoints are _not_ included in the Removal, and must + be Removed separately. + """ + def __init__(self, position, length): + """position is the point at which to start Removal + length is the number of points in the unmodified parent coordinate to + the end of the Removal. Note that many more than length characters + can be removed if there are Insertions in this range.""" + self.position = position + self.length = length + + def __repr__(self): + return "Removal(%d, %d)" % (self.position, self.length) + +class Record: + """Each Record is used to store one Change inside the StringTree. + The purpose of a Record is contain both the Change itself, and any cached + information about the current effect of that Change. + """ + def __init__(self, change, depth): + self.change = change + self.depth = depth + + def __str__(self): + return "Record(%s, %d)" % (str(self.change), self.depth) + + __repr__ = __str__ + +def flatten(L): + o = [] + for x in L: + if isinstance(x,list): + o.extend(flatten(x)) #recursive + else: + o.append(x) + return o + +def my_rand(): + return random.getrandbits(63) + +def translator(c, pack): + if pack: + if isinstance(c.edit, Insertion): + return dbus.Struct((dbus.Int64(c.unique_id), + dbus.Int64(c.parent), + dbus.Int16(0), + dbus.Int64(c.edit.position), + dbus.UTF8String(c.edit.text)), + signature='xxnxs') + elif isinstance(c.edit, Deletion): + return dbus.Struct((dbus.Int64(c.unique_id), + dbus.Int64(c.parent), + dbus.Int16(1), + dbus.Int64(c.edit.position), + dbus.Int64(c.edit.length)), + signature='xxnxx') + elif isinstance(c.edit, Removal): + return dbus.Struct((dbus.Int64(c.unique_id), + dbus.Int64(c.parent), + dbus.Int16(2), + dbus.Int64(c.edit.position), + dbus.Int64(c.edit.length)), + signature='xxnxx') + else: + raise Unimplemented + else: + if c[2] == 0: + ed = Insertion(int(c[3]),str(c[4])) + elif c[2] == 1: + ed = Deletion(int(c[3]),int(c[4])) + elif c[2] == 2: + ed = Removal(int(c[3]),int(c[4])) + else: + raise "unknown type" + return Change(int(c[0]), int(c[1]), ed) + +class EagerHideList: + """EagerHideList provides a list with hidden elements. The standard index + considers only the visible elements, but the 'all' index considers all + elements. The standard index of an invisible element is considered to be + the index of the next visible element.""" + def __init__(self): + self._sourcelist = [] + self._poslist = [] + self._posmap = {} + def __len__(self): + return len(self._sourcelist) + def __iter__(self): + return self._sourcelist.__iter__() + def __getitem__(self, s): + return self._sourcelist[s] + def index(self, item): + x = self._poslist[self._posmap[item]] + return x[2] + def hide(self, position, length): + """Given the position in _sourcelist, and the length of the deletion, + perform the deletion in the lists""" + pfirst = self._sourcelist[position] + plast = self._sourcelist[position+length-1] + ifirst = self._posmap[pfirst] + ilast = self._posmap[plast] + for i in xrange(ifirst, ilast+1): + L = self._poslist[i] + L[1] = False #No longer visible, if it was visible before + L[2] = position #collapse positions + for L in self._poslist[ilast+1:]: + L[2] -= length #move all subsequent character up by length + + del self._sourcelist[position:(position+length)] + #self._check_invariants() + def getitem_all(self, s): + if isinstance(s, int): + return self._poslist[s][0] + else: + return [x[0] for x in self._poslist[s]] + def index_all(self, item): + return self._posmap[item] + def is_visible(self, i): + return self._poslist[i][1] + def is_visible_item(self, item): + return self._poslist[self._posmap[item]][1] + def insert_sequence_all(self, position, sequence, visibility): + """Insert sequence with visibility into the all-coordinates at position""" + if position > 0: + psource = self._poslist[position][2] + else: + psource = 0 + length = len(sequence) + newlist = [] + newlistsource = [] + i = psource + for elt, viz in zip(sequence, visibility): + newlist.append([elt, viz, i]) + if viz: + newlistsource.append(elt) + i += 1 + self._poslist[position:position] = newlist + for i in xrange(position,position+length): + L = self._poslist[i] + self._posmap[L[0]] = i + num_viz = len(newlistsource) + for i in xrange(position+length,len(self._poslist)): + L = self._poslist[i] + L[2] += num_viz + self._posmap[L[0]] = i + self._sourcelist[psource:psource] = newlistsource + #self._check_invariants() + def insert_sequence_leftof(self, target, sequence, visibility): + self.insert_sequence_all(self._posmap[target], sequence, visibility) + + def _check_invariants(self): + assert len(self._posmap) == len(self._poslist) + for i in xrange(len(self._poslist)): + assert self._posmap[self._poslist[i][0]] == i + if self._poslist[i][1]: + assert self._sourcelist[self._poslist[i][2]] == self._poslist[i][0] + if i > 0: + if self._poslist[i-1][1]: + assert self._poslist[i-1][2] + 1 == self._poslist[i][2] + else: + assert self._poslist[i-1][2] == self._poslist[i][2] + +class SimpleStringTree: + """SimpleStringTree is a StringTree that supports only Insertions and + Deletions. Handling References, while valuable, has proven quite difficult, + and so will not be addressed by this data structure. + + Code for handling Removals will be left in for the moment, since it is + easy enough, even though it is not presently used.""" + + def __init__(self, initstring=""): + self.ROOT = Record(Change(-1, -1, Insertion(0, "")), 0) + self._id2rec = dict() #unique_id: Record(change) | change.unique_id = unique_id + self._id2rec[-1] = self.ROOT + self._parent2children = defaultdict(set) # unique_id: {Record(change) | change.parent == unique_id} + self._cursor = 0 + self._listing = aatree.AATreeHideList() + self._listing.insert_sequence_all(0,[(-1,0)],[False]) + if initstring: + self.insert(initstring, 0) + + def __repr__(self): + return "\n".join((str(v) for v in self._id2rec.values())) + + def getvalue(self, r = None): + if r is None: + r = self.ROOT + s = "".join(self._id2rec[x[0]].change.edit.text[x[1]] for x in self._listing) + return s + + def next(self): + raise + + def flush(self): + # This could be used to implement lazy evaluation of input by not + # re-evaluating the string until flush (or read?) + pass + + def close(self): + raise + + def read(self, size=float('inf')): + #Efficiency: This method should use size to avoid calling getvalue and rendering the entire string + s = self.getvalue() + outpoint = min(len(s), self._cursor + size) + inpoint = self._cursor + self._cursor = outpoint + return s[inpoint:outpoint] + + def readline(self, size=float('inf')): + #Efficiency: This method should use size to avoid rendering the whole string + s = self.getvalue() + outpoint = min(len(s), self._cursor + size) + inpoint = self._cursor + i = s.find("\n", inpoint, outpoint) + if i == -1 or i >= outpoint: + self._cursor = outpoint + return s[inpoint:outpoint] + else: + self._cursor = i + 1 + return s[inpoint:(i+1)] + + def readlines(self, sizehint=None): + #Efficiency: use sizehint + s = self.getvalue() + t = s[self._cursor:] + self._cursor = len(s) + return t.splitlines(True) + + def seek(self, offset, whence=0): + if whence == 0: + self._cursor = offset + elif whence == 1: + self._cursor += offset + elif whence == 2: + self._cursor = len(self._listing) + offset + + def tell(self): + return self._cursor + + def truncate(self, size=None): + if size is None: + size = self._cursor + return self.delete(size, len(self._listing) - size) + + def write(self, text): + L = min(len(self._listing) - self._cursor, len(text)) + changelist = [] + if L > 0: + changelist.extend(self.delete(self._cursor, L)) + changelist.extend(self.insert(text)) + return changelist + + def writelines(self, sequence): + s = "".join(sequence) + return self.write(s) + + # Non-filelike text editing methods + + def insert(self, text, k=None): + if k is None: + k = self._cursor + + if len(self._listing) == 0: + r = self.ROOT + uid = -1 + inspoint = 0 + elif k == 0: + (uid, inspoint) = self._listing[k] + r = self._id2rec[uid] + elif k < len(self._listing): + # When not inserting at the endpoints, we have to be sure to + # check if we are at the boundary between a parent and one of its + # descendants. If so, we must make our insertion in the descendant, + # not the parent, because the insertions would "conflict" (occur at + # the same location) in the parent, which would produce an ordering + # ambiguity, which will resolve in our favor only 50% of the time. + pL, pR = self._listing[(k-1):(k+1)] + (uidL, inspointL) = pL + (uidR, inspointR) = pR + rR = self._id2rec[uidR] + if uidL == uidR: #There's no boundary here at all (at least not one + # of any importance to us. Therefore, we have to insert to the + # left of the character at k, as usual. + r = rR + uid = uidR + inspoint = inspointR + else: #There's a boundary of some sort here. We always choose to + # insert at the deeper node (breaking left in case of a tie). + # (In the case that neither node is the ancestor of the other, + # either choice would be acceptable, regardless of depth. This + # logic is therefore acceptable, and has the advantage of being + # simple and fast.) + rL = self._id2rec[uidL] + if rR.depth > rL.depth: + r = rR + uid = uidR + inspoint = inspointR + else: + r = rL + uid = uidL + inspoint = inspointL + 1 + elif k == len(self._listing): + (uid,i) = self._listing[k-1] + r = self._id2rec[uid] + inspoint = i + 1 + else: + raise + + e = Insertion(inspoint, text) + c = Change(my_rand(), r.change.unique_id, e) + self._add_change_treeonly(c) + target = (uid, inspoint) + self._insert_listonly(c.unique_id, target, len(text)) + self._cursor = k + len(text) + return [c] + + def delete(self, k, n): + """Starting at a point k (0-indexed), delete n characters""" + if k + n > len(self._listing): + raise + p = self._listing[k] + contigs = [[p[0],p[1],p[1]]] + for (uid, index) in self._listing[(k+1):(k+n)]: + #This logic produces deletions that span any missing chunks. This + #produces a smaller number of deletions than making sure that they + #are actually "contiguous", but it might interact badly with a + #hypothetical undo system. + if contigs[-1][0] == uid: + contigs[-1][2] = index + else: + contigs.append([uid,index,index]) + changelist = [Change(my_rand(), c[0], Deletion(c[1],1 + c[2]-c[1])) for c in contigs] + for c in changelist: + self._add_change_treeonly(c) + self._delete_listonly(k,n) + return changelist + + def get_range(self, rec, point, m): + todo = [(rec, point, m, None)] # None is a dummy value since p is unused + ranges = [] + while len(todo) > 0: + (rec, point, m, p) = todo[0] + h = self._range_helper(point, m) + self._step(h, rec) + if h.outpoint is not None: + ranges.append((rec, h.point_parent, h.outpoint - h.point_parent)) + #print rec, h.point_parent, h.outpoint - h.point_parent + todo.extend(h.todo) + #print todo + del todo[0] + return ranges + + def move(self, rempoint, n, inspoint): + """In StringTree, move() should coherently copy a section of text, + such that any conflicting edits appear in the new location, not the old. + In SimpleStringTree, this is not possible, so move() just falls back to + Deletion and Insertion.""" + self.seek(rempoint) + t = self.read(n) + + if rempoint > inspoint: + L = self.delete(rempoint,n) + L.extend(self.insert(t, inspoint)) + else: + L = self.insert(t, inspoint) + L.extend(self.delete(rempoint,n)) + return L + + # Patch management methods + + def add_change(self, c): + if c.unique_id in self._id2rec: + return [] + if isinstance(c.edit, Insertion): + p = self._effective_parent(c.unique_id, c.parent, c.edit.position) + i = self._listing.index(p) + d = len(c.edit.text) + self._insert_listonly(c.unique_id, p, d) + flist = [Insertion(i, c.edit.text)] + elif isinstance(c.edit, Deletion): + flist = [] + start = None + end = None + for i in xrange(c.edit.position,c.edit.position + c.edit.length): + p = (c.parent, i) + if self._listing.is_visible_item(p): + q = self._listing.index(p) + if end == q: + end += 1 + else: + if end is not None: + n = end - start + flist.append(Deletion(start,n)) + self._delete_listonly(start,n) + start = q + end = start + 1 + if end is not None: # the last contig won't be handled by the loop + n = end - start + flist.append(Deletion(start,n)) + self._delete_listonly(start,n) + else: + raise + self._add_change_treeonly(c) + return flist + + def _effective_parent(self, uid, parentid, position): + """The effective parent of an insertion is defined as the (uid, loc) pair + that causes it to be inserted in the right location in the string. This + is only relevant in the event of a conflict, in which case the conflict + edits are required to be ordered from least uid to greatest uid. The + effective parent of a conflicted edit, then, is either the original pair, + or (u_next, 0), where u_next is the uid of the sibling directly to the right + of the input uid. That is to say, it is the least u greater than uid + that has the same position.""" + if parentid in self._parent2children: + u_next = None + for r in self._parent2children[parentid]: + u = r.change.unique_id + p = r.change.edit.position + if (p == position) and isinstance(r.change.edit, Insertion) and (u > uid): + if (u_next is None) or (u < u_next): + u_next = u + if u_next is not None: + return (u_next, 0) + return (parentid, position) + + def _delete_listonly(self, position, length): + """Given the position in _sourcelist, and the length of the deletion, + perform the deletion in the lists""" + self._listing.hide(position,length) + + def _insert_listonly(self, uid, target, length): + """Make a new insertion into the lists with uid and length at position + in _poslist""" + elts = [(uid,i) for i in xrange(length+1)] + visibility = [True] * length + visibility.append(False) + self._listing.insert_sequence_leftof(target, elts, visibility) + + def _add_change_treeonly(self, c): + if c.unique_id in self._id2rec: + return + d = self._id2rec[c.parent].depth + 1 + r = Record(c,d) + self._id2rec[c.unique_id] = r + if c.parent not in self._parent2children: + self._parent2children[c.parent] = set() + self._parent2children[c.parent].add(r) + + def get_insert(self, k, n = 0, parent=None): + #FIXME: This method could be useful, but it no longer works + """get_insert finds and returns the youngest insertion containing positions + k to k + n in the total coordinates of parent. If parent is unspecified, + then it is taken to be the root, meaning the coordinates in question + are the current document coordinates. + + The return value is a tuple (rec, k_unmodified, k_modified), meaning + the record containing k and k + n, the position of k in rec's + unmodified coordinates, and the position of k in rec's + modified coordinates. + + "containing" is defined so that a record with n characters + (labeled 0...n-1) can still be returned as containing k...k+n. In + other words, each Insertion contains both endpoints. However, an + Insertion that has been totally deleted is ignored entirely.""" + if parent is None: + parent = self.ROOT + + h = self._get_insert_helper(k, n) + self._step(h, parent) + while h.child is not None: + parent = h.child + h = self._get_insert_helper(h.child_k, n) + self._step(h, parent) + + return (parent, h.k_in_parent, h.k) + + def get_changes(self): #TODO: add arguments to get only changes in a certain range + """get_changes provides a depth-first topologically sorted list of all + Changes in this Tree""" + L = [] + stack = [-1] + while stack: + x = self._id2rec[stack.pop()].change + L.append(x) + if x.unique_id in self._parent2children: + stack.extend(r.change.unique_id + for r in self._parent2children[x.unique_id]) + return L + + def is_ready(self, c): + """Returns a boolean indicating whether a Change c may safely be added + to the Tree. Specifically, it checks whether c.parent is already known.""" + return c.parent in self._id2rec |