Prev Next

I use an iterator to group the two halves of each UCS-2 surrogate pair together, so that every pair is yielded as a single item.


class SurrogateGroupIter(object):
    """Iterate over a unicode string, yielding surrogate pairs as one item.

    Each step normally yields a single code unit of ``uni``. When a high
    surrogate (U+D800..U+DBFF) is immediately followed by a low surrogate
    (U+DC00..U+DFFF), the two units are yielded together as one
    two-element string. Unpaired surrogates are yielded on their own.

    Attributes:
        uni: The string being iterated.
        len: Length of ``uni``, captured at construction time.
        i: Index of the next unread code unit in ``uni``.
    """

    def __init__(self, uni):
        """Start iterating at the beginning of ``uni``."""
        self.uni = uni
        self.len = len(uni)
        self.i = 0

    def __iter__(self):
        """Return the iterator itself."""
        return self

    def next(self):
        """Return the next code unit, or the next surrogate pair.

        Raises:
            StopIteration: When the whole string has been consumed.
        """
        if self.i >= self.len:
            raise StopIteration
        ch = self.uni[self.i]
        self.i += 1

        # Only a high surrogate that is not the last unit can start a pair.
        if 0xD800 <= ord(ch) <= 0xDBFF and self.i < self.len:
            following = self.uni[self.i]
            # The low-surrogate block runs up to U+DFFF, not U+DCFF.
            if 0xDC00 <= ord(following) <= 0xDFFF:
                self.i += 1
                return ch + following
        return ch

    # Python 3 spells the iterator protocol method ``__next__``; keep
    # ``next`` so existing Python 2 style callers continue to work.
    __next__ = next