telemeta.util.unaccent module
# This file by Fredrik Lundh from:
# http://effbot.org/zone/unicode-convert.htm
# http://effbot.python-hosting.com/file/stuff/sandbox/text/unaccent.py

# use a dynamically populated translation dictionary to remove accents
# from a string

import sys
import unicodedata

CHAR_REPLACEMENT = {
    # latin-1 characters that don't have a unicode decomposition
    0xc6: u"AE",  # LATIN CAPITAL LETTER AE
    0xd0: u"D",   # LATIN CAPITAL LETTER ETH
    0xd8: u"OE",  # LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th",  # LATIN CAPITAL LETTER THORN
    0xdf: u"ss",  # LATIN SMALL LETTER SHARP S
    0xe6: u"ae",  # LATIN SMALL LETTER AE
    0xf0: u"d",   # LATIN SMALL LETTER ETH
    0xf8: u"oe",  # LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th",  # LATIN SMALL LETTER THORN
}


class UnaccentedMap(dict):
    """Translation dictionary whose entries are added as needed.

    Keys are unicode character codes; values are either a character code
    or a unicode string, as accepted by ``translate``.
    """

    def mapchar(self, key):
        """Return the unaccented replacement for character code *key*.

        The result is stored in the dictionary so each code is only
        computed once.
        """
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            char = unichr(key)  # Python 2
        except NameError:
            char = chr(key)  # Python 3: chr() covers all code points
        de = unicodedata.decomposition(char)
        if de:
            try:
                base = int(de.split(None, 1)[0], 16)
            except (IndexError, ValueError):
                # Tagged decompositions such as "<compat> ..." do not start
                # with a hex code; leave those characters untouched.
                ch = key
            else:
                # The base may itself be accented (e.g. U+1EBF -> U+00EA
                # -> "e") or need a CHAR_REPLACEMENT entry (U+01FD ->
                # U+00E6 -> "ae"), so resolve it through the map as well.
                ch = self.mapchar(base) if base != key else key
        else:
            ch = CHAR_REPLACEMENT.get(key, key)
        self[key] = ch
        return ch

    # Compare the version tuple, not the version string: string comparison
    # is lexicographic and mis-orders multi-digit version components.
    if sys.version_info >= (2, 5):
        # use __missing__ where available
        __missing__ = mapchar
    else:
        # otherwise, use standard __getitem__ hook (this is slower,
        # since it's called for each character)
        __getitem__ = mapchar


_map = UnaccentedMap()


def unaccent(str):
    """Return the unicode string *str* with accents removed."""
    return str.translate(_map)


def unaccent_icmp(str1, str2):
    """Compare two strings ignoring accents and case.

    Returns 1 if *str1* sorts after *str2*, -1 if it sorts before,
    and 0 if both are equal once unaccented and lower-cased.
    """
    str1 = unaccent(str1).lower()
    str2 = unaccent(str2).lower()
    if str1 > str2:
        return 1
    if str1 < str2:
        return -1
    return 0
Module variables
var CHAR_REPLACEMENT
Functions
def unaccent(
str)
def unaccent(str):
    """Translate *str* through the module-level ``_map`` table.

    ``_map`` is the shared accent-removal translation dictionary; the
    translated string is returned.
    """
    translated = str.translate(_map)
    return translated
def unaccent_icmp(
str1, str2)
def unaccent_icmp(str1, str2):
    """Three-way comparison of two strings, ignoring accents and case.

    Both arguments are passed through ``unaccent`` and lower-cased before
    being compared.  Returns 1 when *str1* is greater, -1 when it is
    smaller, and 0 otherwise.
    """
    left = unaccent(str1).lower()
    right = unaccent(str2).lower()
    # (greater) - (smaller) yields exactly 1, -1 or 0.
    return (left > right) - (left < right)
Classes
class UnaccentedMap
class UnaccentedMap(dict):
    """Translation dictionary whose entries are added as needed.

    Keys are unicode character codes; values are either a character code
    or a unicode string.
    """

    def mapchar(self, key):
        """Return the unaccented replacement for character code *key*.

        The result is stored in the dictionary so each code is only
        computed once.
        """
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            char = unichr(key)  # Python 2
        except NameError:
            char = chr(key)  # Python 3: chr() covers all code points
        de = unicodedata.decomposition(char)
        if de:
            try:
                base = int(de.split(None, 1)[0], 16)
            except (IndexError, ValueError):
                # Tagged decompositions such as "<compat> ..." do not start
                # with a hex code; leave those characters untouched.
                ch = key
            else:
                # The base may itself be accented (e.g. U+1EBF -> U+00EA
                # -> "e"), so resolve it through the map as well.
                ch = self.mapchar(base) if base != key else key
        else:
            ch = CHAR_REPLACEMENT.get(key, key)
        self[key] = ch
        return ch

    # Compare the version tuple, not the version string: string comparison
    # is lexicographic and mis-orders multi-digit version components.
    if sys.version_info >= (2, 5):
        # use __missing__ where available
        __missing__ = mapchar
    else:
        # otherwise, use standard __getitem__ hook (this is slower,
        # since it's called for each character)
        __getitem__ = mapchar
Ancestors (in MRO)
- UnaccentedMap
- __builtin__.dict
- __builtin__.object
Methods
def mapchar(
self, key)
def mapchar(self, key):
    """Return the unaccented replacement for character code *key*.

    The replacement is either a character code or a unicode string.  It
    is stored in the dictionary so each code is only computed once.
    """
    ch = self.get(key)
    if ch is not None:
        return ch
    try:
        char = unichr(key)  # Python 2
    except NameError:
        char = chr(key)  # Python 3: chr() covers all code points
    de = unicodedata.decomposition(char)
    if de:
        try:
            base = int(de.split(None, 1)[0], 16)
        except (IndexError, ValueError):
            # Tagged decompositions such as "<compat> ..." do not start
            # with a hex code; leave those characters untouched.
            ch = key
        else:
            # The base may itself be accented (e.g. U+1EBF -> U+00EA
            # -> "e"), so resolve it through the map as well.
            ch = self.mapchar(base) if base != key else key
    else:
        ch = CHAR_REPLACEMENT.get(key, key)
    self[key] = ch
    return ch