telemeta.util.unaccent module

Show source ≡

# This file by Fredrik Lundh from:
# http://effbot.org/zone/unicode-convert.htm
# http://effbot.python-hosting.com/file/stuff/sandbox/text/unaccent.py

# use a dynamically populated translation dictionary to remove accents
# from a string

import unicodedata, sys

# Hand-written transliterations for the latin-1 letters that have no
# unicode decomposition.  Keys are character codes, values are the
# replacement unicode strings.
CHAR_REPLACEMENT = {
    0xC6: u"AE",  # LATIN CAPITAL LETTER AE
    0xD0: u"D",   # LATIN CAPITAL LETTER ETH
    0xD8: u"OE",  # LATIN CAPITAL LETTER O WITH STROKE
    0xDE: u"Th",  # LATIN CAPITAL LETTER THORN
    0xDF: u"ss",  # LATIN SMALL LETTER SHARP S
    0xE6: u"ae",  # LATIN SMALL LETTER AE
    0xF0: u"d",   # LATIN SMALL LETTER ETH
    0xF8: u"oe",  # LATIN SMALL LETTER O WITH STROKE
    0xFE: u"th",  # LATIN SMALL LETTER THORN
}

##
# Translation dictionary.  Translation entries are added to this
# dictionary as needed.

class UnaccentedMap(dict):
    """Lazily populated translation table that strips accents.

    Keys are unicode character codes (integers).  Values are either a
    replacement character code or a replacement unicode string, which
    is the kind of mapping accepted by the ``translate`` method of
    unicode strings.  An entry is computed and stored the first time
    its key is looked up.
    """

    def mapchar(self, key):
        """Return the unaccented replacement for character code ``key``.

        The first code of the character's decomposition is followed
        repeatedly until a character without a usable decomposition is
        reached.  Following the whole chain (instead of a single step)
        reduces characters carrying several marks, such as U+1EBF
        (U+00EA + U+0301), to their base letter instead of to another
        accented character, and lets decomposed ligatures such as
        U+01FD reach their CHAR_REPLACEMENT entry.

        The result is cached in the dictionary before being returned.

        :param key: unicode character code (integer).
        :returns: a character code or a unicode string to use in place
            of ``key``.
        """
        cached = self.get(key)
        if cached is not None:
            return cached
        try:
            to_char = unichr  # Python 2
        except NameError:
            to_char = chr  # Python 3 has no unichr()
        code = key
        while True:
            de = unicodedata.decomposition(to_char(code))
            if not de:
                # End of the chain: fall back to the hand-written table.
                ch = CHAR_REPLACEMENT.get(code, code)
                break
            try:
                code = int(de.split(None, 1)[0], 16)
            except (IndexError, ValueError):
                # The decomposition starts with a "<tag>" rather than a
                # hex code (compatibility mapping): stop here.
                ch = code
                break
        self[key] = ch
        return ch

    # Compare version tuples, not strings: string comparison is
    # lexicographic and mis-orders multi-digit version components.
    if sys.version_info >= (2, 5):
        # use __missing__ where available
        __missing__ = mapchar
    else:
        # otherwise, use standard __getitem__ hook (this is slower,
        # since it's called for each character)
        __getitem__ = mapchar


# Module-wide translation table shared by all unaccent() calls; it
# starts empty and gains entries as characters are looked up.
_map = UnaccentedMap()

def unaccent(str):
    """Return ``str`` translated through the module-level table ``_map``.

    The table replaces accented characters by unaccented ones, so the
    result is the accent-free form of the input string.
    """
    translate = str.translate
    return translate(_map)

def unaccent_icmp(str1, str2):
    """Compare two strings, ignoring accents and letter case.

    Both arguments are passed through unaccent() and lower-cased before
    being compared.

    Returns 1 if ``str1`` sorts after ``str2``, -1 if it sorts before,
    and 0 otherwise.
    """
    left = unaccent(str1).lower()
    right = unaccent(str2).lower()
    # Classic three-way comparison: True/False subtract to 1, 0 or -1.
    return (left > right) - (left < right)

Module variables

var CHAR_REPLACEMENT

Functions

def unaccent(str)

Show source ≡

def unaccent(str):
    """Return ``str`` translated through the module-level table ``_map``.

    The table replaces accented characters by unaccented ones, so the
    result is the accent-free form of the input string.
    """
    translate = str.translate
    return translate(_map)

def unaccent_icmp(str1, str2)

Show source ≡

def unaccent_icmp(str1, str2):
    """Compare two strings, ignoring accents and letter case.

    Both arguments are passed through unaccent() and lower-cased before
    being compared.

    Returns 1 if ``str1`` sorts after ``str2``, -1 if it sorts before,
    and 0 otherwise.
    """
    left = unaccent(str1).lower()
    right = unaccent(str2).lower()
    # Classic three-way comparison: True/False subtract to 1, 0 or -1.
    return (left > right) - (left < right)

Classes

class UnaccentedMap

Show source ≡

class UnaccentedMap(dict):
    """Lazily populated translation table that strips accents.

    Keys are unicode character codes (integers).  Values are either a
    replacement character code or a replacement unicode string, which
    is the kind of mapping accepted by the ``translate`` method of
    unicode strings.  An entry is computed and stored the first time
    its key is looked up.
    """

    def mapchar(self, key):
        """Return the unaccented replacement for character code ``key``.

        The first code of the character's decomposition is followed
        repeatedly until a character without a usable decomposition is
        reached.  Following the whole chain (instead of a single step)
        reduces characters carrying several marks, such as U+1EBF
        (U+00EA + U+0301), to their base letter instead of to another
        accented character, and lets decomposed ligatures such as
        U+01FD reach their CHAR_REPLACEMENT entry.

        The result is cached in the dictionary before being returned.

        :param key: unicode character code (integer).
        :returns: a character code or a unicode string to use in place
            of ``key``.
        """
        cached = self.get(key)
        if cached is not None:
            return cached
        try:
            to_char = unichr  # Python 2
        except NameError:
            to_char = chr  # Python 3 has no unichr()
        code = key
        while True:
            de = unicodedata.decomposition(to_char(code))
            if not de:
                # End of the chain: fall back to the hand-written table.
                ch = CHAR_REPLACEMENT.get(code, code)
                break
            try:
                code = int(de.split(None, 1)[0], 16)
            except (IndexError, ValueError):
                # The decomposition starts with a "<tag>" rather than a
                # hex code (compatibility mapping): stop here.
                ch = code
                break
        self[key] = ch
        return ch

    # Compare version tuples, not strings: string comparison is
    # lexicographic and mis-orders multi-digit version components.
    if sys.version_info >= (2, 5):
        # use __missing__ where available
        __missing__ = mapchar
    else:
        # otherwise, use standard __getitem__ hook (this is slower,
        # since it's called for each character)
        __getitem__ = mapchar

Ancestors (in MRO)

UnaccentedMap
__builtin__.dict
__builtin__.object

Methods

def mapchar(self, key)

Show source ≡

def mapchar(self, key):
    """Return the unaccented replacement for character code ``key``.

    The first code of the character's decomposition is followed
    repeatedly until a character without a usable decomposition is
    reached.  Following the whole chain (instead of a single step)
    reduces characters carrying several marks, such as U+1EBF
    (U+00EA + U+0301), to their base letter instead of to another
    accented character, and lets decomposed ligatures such as U+01FD
    reach their CHAR_REPLACEMENT entry.

    The result is cached in the dictionary before being returned.

    :param key: unicode character code (integer).
    :returns: a character code or a unicode string to use in place of
        ``key``.
    """
    cached = self.get(key)
    if cached is not None:
        return cached
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3 has no unichr()
    code = key
    while True:
        de = unicodedata.decomposition(to_char(code))
        if not de:
            # End of the chain: fall back to the hand-written table.
            ch = CHAR_REPLACEMENT.get(code, code)
            break
        try:
            code = int(de.split(None, 1)[0], 16)
        except (IndexError, ValueError):
            # The decomposition starts with a "<tag>" rather than a hex
            # code (compatibility mapping): stop here.
            ch = code
            break
    self[key] = ch
    return ch