diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2006-03-09 23:38:20 +0000 |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2006-03-09 23:38:20 +0000 |
commit | 480f1bb67ba8d2857d87921391df278c5569774c (patch) | |
tree | 16370e5215e51cb589a2f07b93a2105c851ce956 /Modules/unicodedata.c | |
parent | Move entry to correct section. (diff) | |
download | cpython-480f1bb67ba8d2857d87921391df278c5569774c.tar.gz cpython-480f1bb67ba8d2857d87921391df278c5569774c.tar.bz2 cpython-480f1bb67ba8d2857d87921391df278c5569774c.zip |
Update Unicode database to Unicode 4.1.
Diffstat (limited to 'Modules/unicodedata.c')
-rw-r--r-- | Modules/unicodedata.c | 241 |
1 files changed, 213 insertions, 28 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 90f2ca502e6..a8548376b9d 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -14,6 +14,7 @@ #include "Python.h" #include "ucnhash.h" +#include "structmember.h" /* character properties */ @@ -28,6 +29,14 @@ typedef struct { _PyUnicode_EastAsianWidth */ } _PyUnicode_DatabaseRecord; +typedef struct change_record { + /* sequence of fields should be the same as in merge_old_version */ + const unsigned char bidir_changed; + const unsigned char category_changed; + const unsigned char decimal_changed; + const int numeric_changed; +} change_record; + /* data file generated by Tools/unicode/makeunicodedata.py */ #include "unicodedata_db.h" @@ -51,6 +60,85 @@ _getrecord(PyUnicodeObject* v) return _getrecord_ex(*PyUnicode_AS_UNICODE(v)); } +/* ------------- Previous-version API ------------------------------------- */ +typedef struct previous_version { + PyObject_HEAD + const char *name; + const change_record* (*getrecord)(Py_UCS4); + Py_UCS4 (*normalization)(Py_UCS4); +} PreviousDBVersion; + +#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v)) + +/* Forward declaration */ +static PyMethodDef unicodedata_functions[]; + +static PyMemberDef DB_members[] = { + {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY}, + {NULL} +}; + +static PyTypeObject Xxo_Type = { + /* The ob_type field must be initialized in the module init function + * to be portable to Windows without using C++. */ + PyObject_HEAD_INIT(NULL) + 0, /*ob_size*/ + "unicodedata.DB", /*tp_name*/ + sizeof(PreviousDBVersion), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + /* methods */ + (destructor)PyObject_Del, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + PyObject_GenericGetAttr,/*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT, /*tp_flags*/ + 0, /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + unicodedata_functions, /*tp_methods*/ + DB_members, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + 0, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ +}; + +static PyObject* +new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4), + Py_UCS4 (*normalization)(Py_UCS4)) +{ + PreviousDBVersion *self; + self = PyObject_New(PreviousDBVersion, &Xxo_Type); + if (self == NULL) + return NULL; + self->name = name; + self->getrecord = getrecord; + self->normalization = normalization; + return (PyObject*)self; +} + /* --- Module API --------------------------------------------------------- */ PyDoc_STRVAR(unicodedata_decimal__doc__, @@ -65,6 +153,7 @@ unicodedata_decimal(PyObject *self, PyObject *args) { PyUnicodeObject *v; PyObject *defobj = NULL; + int have_old = 0; long rc; if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) @@ -74,7 +163,22 @@ unicodedata_decimal(PyObject *self, PyObject *args) "need a single Unicode character as parameter"); return NULL; } - rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v)); + + if (self) { + const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); + if (old->category_changed == 0) { + /* unassigned */ + have_old = 1; + rc = -1; + } + else if (old->decimal_changed != 0xFF) { + have_old = 1; + rc = old->decimal_changed; + } + } + + if (!have_old) + rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v)); if (rc < 0) { if (defobj == NULL) { PyErr_SetString(PyExc_ValueError, @@ -136,6 +240,7 @@ unicodedata_numeric(PyObject *self, PyObject *args) { PyUnicodeObject *v; PyObject *defobj = NULL; + int have_old = 0; double rc; if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj)) @@ -145,7 +250,22 @@ unicodedata_numeric(PyObject *self, PyObject *args) "need a single Unicode character as parameter"); return NULL; } - rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v)); + + if (self) { + const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); + if (old->category_changed == 0) { + /* unassigned */ + have_old = 1; + rc = -1; + } + else if (old->decimal_changed != 0xFF) { + have_old = 1; + rc = old->decimal_changed; + } + } + + if (!have_old) + rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v)); if (rc < 0) { if (defobj == NULL) { PyErr_SetString(PyExc_ValueError, "not a numeric character"); @@ -180,6 +300,11 @@ unicodedata_category(PyObject *self, PyObject *args) return NULL; } index = (int) _getrecord(v)->category; + if (self) { + const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); + if (old->category_changed != 0xFF) + index = old->category_changed; + } return PyString_FromString(_PyUnicode_CategoryNames[index]); } @@ -205,6 +330,13 @@ unicodedata_bidirectional(PyObject *self, PyObject *args) return NULL; } index = (int) _getrecord(v)->bidirectional; + if (self) { + const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); + if (old->category_changed == 0) + index = 0; /* unassigned */ + else if (old->bidir_changed != 0xFF) + index = old->bidir_changed; + } return PyString_FromString(_PyUnicode_BidirectionalNames[index]); } @@ -219,6 +351,7 @@ static PyObject * unicodedata_combining(PyObject *self, PyObject *args) { PyUnicodeObject *v; + int index; if (!PyArg_ParseTuple(args, "O!:combining", &PyUnicode_Type, &v)) @@ -228,7 +361,13 @@ unicodedata_combining(PyObject *self, PyObject *args) "need a single Unicode character as parameter"); return NULL; } - return PyInt_FromLong((int) _getrecord(v)->combining); + index = (int) _getrecord(v)->combining; + if (self) { + const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); + if (old->category_changed == 0) + index = 0; /* unassigned */ + } + return PyInt_FromLong(index); } PyDoc_STRVAR(unicodedata_mirrored__doc__, @@ -242,6 +381,7 @@ static PyObject * unicodedata_mirrored(PyObject *self, PyObject *args) { PyUnicodeObject *v; + int index; if (!PyArg_ParseTuple(args, "O!:mirrored", &PyUnicode_Type, &v)) @@ -251,7 +391,13 @@ unicodedata_mirrored(PyObject *self, PyObject *args) "need a single Unicode character as parameter"); return NULL; } - return PyInt_FromLong((int) _getrecord(v)->mirrored); + index = (int) _getrecord(v)->mirrored; + if (self) { + const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); + if (old->category_changed == 0) + index = 0; /* unassigned */ + } + return PyInt_FromLong(index); } PyDoc_STRVAR(unicodedata_east_asian_width__doc__, @@ -275,6 +421,11 @@ unicodedata_east_asian_width(PyObject *self, PyObject *args) return NULL; } index = (int) _getrecord(v)->east_asian_width; + if (self) { + const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); + if (old->category_changed == 0) + index = 0; /* unassigned */ + } return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]); } @@ -303,6 +454,12 @@ unicodedata_decomposition(PyObject *self, PyObject *args) code = (int) *PyUnicode_AS_UNICODE(v); + if (self) { + const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); + if (old->category_changed == 0) + return PyString_FromString(""); /* unassigned */ + } + if (code < 0 || code >= 0x110000) index = 0; else { @@ -337,11 +494,14 @@ unicodedata_decomposition(PyObject *self, PyObject *args) } void -get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count) +get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count) { if (code >= 0x110000) { *index = 0; - } + } else if (self && get_old_record(self, code)->category_changed==0) { + /* unassigned in old version */ + *index = 0; + } else { *index = decomp_index1[(code>>DECOMP_SHIFT)]; *index = decomp_index2[(*index<<DECOMP_SHIFT)+ @@ -367,7 +527,7 @@ get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count) #define SCount (LCount*NCount) static PyObject* -nfd_nfkd(PyObject *input, int k) +nfd_nfkd(PyObject *self, PyObject *input, int k) { PyObject *result; Py_UNICODE *i, *end, *o; @@ -416,8 +576,17 @@ nfd_nfkd(PyObject *input, int k) } continue; } - /* Other decompoistions. */ - get_decomp_record(code, &index, &prefix, &count); + /* normalization changes */ + if (self) { + Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code); + if (value != 0) { + stack[stackptr++] = value; + continue; + } + } + + /* Other decompositions. */ + get_decomp_record(self, code, &index, &prefix, &count); /* Copy character if it is not decomposable, or has a compatibility decomposition, but we do NFD. */ @@ -467,7 +636,7 @@ nfd_nfkd(PyObject *input, int k) } static int -find_nfc_index(struct reindex* nfc, Py_UNICODE code) +find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code) { int index; for (index = 0; nfc[index].start; index++) { @@ -483,7 +652,7 @@ find_nfc_index(struct reindex* nfc, Py_UNICODE code) } static PyObject* -nfc_nfkc(PyObject *input, int k) +nfc_nfkc(PyObject *self, PyObject *input, int k) { PyObject *result; Py_UNICODE *i, *i1, *o, *end; @@ -492,7 +661,7 @@ nfc_nfkc(PyObject *input, int k) Py_UNICODE *skipped[20]; int cskipped = 0; - result = nfd_nfkd(input, k); + result = nfd_nfkd(self, input, k); if (!result) return NULL; @@ -536,7 +705,7 @@ nfc_nfkc(PyObject *input, int k) continue; } - f = find_nfc_index(nfc_first, *i); + f = find_nfc_index(self, nfc_first, *i); if (f == -1) { *o++ = *i++; continue; @@ -551,7 +720,7 @@ nfc_nfkc(PyObject *input, int k) i1++; continue; } - l = find_nfc_index(nfc_last, *i1); + l = find_nfc_index(self, nfc_last, *i1); /* *i1 cannot be combined with *i. If *i1 is a starter, we don't need to look further. Otherwise, record the combining class. */ @@ -575,7 +744,7 @@ nfc_nfkc(PyObject *input, int k) /* Mark the second character unused. */ skipped[cskipped++] = i1; i1++; - f = find_nfc_index(nfc_first, *i); + f = find_nfc_index(self, nfc_first, *i); if (f == -1) break; } @@ -610,13 +779,13 @@ unicodedata_normalize(PyObject *self, PyObject *args) } if (strcmp(form, "NFC") == 0) - return nfc_nfkc(input, 0); + return nfc_nfkc(self, input, 0); if (strcmp(form, "NFKC") == 0) - return nfc_nfkc(input, 1); + return nfc_nfkc(self, input, 1); if (strcmp(form, "NFD") == 0) - return nfd_nfkd(input, 0); + return nfd_nfkd(self, input, 0); if (strcmp(form, "NFKD") == 0) - return nfd_nfkd(input, 1); + return nfd_nfkd(self, input, 1); PyErr_SetString(PyExc_ValueError, "invalid normalization form"); return NULL; } @@ -686,7 +855,7 @@ is_unified_ideograph(Py_UCS4 code) } static int -_getucname(Py_UCS4 code, char* buffer, int buflen) +_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen) { int offset; int i; @@ -726,6 +895,15 @@ _getucname(Py_UCS4 code, char* buffer, int buflen) if (code >= 0x110000) return 0; + if (self) { + const change_record *old = get_old_record(self, code); + if (old->category_changed == 0) { + /* unassigned */ + return 0; + } + } + + /* get offset into phrasebook */ offset = phrasebook_offset1[(code>>phrasebook_shift)]; offset = phrasebook_offset2[(offset<<phrasebook_shift) + @@ -768,12 +946,12 @@ _getucname(Py_UCS4 code, char* buffer, int buflen) } static int -_cmpname(int code, const char* name, int namelen) +_cmpname(PyObject *self, int code, const char* name, int namelen) { /* check if code corresponds to the given name */ int i; char buffer[NAME_MAXLEN]; - if (!_getucname(code, buffer, sizeof(buffer))) + if (!_getucname(self, code, buffer, sizeof(buffer))) return 0; for (i = 0; i < namelen; i++) { if (toupper(name[i]) != buffer[i]) @@ -803,7 +981,7 @@ find_syllable(const char *str, int *len, int *pos, int count, int column) } static int -_getcode(const char* name, int namelen, Py_UCS4* code) +_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code) { unsigned int h, v; unsigned int mask = code_size-1; @@ -860,7 +1038,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code) v = code_hash[i]; if (!v) return 0; - if (_cmpname(v, name, namelen)) { + if (_cmpname(self, v, name, namelen)) { *code = v; return 1; } @@ -872,7 +1050,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code) v = code_hash[i]; if (!v) return 0; - if (_cmpname(v, name, namelen)) { + if (_cmpname(self, v, name, namelen)) { *code = v; return 1; } @@ -914,8 +1092,8 @@ unicodedata_name(PyObject* self, PyObject* args) return NULL; } - if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v), - name, sizeof(name))) { + if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v), + name, sizeof(name))) { if (defobj == NULL) { PyErr_SetString(PyExc_ValueError, "no such name"); return NULL; @@ -947,7 +1125,7 @@ unicodedata_lookup(PyObject* self, PyObject* args) if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) return NULL; - if (!_getcode(name, namelen, &code)) { + if (!_getcode(self, name, namelen, &code)) { char fmt[] = "undefined character name '%s'"; char *buf = PyMem_MALLOC(sizeof(fmt) + namelen); sprintf(buf, fmt, name); @@ -985,6 +1163,8 @@ static PyMethodDef unicodedata_functions[] = { {NULL, NULL} /* sentinel */ }; + + PyDoc_STRVAR(unicodedata_docstring, "This module provides access to the Unicode Character Database which\n\ defines character properties for all Unicode characters. The data in\n\ @@ -1007,6 +1187,11 @@ initunicodedata(void) PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION); + /* Previous versions */ + v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0); + if (v != NULL) + PyModule_AddObject(m, "db_3_2_0", v); + /* Export C API */ v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); if (v != NULL) |