/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2018 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/


#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
#define PCRE2_UCP_H_IDEMPOTENT_GUARD

/* This file contains definitions of the property values that are returned by
the UCD access macros. New values that are added for new releases of Unicode
should always be at the end of each enum, for backwards compatibility.

IMPORTANT: Note also that the specific numeric values of the enums have to be
the same as the values that are generated by the maint/MultiStage2.py script,
where the equivalent property descriptive names are listed in vectors.

ALSO: The specific values of the first two enums are assumed for the table
called catposstab in pcre2_compile.c. */

/* These are the general character categories. The enumerators take the
implicit values 0..6 in the order listed; do not reorder or insert entries,
because the numeric values are relied upon outside this file. */

enum {
  ucp_C,     /* 0 Other */
  ucp_L,     /* 1 Letter */
  ucp_M,     /* 2 Mark */
  ucp_N,     /* 3 Number */
  ucp_P,     /* 4 Punctuation */
  ucp_S,     /* 5 Symbol */
  ucp_Z      /* 6 Separator */
};

/* These are the particular character categories. The enumerators take the
implicit values 0..29 in the order listed (grouped by the first letter of the
category); do not reorder or insert entries, because the numeric values are
relied upon outside this file. */

enum {
  ucp_Cc,    /*  0 Control */
  ucp_Cf,    /*  1 Format */
  ucp_Cn,    /*  2 Unassigned */
  ucp_Co,    /*  3 Private use */
  ucp_Cs,    /*  4 Surrogate */
  ucp_Ll,    /*  5 Lower case letter */
  ucp_Lm,    /*  6 Modifier letter */
  ucp_Lo,    /*  7 Other letter */
  ucp_Lt,    /*  8 Title case letter */
  ucp_Lu,    /*  9 Upper case letter */
  ucp_Mc,    /* 10 Spacing mark */
  ucp_Me,    /* 11 Enclosing mark */
  ucp_Mn,    /* 12 Non-spacing mark */
  ucp_Nd,    /* 13 Decimal number */
  ucp_Nl,    /* 14 Letter number */
  ucp_No,    /* 15 Other number */
  ucp_Pc,    /* 16 Connector punctuation */
  ucp_Pd,    /* 17 Dash punctuation */
  ucp_Pe,    /* 18 Close punctuation */
  ucp_Pf,    /* 19 Final punctuation */
  ucp_Pi,    /* 20 Initial punctuation */
  ucp_Po,    /* 21 Other punctuation */
  ucp_Ps,    /* 22 Open punctuation */
  ucp_Sc,    /* 23 Currency symbol */
  ucp_Sk,    /* 24 Modifier symbol */
  ucp_Sm,    /* 25 Mathematical symbol */
  ucp_So,    /* 26 Other symbol */
  ucp_Zl,    /* 27 Line separator */
  ucp_Zp,    /* 28 Paragraph separator */
  ucp_Zs     /* 29 Space separator */
};

/* These are grapheme break properties. The Extended Pictographic property
comes from the emoji-data.txt file. The number in each comment is the
enumerator's implicit value. */

enum {
  ucp_gbCR,                    /*  0 */
  ucp_gbLF,                    /*  1 */
  ucp_gbControl,               /*  2 */
  ucp_gbExtend,                /*  3 */
  ucp_gbPrepend,               /*  4 */
  ucp_gbSpacingMark,           /*  5 */
  ucp_gbL,                     /*  6 Hangul syllable type L */
  ucp_gbV,                     /*  7 Hangul syllable type V */
  ucp_gbT,                     /*  8 Hangul syllable type T */
  ucp_gbLV,                    /*  9 Hangul syllable type LV */
  ucp_gbLVT,                   /* 10 Hangul syllable type LVT */
  ucp_gbRegionalIndicator,     /* 11 */
  ucp_gbOther,                 /* 12 */
  ucp_gbZWJ,                   /* 13 */
  ucp_gbExtended_Pictographic  /* 14 */
};

/* These are the script identifications. Values are implicit, starting at 0
for ucp_Unknown. Scripts introduced by later Unicode releases are appended in
per-release groups at the end, so that the values of existing entries never
change; add any new script after the last entry. */

enum {
  ucp_Unknown,
  ucp_Arabic,
  ucp_Armenian,
  ucp_Bengali,
  ucp_Bopomofo,
  ucp_Braille,
  ucp_Buginese,
  ucp_Buhid,
  ucp_Canadian_Aboriginal,
  ucp_Cherokee,
  ucp_Common,
  ucp_Coptic,
  ucp_Cypriot,
  ucp_Cyrillic,
  ucp_Deseret,
  ucp_Devanagari,
  ucp_Ethiopic,
  ucp_Georgian,
  ucp_Glagolitic,
  ucp_Gothic,
  ucp_Greek,
  ucp_Gujarati,
  ucp_Gurmukhi,
  ucp_Han,
  ucp_Hangul,
  ucp_Hanunoo,
  ucp_Hebrew,
  ucp_Hiragana,
  ucp_Inherited,
  ucp_Kannada,
  ucp_Katakana,
  ucp_Kharoshthi,
  ucp_Khmer,
  ucp_Lao,
  ucp_Latin,
  ucp_Limbu,
  ucp_Linear_B,
  ucp_Malayalam,
  ucp_Mongolian,
  ucp_Myanmar,
  ucp_New_Tai_Lue,
  ucp_Ogham,
  ucp_Old_Italic,
  ucp_Old_Persian,
  ucp_Oriya,
  ucp_Osmanya,
  ucp_Runic,
  ucp_Shavian,
  ucp_Sinhala,
  ucp_Syloti_Nagri,
  ucp_Syriac,
  ucp_Tagalog,
  ucp_Tagbanwa,
  ucp_Tai_Le,
  ucp_Tamil,
  ucp_Telugu,
  ucp_Thaana,
  ucp_Thai,
  ucp_Tibetan,
  ucp_Tifinagh,
  ucp_Ugaritic,
  ucp_Yi,
  /* New for Unicode 5.0 */
  ucp_Balinese,
  ucp_Cuneiform,
  ucp_Nko,
  ucp_Phags_Pa,
  ucp_Phoenician,
  /* New for Unicode 5.1 */
  ucp_Carian,
  ucp_Cham,
  ucp_Kayah_Li,
  ucp_Lepcha,
  ucp_Lycian,
  ucp_Lydian,
  ucp_Ol_Chiki,
  ucp_Rejang,
  ucp_Saurashtra,
  ucp_Sundanese,
  ucp_Vai,
  /* New for Unicode 5.2 */
  ucp_Avestan,
  ucp_Bamum,
  ucp_Egyptian_Hieroglyphs,
  ucp_Imperial_Aramaic,
  ucp_Inscriptional_Pahlavi,
  ucp_Inscriptional_Parthian,
  ucp_Javanese,
  ucp_Kaithi,
  ucp_Lisu,
  ucp_Meetei_Mayek,
  ucp_Old_South_Arabian,
  ucp_Old_Turkic,
  ucp_Samaritan,
  ucp_Tai_Tham,
  ucp_Tai_Viet,
  /* New for Unicode 6.0.0 */
  ucp_Batak,
  ucp_Brahmi,
  ucp_Mandaic,
  /* New for Unicode 6.1.0 */
  ucp_Chakma,
  ucp_Meroitic_Cursive,
  ucp_Meroitic_Hieroglyphs,
  ucp_Miao,
  ucp_Sharada,
  ucp_Sora_Sompeng,
  ucp_Takri,
  /* New for Unicode 7.0.0 */
  ucp_Bassa_Vah,
  ucp_Caucasian_Albanian,
  ucp_Duployan,
  ucp_Elbasan,
  ucp_Grantha,
  ucp_Khojki,
  ucp_Khudawadi,
  ucp_Linear_A,
  ucp_Mahajani,
  ucp_Manichaean,
  ucp_Mende_Kikakui,
  ucp_Modi,
  ucp_Mro,
  ucp_Nabataean,
  ucp_Old_North_Arabian,
  ucp_Old_Permic,
  ucp_Pahawh_Hmong,
  ucp_Palmyrene,
  ucp_Psalter_Pahlavi,
  ucp_Pau_Cin_Hau,
  ucp_Siddham,
  ucp_Tirhuta,
  ucp_Warang_Citi,
  /* New for Unicode 8.0.0 */
  ucp_Ahom,
  ucp_Anatolian_Hieroglyphs,
  ucp_Hatran,
  ucp_Multani,
  ucp_Old_Hungarian,
  ucp_SignWriting,
  /* New for Unicode 10.0.0 (no update since 8.0.0) */
  ucp_Adlam,
  ucp_Bhaiksuki,
  ucp_Marchen,
  ucp_Newa,
  ucp_Osage,
  ucp_Tangut,
  ucp_Masaram_Gondi,
  ucp_Nushu,
  ucp_Soyombo,
  ucp_Zanabazar_Square,
  /* New for Unicode 11.0.0 */
  ucp_Dogra,
  ucp_Gunjala_Gondi,
  ucp_Hanifi_Rohingya,
  ucp_Makasar,
  ucp_Medefaidrin,
  ucp_Old_Sogdian,
  ucp_Sogdian,
  /* New for Unicode 12.0.0 */
  ucp_Elymaic,
  ucp_Nandinagari,
  ucp_Nyiakeng_Puachue_Hmong,
  ucp_Wancho,
  /* New for Unicode 13.0.0 */
  ucp_Chorasmian,
  ucp_Dives_Akuru,
  ucp_Khitan_Small_Script,
  ucp_Yezidi
};

#endif  /* PCRE2_UCP_H_IDEMPOTENT_GUARD */

/* End of pcre2_ucp.h */