version=0.2
#by Turkish National Corpus Team

#Space-separated list of rule names, continued over several lines.
#The names refer to rules defined under [META-RULES] and [RULES] in this file;
#URL, URL-WWW, URL-DOMAIN, E-MAIL, SMILEY and REVERSE-SMILEY are not defined
#here and presumably come from the included rule files.
#NOTE(review): list position presumably sets rule priority (earlier first) - confirm against the tokenizer.
[RULE-ORDER]
NUMBER-ORDINAL ROMAN-NUMERALS DATE NP-COMP URL URL-WWW URL-DOMAIN
E-MAIL ABBREVIATION INITIAL SMILEY REVERSE-SMILEY ABBREVIATION-KNOWN
PUNCTUATION-MULTI
NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN


#Rule templates. A name wrapped in the SPLITTER character refers to one of the
#list sections of this file (here [ORDINALS] and [ABBREVIATIONS]).
#NOTE(review): presumably the placeholder expands to an alternation of that section's entries - confirm.
[META-RULES]
SPLITTER=%
#Number (digit groups with optional leading period/comma), an optional hyphen,
#then an ordinal marker taken from [ORDINALS].
#NOTE(review): \P{Lu}|\P{Ll} accepts any character, because no character is both
#uppercase and lowercase; [^\p{Lu}\p{Ll}] may have been intended - confirm before changing.
NUMBER-ORDINAL = (?>(?:[\.,]?\p{N}+)+)-?(?i)(?: %ORDINALS% )(?:\Z|\P{Lu}|\P{Ll})$

#Listed abbreviation followed by a period. Only the abbreviation plus its period
#is captured; the preceding context must be the start of input or a character
#that is neither a letter nor a period, and the following one a non-letter or the end.
ABBREVIATION-KNOWN = (?:\p{P}*)?(?:\A|[^\p{L}\.])((?:%ABBREVIATIONS%)\.)(?:\Z|\P{L})
#Disabled prefix/suffix templates; [PREFIXES] and [SUFFIXES] are empty in this file.
#PREFIX = (?:\A|[^\p{Lu}\.]|[^\p{Ll}\.])(%PREFIXES% )(\p{L}+)
#SUFFIX = ((?:\p{L})+)( %SUFFIXES% )(?:\Z|\P{L})

#Regular-expression rules, one NAME=pattern entry per line.
[RULES]
#Rule sets pulled in from the external files named url, e-mail and smiley.
#NOTE(review): presumably these supply the URL*, E-MAIL, SMILEY and REVERSE-SMILEY rules named in the rule order - confirm.
%include url
%include e-mail
%include smiley

#Affixed proper nouns and hyphenated compounds: runs of letters joined by an
#apostrophe-like mark (' ` ’ ‘ ´) or a hyphen. Underscore is not in the joiner set.
#Ex= <Mahir'in> <siyah-beyaz>
NP-COMP=\p{L}+(?:['`’‘´-]\p{L}+)+

#Abbreviations with multiple periods as in <A.B.C>: groups of 1-3 letters joined
#by periods, with an optional trailing period. The match must begin at the start
#of the input, may be preceded by one opening bracket ({ ( [ <), and must be
#followed by the end of input or one of , : ; } ) ] >. Only the abbreviation is captured.
#NOTE(review): a second rule with the same name ABBREVIATION is defined further
#down in this section; which definition takes effect depends on how the parser
#handles duplicate rule names - confirm and keep only one.
ABBREVIATION=^(?:[\{\(\[\<]?)(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)(?:\Z|[,:;\}\)\]\>])

#Retain initials: exactly one titlecase or uppercase letter followed by a period,
#anchored at both ends. Ex= <A.>
INITIAL=^(?:\p{Lt}|\p{Lu})\.$

#Punctuation runs (ellipsis etc): two or more consecutive characters drawn from
#period, hyphen, exclamation mark and question mark. The run does not have to
#repeat a single character, so mixed runs such as <?!> match as well as <...>.
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}

#Abbreviations with multiple periods: groups of 1-3 letters joined by periods,
#with an optional trailing period, spanning the whole input (no surrounding
#brackets or trailing punctuation allowed).
#NOTE(review): this is the second definition of ABBREVIATION in this section; an
#earlier one also accepts a leading bracket and trailing punctuation. Which
#definition takes effect depends on how the parser handles duplicate rule
#names - confirm and keep only one.
ABBREVIATION=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)\Z

#Date with an attached suffix: three period-separated digit groups (1-2, 1-2 and
#2-4 digits) followed by one or more suffix parts, each introduced by an
#apostrophe-like mark (' ` ’ ‘ ´) or a hyphen.
#Ex= <12.05.2010'da>
#The suffix group is mandatory, so a bare date such as <12.05.2010> does not
#match this rule.
#NOTE(review): bare dates presumably fall through to the NUMBER rule - confirm this is intended.
DATE=\p{N}{1,2}\.\p{N}{1,2}\.\p{N}{2,4}(?:['`’‘´-]\p{L}+)+

#Abbreviated year: an apostrophe-like mark (' ` ’ ‘ ´) plus exactly two digits,
#captured when followed by a non-digit or the end of input. Ex= <'98>
NUMBER-YEAR=(['`’‘´]\p{N}{2})(?:\P{N}|\z)

#Fractions: two or more digit groups separated by slashes. Ex= <1/2> <3/4/5>
FRACNUMBER=\p{N}+(?:/\p{N}+)+

#Times: hours and minutes of one or two digits each, separated by a colon,
#optionally followed by a seconds field of one or two digits and by an am/pm
#marker (case-insensitive, periods optional).
#Ex= <9:05> <12:30:45> <10:15pm>
#The seconds field accepts two digits so that <12:30:45> is matched as a whole
#instead of stopping after the first seconds digit.
TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N}{1,2})?(?i:a\.?m\.?|p\.?m\.?)?

#Retain digits: one or more digit groups, each optionally preceded by a period
#or comma, with an optional leading minus sign. This covers numbers that start
#with a period (.22), grouped/decimal numbers (1.234,56) and negative numbers (-5).
NUMBER=-?(?:[\.,]?\p{N}+)+

#Roman numerals followed by a period: one or two characters from I, V, X (either
#case) starting at a word boundary. Ex= <IV.> <xi.>
#NOTE(review): numerals longer than two symbols (III., VIII.) and the symbols
#L, C, D, M are not covered - confirm this is intended.
ROMAN-NUMERALS=\b[IVXivx]{1,2}\.

#A single currency symbol (Unicode category Sc).
CURRENCY=\p{Sc}

#A run of letters and non-spacing combining marks (Unicode categories L and Mn).
WORD=[\p{L}\p{Mn}]+

#A single punctuation character (Unicode category P).
PUNCTUATION=\p{P}

#Catch-all: any single character.
UNKNOWN=.

#Empty list; only the commented-out PREFIX meta-rule refers to it.
[PREFIXES]

#Empty list; only the commented-out SUFFIX meta-rule refers to it.
[SUFFIXES]

#Ordinals and Numbering
#Markers referenced by the NUMBER-ORDINAL meta-rule: an escaped period, closing
#parenthesis and hyphen, one per line.
[ORDINALS]
\.
\)
\-

#Empty list.
[TOKENS]

#Unit abbreviations, one per line.
#NOTE(review): no meta-rule visible in this file refers to this list - confirm whether it is used.
[UNITS]
km
m
cm
mm
g
kg
C
l
s
sec
min
gb
mb
kb

#Currency codes, one per line. This list is separate from the CURRENCY rule in
#the rules section, which matches a currency symbol rather than these codes.
#NOTE(review): no meta-rule visible in this file refers to this list - confirm whether it is used.
[CURRENCY]
YTL
TL
EUR
USD
GBP
CAD
NZD
AUD
SGD
HKD

#Known abbreviations, referenced by the ABBREVIATION-KNOWN meta-rule. The
#entries are pulled in from the external list named tur.
[ABBREVIATIONS]
%include tur

#End-of-sentence marker characters, one escaped code point per entry.
#Each entry is preceded by the literal character, its Unicode name, and its
#code point in decimal with the hexadecimal form in parentheses.
#NOTE(review): the ASCII full stop (U+002E) is not listed here; presumably it is
#handled elsewhere - confirm.
[EOSMARKERS]
# Character: !
# Name: EXCLAMATION MARK
# Code: 33 (0x21)
\u0021

# Character: ?
# Name: QUESTION MARK
# Code: 63 (0x3f)
\u003F

# Character: ;
# Name: GREEK QUESTION MARK
# Code: 894 (0x37e)
\u037e

# Character: ؟
# Name: ARABIC QUESTION MARK
# Code: 1567 (0x61f)
\u061f

# Character: 。
# Name: IDEOGRAPHIC FULL STOP
# Code: 12290 (0x3002)
\u3002

# Character: ｡
# Name: HALFWIDTH IDEOGRAPHIC FULL STOP
# Code: 65377 (0xff61)
\uff61

# Character: ？
# Name: FULLWIDTH QUESTION MARK
# Code: 65311 (0xff1f)
\uff1f

# Character: ！
# Name: FULLWIDTH EXCLAMATION MARK
# Code: 65281 (0xff01)
\uff01

# Character: ।
# Name: DEVANAGARI DANDA
# Code: 2404 (0x964)
\u0964

# Character: ։
# Name: ARMENIAN FULL STOP
# Code: 1417 (0x589)
\u0589

# Character: ՞
# Name: ARMENIAN QUESTION MARK
# Code: 1374 (0x55e)
\u055e

# Character: ።
# Name: ETHIOPIC FULL STOP
# Code: 4962 (0x1362)
\u1362

# Character: ᙮
# Name: CANADIAN SYLLABICS FULL STOP
# Code: 5742 (0x166e)
\u166e

# Character: ។
# Name: KHMER SIGN KHAN
# Code: 6100 (0x17d4)
\u17d4

# Character: ៕
# Name: KHMER SIGN BARIYOOSAN
# Code: 6101 (0x17d5)
\u17d5

# Character: ᠃
# Name: MONGOLIAN FULL STOP
# Code: 6147 (0x1803)
\u1803

# Character: ᠉
# Name: MONGOLIAN MANCHU FULL STOP
# Code: 6153 (0x1809)
\u1809
