version=0.2
[RULE-ORDER]
# Names of the tokenization rules, whitespace-separated and spread over
# several lines. Presumably the rules are tried in the order listed here
# (first match wins) -- confirm against the tokenizer documentation.
# Names that are not defined in [META-RULES] or [RULES] in this file
# (URL, URL-WWW, URL-DOMAIN, E-MAIL, SMILEY, REVERSE-SMILEY) presumably come
# from the external rule files pulled in at the top of [RULES].
WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN WORD-INFIX-COMPOUND NUMBER-ORDINAL
URL URL-WWW URL-DOMAIN E-MAIL
WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
ABBREVIATION INITIALS INITIAL
SMILEY REVERSE-SMILEY
PUNCTUATION-MULTI DATE DATE-REVERSE
NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
# TODO: PREFIXES (currently empty) and UNITS (commented out in the C++ code)

[META-RULES]
# Rule templates. SPLITTER names the character that delimits placeholders
# such as %ORDINALS%; the placeholder names match list sections further down
# in this file ([ORDINALS], [ABBREVIATIONS], [TOKENS], [ATTACHEDSUFFIXES]).
# Presumably each placeholder is replaced by an alternation of that section's
# entries before the pattern is compiled -- confirm against the tokenizer docs.
SPLITTER=%
# Digits (optionally separated by '.' or ','), an optional dash, then a
# case-insensitive ordinal suffix, anchored at the end of the input.
# NOTE(review): (?:\Z|\P{Lu}|\P{Ll}) looks like it accepts any single
# character (no character is both Lu and Ll); \P{L} may have been intended.
NUMBER-ORDINAL = (?>(?:[\.,]?\p{N}+)+)-?(?i)(?: %ORDINALS% )(?:\Z|\P{Lu}|\P{Ll})$
# Captures a known abbreviation plus its trailing period. It may be preceded
# by punctuation, must not directly follow a letter or a period, and must be
# followed by a non-letter or the end of the input.
ABBREVIATION-KNOWN = (?:\p{P}*)?(?:\A|[^\p{L}\.])((?:%ABBREVIATIONS%)\.)(?:\Z|\P{L})
# Captures one of the [TOKENS] entries, optionally followed by punctuation,
# at the end of the input.
WORD-TOKEN =(%TOKENS%)(?:\p{P}*)?$
# Captures a run of letters, digits and dashes together with an attached
# suffix from [ATTACHEDSUFFIXES], at the very end of the input.
WORD-WITHSUFFIX = ((?:\p{L}|\p{N}|-)+(?: %ATTACHEDSUFFIXES% ))(?:\Z)
# Same as WORD-WITHSUFFIX, but the suffix is followed by a dash and one or
# more letters, so the whole compound is captured as one unit.
WORD-INFIX-COMPOUND = ((?:\p{L}|\p{N}|-)+(?: %ATTACHEDSUFFIXES% )-(?:\p{L}+))$
# The two templates below are disabled. PREFIX refers to the (empty)
# [PREFIXES] section; SUFFIX refers to a SUFFIXES list that does not appear
# in this file.
#PREFIX = (?:\A|[^\p{Lu}\.]|[^\p{Ll}\.])(%PREFIXES% )(\p{L}+)
#SUFFIX = ((?:\p{L})+)( %SUFFIXES% )(?:\Z|\P{L})

[RULES]
# Rule definitions in NAME=regex form. The three directives below pull in
# external rule files named url, e-mail and smiley (not visible here);
# presumably they define the URL, URL-WWW, URL-DOMAIN, E-MAIL, SMILEY and
# REVERSE-SMILEY rules that [RULE-ORDER] lists -- confirm in those files.
%include url
%include e-mail
%include smiley

# Words with bracketed parts. In the four patterns below \p{Ps} / \p{Pe} are
# opening / closing bracket characters, \p{Pc} is connector punctuation
# (such as the underscore) and \p{Pd} is dash punctuation.

# Word with both a bracketed prefix and a bracketed suffix.
# Example: (oud)-studente(s)
WORD-PARPREFIX-PARSUFFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})

# Word with a bracketed prefix only; a dash or connector may sit inside or
# directly after the closing bracket.
# Examples: (oud)-studente, (on)zin
WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*

# Word with a bracketed suffix only.
# Example: koning(in)
WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})

# Keep parts joined by a dash or connector (underscore) together as one word:
# two or more letter runs, each separated by exactly one such character.
WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+

# Abbreviations with multiple periods: at the start of the input, after an
# optional opening bracket ({, (, [ or <), capture two or more groups of 1-3
# letters separated by periods, with an optional final period. The match must
# be followed by the end of the input or by one of , : ; } ) ] >
ABBREVIATION=^(?:[\{\(\[\<]?)(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)(?:\Z|[,:;\}\)\]\>])

# Initials glued to a longer word: A.F.Zetterij -> A.F. Zetterij
# Only the initials (single letters, each followed by a period) are captured;
# the glued word must start with an uppercase letter followed by at least
# three more letters.
INITIALS=(\p{L}(?:\.\p{L})+\.)\p{Lu}\p{L}{3,999}+

# Retain a lone initial: the whole input is exactly one titlecase or
# uppercase letter followed by a period.
INITIAL=^(?:\p{Lt}|\p{Lu})\.$

# Disabled inline smiley rule, kept for reference. SMILEY is still listed in
# [RULE-ORDER], so the active definition presumably comes from the external
# smiley rule file pulled in at the top of this section.
#SMILEY=^(?:>?[:;]['`^]?[-~]*[)}\](\\/\[{Ss\$PpDd]+)$

# Runs of two or more punctuation characters drawn from . - ! ? (ellipsis
# etc). The characters in a run may differ, so mixed runs such as ?! match too.
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}

# Dates written with dashes.
# DATE: 1-2 digits, 1-2 digits, then 2-4 digits (e.g. 31-12-1999 or 1-2-99).
# DATE-REVERSE: 4 digits first, then two groups of 1-2 digits (e.g. 1999-12-31).
# The patterns only check digit counts, not whether the values form a valid
# calendar date.
DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}

# Fractions: two or more digit runs separated by slashes (1/2, but also 1/2/3).
FRACNUMBER=\p{N}+(?:/\p{N}+)+

# Abbreviated year: captures an apostrophe-like character followed by exactly
# two digits (e.g. '98), provided a non-digit or the end of the input follows.
NUMBER-YEAR=(['`’‘´]\p{N}{2})(?:\P{N}|\z)

# Times: hours:minutes with an optional :seconds part and an optional
# case-insensitive am/pm marker. The seconds part accepts one or two digits,
# so that 12:30:45 is kept together instead of being cut off after 12:30:4.
TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N}{1,2})?(?i:am|pm)?

# Retain numbers: an optional leading minus, then one or more digit runs that
# may each be preceded by a period or comma. This covers numbers starting
# with a period (.22), negative numbers and separators (1.000,50).
NUMBER=-?(?:[\.,]?\p{N}+)+

# A single Unicode currency symbol (general category Sc).
CURRENCY=\p{Sc}

# A run of letters and nonspacing combining marks.
WORD=[\p{L}\p{Mn}]+

# A single punctuation character.
PUNCTUATION=\p{P}

# Fallback: any single character. Listed last in [RULE-ORDER].
UNKNOWN=.

[PREFIXES]
# Intentionally empty for now. The only reference to this list in this file
# is the disabled PREFIX template in [META-RULES].


[ORDINALS]
# Ordinal suffixes, one per line. Referenced through the ORDINALS placeholder
# in the NUMBER-ORDINAL template, which matches them case-insensitively after
# a number (e.g. 1e, 2de, 1ste).
e
de
ste

[ATTACHEDSUFFIXES]
# Suffixes that are kept together with the preceding word; referenced through
# the ATTACHEDSUFFIXES placeholder by the WORD-WITHSUFFIX and
# WORD-INFIX-COMPOUND templates in [META-RULES].
# The character class lists each apostrophe-like character exactly once, in
# the same form as the entries in [TOKENS].
# The leading backslash presumably keeps the line from being read as a
# section header -- confirm against the tokenizer documentation.
\['`’‘´]s

[TOKENS]
# Clitic-like tokens: an apostrophe-like character followed by one letter
# ('s, 'k, 'm, 'n, 't, 'd, 'e). Referenced through the TOKENS placeholder in
# the WORD-TOKEN template in [META-RULES].
# The leading backslash presumably keeps each line from being read as a
# section header -- confirm against the tokenizer documentation.
\['`’‘´]s
\['`’‘´]k
\['`’‘´]m
\['`’‘´]n
\['`’‘´]t
\['`’‘´]d
\['`’‘´]e

[UNITS]
# Unit abbreviations, one per line. No rule or template in this file refers
# to this list, and the note at the end of [RULE-ORDER] says UNITS handling
# is commented out in the C++ code, so these entries are presumably unused
# at the moment.
km
m
cm
mm
g
kg
C
l
s
sec
min
gb
mb
kb


[CURRENCY]
# Textual currency abbreviations, one per line. The CURRENCY rule in [RULES]
# only matches Unicode currency symbols and does not reference this list.
# NOTE(review): how (or whether) the tokenizer uses these entries cannot be
# determined from this file -- confirm against the tokenizer documentation.
EUR
hfl
fl
f


[ABBREVIATIONS]
# Known abbreviations, referenced through the ABBREVIATIONS placeholder in
# the ABBREVIATION-KNOWN template. The entries are pulled in from an external
# file named fry (not visible here).
%include fry

[FILTER]
# Character filter entries. Most come from an external file named ligatures
# (not visible here). Presumably each entry names a character sequence to be
# replaced or removed before tokenization -- confirm against the tokenizer
# documentation.
%include ligatures
# Also filter the soft hyphen (U+00AD).
\u00AD

[EOSMARKERS]
# End-of-sentence markers, pulled in from an external file named
# standard-eos (not visible here).
%include standard-eos

[QUOTES]
# Quote character definitions, pulled in from two external files named
# standard-quotes and exotic-quotes (not visible here).
%include standard-quotes
%include exotic-quotes
