1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
|
'use strict';
module.exports = function (opts) {
var re = {};
// Use direct extract instead of `regenerate` to reduse browserified size
re.src_Any = require('uc.micro/properties/Any/regex').source;
re.src_Cc = require('uc.micro/categories/Cc/regex').source;
re.src_Z = require('uc.micro/categories/Z/regex').source;
re.src_P = require('uc.micro/categories/P/regex').source;
// \p{\Z\P\Cc\CF} (white spaces + control + format + punctuation)
re.src_ZPCc = [ re.src_Z, re.src_P, re.src_Cc ].join('|');
// \p{\Z\Cc} (white spaces + control)
re.src_ZCc = [ re.src_Z, re.src_Cc ].join('|');
// Experimental. List of chars, completely prohibited in links
// because can separate it from other part of text
var text_separators = '[><\uff5c]';
// All possible word characters (everything without punctuation, spaces & controls)
// Defined via punctuation & spaces to save space
// Should be something like \p{\L\N\S\M} (\w but without `_`)
re.src_pseudo_letter = '(?:(?!' + text_separators + '|' + re.src_ZPCc + ')' + re.src_Any + ')';
// The same as abothe but without [0-9]
// var src_pseudo_letter_non_d = '(?:(?![0-9]|' + src_ZPCc + ')' + src_Any + ')';
////////////////////////////////////////////////////////////////////////////////
re.src_ip4 =
'(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
// Prohibit any of "@/[]()" in user/pass to avoid wrong domain fetch.
re.src_auth = '(?:(?:(?!' + re.src_ZCc + '|[@/\\[\\]()]).)+@)?';
re.src_port =
'(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?';
re.src_host_terminator =
'(?=$|' + text_separators + '|' + re.src_ZPCc + ')(?!-|_|:\\d|\\.-|\\.(?!$|' + re.src_ZPCc + '))';
re.src_path =
'(?:' +
'[/?#]' +
'(?:' +
'(?!' + re.src_ZCc + '|' + text_separators + '|[()[\\]{}.,"\'?!\\-;]).|' +
'\\[(?:(?!' + re.src_ZCc + '|\\]).)*\\]|' +
'\\((?:(?!' + re.src_ZCc + '|[)]).)*\\)|' +
'\\{(?:(?!' + re.src_ZCc + '|[}]).)*\\}|' +
'\\"(?:(?!' + re.src_ZCc + '|["]).)+\\"|' +
"\\'(?:(?!" + re.src_ZCc + "|[']).)+\\'|" +
"\\'(?=" + re.src_pseudo_letter + '|[-]).|' + // allow `I'm_king` if no pair found
'\\.{2,}[a-zA-Z0-9%/&]|' + // google has many dots in "google search" links (#66, #81).
// github has ... in commit range links,
// Restrict to
// - english
// - percent-encoded
// - parts of file path
// - params separator
// until more examples found.
'\\.(?!' + re.src_ZCc + '|[.]).|' +
(opts && opts['---'] ?
'\\-(?!--(?:[^-]|$))(?:-*)|' // `---` => long dash, terminate
:
'\\-+|'
) +
',(?!' + re.src_ZCc + ').|' + // allow `,,,` in paths
';(?!' + re.src_ZCc + ').|' + // allow `;` if not followed by space-like char
'\\!+(?!' + re.src_ZCc + '|[!]).|' + // allow `!!!` in paths, but not at the end
'\\?(?!' + re.src_ZCc + '|[?]).' +
')+' +
'|\\/' +
')?';
// Allow anything in markdown spec, forbid quote (") at the first position
// because emails enclosed in quotes are far more common
re.src_email_name =
'[\\-;:&=\\+\\$,\\.a-zA-Z0-9_][\\-;:&=\\+\\$,\\"\\.a-zA-Z0-9_]*';
re.src_xn =
'xn--[a-z0-9\\-]{1,59}';
// More to read about domain names
// http://serverfault.com/questions/638260/
re.src_domain_root =
// Allow letters & digits (http://test1)
'(?:' +
re.src_xn +
'|' +
re.src_pseudo_letter + '{1,63}' +
')';
re.src_domain =
'(?:' +
re.src_xn +
'|' +
'(?:' + re.src_pseudo_letter + ')' +
'|' +
'(?:' + re.src_pseudo_letter + '(?:-|' + re.src_pseudo_letter + '){0,61}' + re.src_pseudo_letter + ')' +
')';
re.src_host =
'(?:' +
// Don't need IP check, because digits are already allowed in normal domain names
// src_ip4 +
// '|' +
'(?:(?:(?:' + re.src_domain + ')\\.)*' + re.src_domain/*_root*/ + ')' +
')';
re.tpl_host_fuzzy =
'(?:' +
re.src_ip4 +
'|' +
'(?:(?:(?:' + re.src_domain + ')\\.)+(?:%TLDS%))' +
')';
re.tpl_host_no_ip_fuzzy =
'(?:(?:(?:' + re.src_domain + ')\\.)+(?:%TLDS%))';
re.src_host_strict =
re.src_host + re.src_host_terminator;
re.tpl_host_fuzzy_strict =
re.tpl_host_fuzzy + re.src_host_terminator;
re.src_host_port_strict =
re.src_host + re.src_port + re.src_host_terminator;
re.tpl_host_port_fuzzy_strict =
re.tpl_host_fuzzy + re.src_port + re.src_host_terminator;
re.tpl_host_port_no_ip_fuzzy_strict =
re.tpl_host_no_ip_fuzzy + re.src_port + re.src_host_terminator;
////////////////////////////////////////////////////////////////////////////////
// Main rules
// Rude test fuzzy links by host, for quick deny
re.tpl_host_fuzzy_test =
'localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:' + re.src_ZPCc + '|>|$))';
re.tpl_email_fuzzy =
'(^|' + text_separators + '|"|\\(|' + re.src_ZCc + ')' +
'(' + re.src_email_name + '@' + re.tpl_host_fuzzy_strict + ')';
re.tpl_link_fuzzy =
// Fuzzy link can't be prepended with .:/\- and non punctuation.
// but can start with > (markdown blockquote)
'(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|' + re.src_ZPCc + '))' +
'((?![$+<=>^`|\uff5c])' + re.tpl_host_port_fuzzy_strict + re.src_path + ')';
re.tpl_link_no_ip_fuzzy =
// Fuzzy link can't be prepended with .:/\- and non punctuation.
// but can start with > (markdown blockquote)
'(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|' + re.src_ZPCc + '))' +
'((?![$+<=>^`|\uff5c])' + re.tpl_host_port_no_ip_fuzzy_strict + re.src_path + ')';
return re;
};
|