@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.)
hq.recaptime.dev/wiki/Phorge
phorge
phabricator
1<?php
2
/**
 * Remarkup rule which recognizes URIs in text and renders them as hyperlinks.
 *
 * Three spellings are recognized:
 *
 *   - `<uri>`: linked exactly as written, without the angle brackets.
 *   - `{uri}`: matched, but rendered as plain text unless a hyperlink engine
 *     extension produces a result for it.
 *   - bare URIs: linked after trailing punctuation and an unbalanced closing
 *     parenthesis have been trimmed off.
 *
 * The rule works in two passes. @{method:apply} swaps every recognized URI
 * for a storage token and records it in the engine's text metadata under
 * KEY_HYPERLINKS. @{method:didMarkupText} later walks that list and
 * overwrites each token with its final rendering.
 */
final class PhutilRemarkupHyperlinkRule extends PhutilRemarkupRule {

  /**
   * Text metadata key under which the list of matched links is recorded.
   */
  const KEY_HYPERLINKS = 'hyperlinks';

  /**
   * @return float Priority value of this rule.
   */
  public function getPriority() {
    return 400.0;
  }

  /**
   * Replace every hyperlink found in the text with a storage token.
   *
   * @param string $text Text to scan.
   * @return string Text with each recognized URI replaced by a token.
   */
  public function apply($text) {
    // The patterns are identical for every call, so build them only once.
    static $angle_pattern;
    static $curly_pattern;
    static $bare_pattern;

    if ($angle_pattern === null) {
      // See T13608. A previous version of this code matched bare URIs
      // starting with "\w{3,}", which can take a very long time to match
      // against long inputs.
      //
      // Use a protocol length limit in all patterns for general sanity,
      // and a negative lookbehind in the bare pattern to avoid explosive
      // complexity during expression evaluation.

      $protocol_fragment = '\w{3,32}';
      $uri_fragment = '[^\s'.PhutilRemarkupBlockStorage::MAGIC_BYTE.']+';

      $angle_pattern = sprintf(
        '(<(%s://%s?)>)',
        $protocol_fragment,
        $uri_fragment);

      $curly_pattern = sprintf(
        '({(%s://%s?)})',
        $protocol_fragment,
        $uri_fragment);

      $bare_pattern = sprintf(
        '((?<!\w)%s://%s)',
        $protocol_fragment,
        $uri_fragment);
    }

    // Hyperlinks with explicit "<>" around them get linked exactly, without
    // the "<>". Angle brackets are basically special and mean "this is a URL
    // with weird characters". This is assumed to be reasonable because they
    // don't appear in most normal text or most normal URLs.
    $text = preg_replace_callback(
      $angle_pattern,
      array($this, 'markupHyperlinkAngle'),
      $text);

    // We match "{uri}", but do not link it by default.
    $text = preg_replace_callback(
      $curly_pattern,
      array($this, 'markupHyperlinkCurly'),
      $text);

    // Anything else we match "ungreedily", which means we'll look for
    // stuff that's probably punctuation or otherwise not part of the URL and
    // not link it. This lets someone write "Quick! Go to
    // https://www.example.com/!". We also apply some paren balancing rules.

    // NOTE: We're explicitly avoiding capturing stored blocks, so text like
    // `https://www.example.com/[[x | y]]` doesn't get aggressively captured.

    $text = preg_replace_callback(
      $bare_pattern,
      array($this, 'markupHyperlinkUngreedy'),
      $text);

    return $text;
  }

  /**
   * Callback for URIs written as `<uri>`.
   *
   * @param array $matches Regular expression match; index 1 is the URI.
   * @return string Replacement text for the match.
   */
  public function markupHyperlinkAngle(array $matches) {
    return $this->markupHyperlink('<', $matches);
  }

  /**
   * Callback for URIs written as `{uri}`.
   *
   * @param array $matches Regular expression match; index 1 is the URI.
   * @return string Replacement text for the match.
   */
  public function markupHyperlinkCurly(array $matches) {
    return $this->markupHyperlink('{', $matches);
  }

  /**
   * Store a matched URI in the engine and record it for later rendering.
   *
   * @param string|null $mode How the URI was written: '<', '{', or null for
   *   a bare URI.
   * @param array $matches Match data; index 0 is the full matched text and
   *   index 1 is the URI itself.
   * @return string Token in the format <0x01>1234Z.
   *   See @{class:PhutilRemarkupBlockStorage} for details. If the URI can
   *   not be parsed, the full matched text is returned unchanged instead.
   */
  protected function markupHyperlink($mode, array $matches) {
    $raw_uri = $matches[1];

    // Constructing the URI object is purely a validity check: leave text
    // we can not parse alone.
    try {
      new PhutilURI($raw_uri);
    } catch (Exception $ex) {
      return $matches[0];
    }

    $engine = $this->getEngine();

    $token = $engine->storeText($raw_uri);

    $list_key = self::KEY_HYPERLINKS;
    $link_list = $engine->getTextMetadata($list_key, array());

    $link_list[] = array(
      'token' => $token,
      'uri' => $raw_uri,
      'mode' => $mode,
    );

    $engine->setTextMetadata($list_key, $link_list);

    return $token;
  }

  /**
   * Render a URI as an anchor tag, or as raw text if it is an embed.
   *
   * @param string $link URI to render.
   * @param bool $is_embed True if the URI was written as `{uri}`.
   * @return wild Rendered link: an `<a>` tag built with phutil_tag(), or
   *   the raw `{uri}` string for embeds.
   */
  protected function renderHyperlink($link, $is_embed) {
    // If the URI is "{uri}" and no handler picked it up, we just render it
    // as plain text.
    if ($is_embed) {
      return $this->renderRawLink($link, $is_embed);
    }

    $engine = $this->getEngine();

    $uri = new PhutilURIHelper($link);
    $is_self = $uri->isSelf();

    // The "uri.same-window" setting decides whether the link gets a
    // "_blank" target; $is_self is passed as the second argument to
    // getConfig() (presumably the default when the setting is absent).
    $same_window = $engine->getConfig('uri.same-window', $is_self);
    if ($same_window) {
      $target = null;
    } else {
      $target = '_blank';
    }

    return phutil_tag(
      'a',
      array(
        'href' => $link,
        'class' => $this->getRemarkupLinkClass($is_self),
        'target' => $target,
        'rel' => 'noreferrer',
      ),
      $link);
  }

  /**
   * Render a URI as plain text, restoring the braces of a `{uri}` embed.
   *
   * @param string $link URI to render.
   * @param bool $is_embed True if the URI was written as `{uri}`.
   * @return string Plain text rendering of the URI.
   */
  private function renderRawLink($link, $is_embed) {
    if ($is_embed) {
      return '{'.$link.'}';
    } else {
      return $link;
    }
  }

  /**
   * Callback for bare URIs.
   *
   * Trailing characters which are probably not part of the URI (sentence
   * punctuation and an unbalanced closing parenthesis) are split off and
   * emitted after the link instead of inside it.
   *
   * @param array $matches Regular expression match; index 0 is the matched
   *   text.
   * @return wild Replacement for the match: the link token followed by any
   *   trimmed tail, or the original text if the URI can not be parsed.
   */
  protected function markupHyperlinkUngreedy($matches) {
    $punctuation_pattern = '/[;,.:!?]+$/';

    $match = $matches[0];
    $tail = null;
    $trailing = null;
    if (preg_match($punctuation_pattern, $match, $trailing)) {
      $tail = $trailing[0];
      $match = substr($match, 0, -strlen($tail));
    }

    // If there's a closing paren at the end but no balancing open paren in
    // the URL, don't link the close paren. This is an attempt to gracefully
    // handle the two common paren cases, Wikipedia links and English language
    // parentheticals, e.g.:
    //
    //   https://en.wikipedia.org/wiki/Noun_(disambiguation)
    //   (see also https://www.example.com)
    //
    // We could apply a craftier heuristic here which tries to actually balance
    // the parens, but this is probably sufficient.
    if (preg_match('/\\)$/', $match) && !preg_match('/\\(/', $match)) {
      $tail = ')'.$tail;
      $match = substr($match, 0, -1);

      // Removing the paren may expose punctuation which sat inside it, as
      // in "(see https://www.example.com.)". That punctuation is no more
      // part of the URI than punctuation after the paren, so trim it too.
      if (preg_match($punctuation_pattern, $match, $trailing)) {
        $tail = $trailing[0].$tail;
        $match = substr($match, 0, -strlen($trailing[0]));
      }
    }

    // Constructing the URI object is purely a validity check: if the
    // trimmed text is not a parseable URI, leave the original text alone.
    try {
      new PhutilURI($match);
    } catch (Exception $ex) {
      return $matches[0];
    }

    $link = $this->markupHyperlink(null, array(null, $match));

    return hsprintf('%s%s', $link, $tail);
  }

  /**
   * Replace every stored link token with its final rendering.
   *
   * Links are rendered as raw text in table-of-contents and plain text
   * modes, or when their protocol is not in "uri.allowed-protocols".
   * `<uri>` links and all links in HTML mail mode are rendered directly.
   * Everything else is first offered to the hyperlink engine extensions,
   * and rendered normally if no extension produces a result.
   *
   * @return void
   */
  public function didMarkupText() {
    $engine = $this->getEngine();

    $protocols = $engine->getConfig('uri.allowed-protocols', array());
    $is_toc = $engine->getState('toc');
    $is_text = $engine->isTextMode();
    $is_mail = $engine->isHTMLMailMode();

    $list_key = self::KEY_HYPERLINKS;
    $raw_list = $engine->getTextMetadata($list_key, array());

    $links = array();
    foreach ($raw_list as $key => $link) {
      $token = $link['token'];
      $raw_uri = $link['uri'];
      $mode = $link['mode'];

      $is_embed = ($mode === '{');
      $is_literal = ($mode === '<');

      // If we're rendering in a "Table of Contents" or a plain text mode,
      // we're going to render the raw URI without modifications.
      if ($is_toc || $is_text) {
        $result = $this->renderRawLink($raw_uri, $is_embed);
        $engine->overwriteStoredText($token, $result);
        continue;
      }

      // If this URI doesn't use a whitelisted protocol, don't link it. This
      // is primarily intended to prevent "javascript://" silliness.
      $uri = new PhutilURI($raw_uri);
      $protocol = $uri->getProtocol();
      $valid_protocol = idx($protocols, $protocol);
      if (!$valid_protocol) {
        $result = $this->renderRawLink($raw_uri, $is_embed);
        $engine->overwriteStoredText($token, $result);
        continue;
      }

      // If the URI is written as "<uri>", we'll render it literally even if
      // some handler would otherwise deal with it.
      // If we're rendering for HTML mail, we also render literally.
      if ($is_literal || $is_mail) {
        $result = $this->renderHyperlink($raw_uri, $is_embed);
        $engine->overwriteStoredText($token, $result);
        continue;
      }

      // Otherwise, this link is a valid resource which extensions are allowed
      // to handle.
      $links[$key] = $link;
    }

    if (!$links) {
      return;
    }

    foreach ($links as $key => $link) {
      $links[$key] = new PhutilRemarkupHyperlinkRef($link);
    }

    $extensions = PhutilRemarkupHyperlinkEngineExtension::getAllLinkEngines();
    foreach ($extensions as $extension) {
      // Work on a clone so the engine we set does not stick to the shared
      // extension object.
      id(clone $extension)
        ->setEngine($engine)
        ->processHyperlinks($links);

      // Any link which now has a result has been handled: write the result
      // out and stop offering the link to later extensions.
      foreach ($links as $key => $link) {
        $result = $link->getResult();
        if ($result !== null) {
          $engine->overwriteStoredText($link->getToken(), $result);
          unset($links[$key]);
        }
      }

      if (!$links) {
        break;
      }
    }

    // Render any remaining links in a normal way.
    foreach ($links as $link) {
      $result = $this->renderHyperlink($link->getURI(), $link->isEmbed());
      $engine->overwriteStoredText($link->getToken(), $result);
    }
  }

}