/**
 * Split a list of possibly-overlapping features into non-overlapping facets.
 *
 * Every feature contributes two edges: one at its start and one at its end.
 * The edges are visited in ascending position order while a set of currently
 * open features is maintained; each stretch between two consecutive edge
 * positions that is covered by at least one feature becomes one facet.
 *
 * @param {Iterable<{start: number, end: number}>} features - ranges to merge.
 *   Features are tracked by object identity, so each object should be passed
 *   only once.
 * @returns {Array<{range: number[], features: object[]}>} facets in ascending
 *   order, each holding its `[from, to]` range and the features open over it.
 *   Stretches covered by no feature produce no facet.
 */
export function merge_features_to_facets(features) {
	// The edges carry no start/end tag: the first edge seen for a feature
	// opens it and the second one closes it (see the toggle below).
	const edges = []
	for (const feature of features) {
		edges.push([feature.start, feature])
		edges.push([feature.end, feature])
	}
	// Array.prototype.sort is stable, so a feature's start edge stays ahead of
	// its own end edge even when both sit at the same position.
	edges.sort(([a], [b]) => a - b)

	const open = new Set()
	const facets = []
	let last = 0
	for (const [index, feature] of edges) {
		// Skip zero-width stretches and gaps where nothing is open.
		if (index > last && open.size) {
			facets.push({
				range: [last, index],
				features: [...open],
			})
		}
		// Toggle membership: close the feature if it was open, else open it.
		if (!open.delete(feature)) {
			open.add(feature)
		}
		last = index
	}
	return facets
}
26292727-spans = [
3030+/*spans = [
2831 {type:"abc", start: 1, end: 5},
2932 {type:"def", start: 3, end: 5},
3033 {type:"zzz", start: 6, end: 10},
···3841 {start: 3, end: 5, types: ["abc","def"]},
3942 {start: 6, end: 10, types: ["zzz"]},
4043]
4444+*/
+37-46
parse.js
···11-class MarkupLanguage {
22- big_regex = /(?:)/g
33- constructor() {}
44- process_match(match) {
55- return [{text:match[0], features:[]}]
11+export class MarkupParser {
22+ constructor(big_regex, process_match) {
33+ this.process_match = process_match
44+ this.big_regex = big_regex
65 }
76 text_to_segments(text) {
87 let segments = []
···2120 }
2221}

// Building blocks of the markup grammar. They are written with String.raw so
// that regex escapes such as \s and \u00AD reach the RegExp constructor intact.
const r = String.raw
// Characters that terminate a hashtag: whitespace plus a few invisible/format characters.
const whitespace = r`\s\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2`
// Characters allowed inside a URL, and the narrower set allowed as its last
// character (so trailing punctuation like "." or "," is not swallowed).
const url = r`[-\w/%&=#+~@$*'!?,.;:]*`
const url_final = r`[-\w/%&=#+~@$*']`
// One global, unicode-aware alternation. Exactly one of the named groups
// `link` (with optional `link_text` in square brackets), `hashtag` or `mention`
// is set on each match.
const big_regex = new RegExp([
	r`\b(?<link>https?://${url}${url_final}([(]${url}[)](${url}${url_final})?)?)(?:\[(?<link_text>.*?)\])?`,
	r`(?<=^|\s)[##](?<hashtag>(?!\uFE0F)[^${whitespace}]*[^\p{P}${whitespace}])`, // note: must filter out #123
	r`(?<=^|\s|[(])@(?<mention>[-a-zA-Z0-9]+([.][-a-zA-Z0-9]+)+)\b`,
].join("|"), 'gu')
/**
 * Turn one regex match into a list of text segments with rich-text features.
 *
 * Reads the named groups `mention`, `link`, `link_text` and `hashtag` from
 * the match and returns a single segment describing it.
 *
 * @param {RegExpMatchArray} match - a match whose `groups` may contain
 *   `mention`, `link`, `link_text` and `hashtag`.
 * @returns {Array<{text: string, features: object[]}>} one segment; its
 *   `features` array is empty when the match is not treated as markup.
 */
function process_match(match) {
	const g = match.groups
	if (g.mention != null) {
		return [{text: match[0], features: [{
			$type: 'app.bsky.richtext.facet#mention',
			did: g.mention, // hack, DID must be resolved afterwards
		}]}]
	}
	if (g.link != null) {
		// A non-empty [link text] replaces the displayed text; otherwise the
		// whole match is shown as-is.
		return [{text: g.link_text || match[0], features: [{
			$type: 'app.bsky.richtext.facet#link',
			uri: g.link,
		}]}]
	}
	// Purely numeric tags such as "#123" are not hashtags. The captured group
	// excludes the leading "#", so the digits are tested directly; the optional
	// "#" keeps rejecting a doubled marker like "##123".
	if (g.hashtag != null && !/^#?\d+$/.test(g.hashtag)) {
		return [{text: match[0], features: [{
			$type: 'app.bsky.richtext.facet#tag',
			tag: g.hashtag,
		}]}]
	}
	return [{text: match[0], features: []}]
}
6666-6767-let text = "abc http://example.com[test] #hello @test.com.xyz"
6868-6969-console.log(segments)
7070-</script>
// Ready-made parser wired to this module's big_regex and process_match.
// Exported as a constant binding so importers cannot observe it being reassigned.
export const markup1 = new MarkupParser(big_regex, process_match)
+8
test2.html
<!doctype html>

<script type=module>
	// Manual smoke test: run the exported markup1 parser over a sample string
	// and print the resulting segments to the console.
	import {markup1} from './parse.js'
	const sample = "abc http://example.com[test] #hello @test.com.xyz"
	const segments = markup1.text_to_segments(sample)
	console.log(segments)
</script>