Replace linkRegex with xurls library (#6261)

* Replace linkRegex with xurls library

Rather than maintaining a complicated regex to match URLs for
autolinking, gitea can use this existing go library that takes care of
the matching with very little code change to gitea itself. After
spending a while trying to find the perfect regex for all cases this library
still works better as it is more flexible than a single regex ever will be.

This will also fix the following issues: #5844 #3095 #3381

This passes all our current tests and I've added new ones mentioned in
those issues as well.

* Use xurls.StrictMatchingScheme instead of xurls.Strict

This is much faster and we only care about https? links to preserve
existing behavior.
release/v1.8
mrsdizzie 5 years ago committed by techknowlogick
parent 01bd1fcd33
commit f2de5dc8c8

9
Gopkg.lock generated

@ -725,6 +725,14 @@
pruneopts = "NUT"
revision = "02ccfbfaf0cc627aa3aec8ef7ed5cfeec5b43f63"
[[projects]]
digest = "1:63953ffb90bbc880c612d576fcfd973a5904277d25ec9e2d8d5719bf67969662"
name = "github.com/mvdan/xurls"
packages = ["."]
pruneopts = "NUT"
revision = "e52e821cbfe8fe163ff6f8628ab5869b11fc05af"
version = "v2.0.0"
[[projects]]
digest = "1:2be1d891535ce3d6d2a3db9087f07415e909744e9eff1a30f8f0b2519df60ae6"
name = "github.com/nfnt/resize"
@ -1293,6 +1301,7 @@
"github.com/mcuadros/go-version",
"github.com/microcosm-cc/bluemonday",
"github.com/msteinert/pam",
"github.com/mvdan/xurls",
"github.com/nfnt/resize",
"github.com/pquerna/otp",
"github.com/pquerna/otp/totp",

@ -113,3 +113,7 @@ ignored = ["google.golang.org/appengine*"]
[[constraint]]
name = "github.com/prometheus/client_golang"
version = "0.9.0"
[[constraint]]
name = "github.com/mvdan/xurls"
version = "2.0.0"

@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/util"
"github.com/Unknwon/com"
"github.com/mvdan/xurls"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
@ -64,9 +65,7 @@ var (
// https://html.spec.whatwg.org/multipage/input.html#e-mail-state-(type%3Demail)
emailRegex = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
// matches http/https links. used for autlinking those. partly modified from
// the original present in autolink.js
linkRegex = regexp.MustCompile(`(?:(?:http|https):\/\/(?:[\-;:&=\+\$,\w]+@)?[A-Za-z0-9\.\-]+(?:\.|[\-;:&=\+\$,\w]+@)[A-Za-z0-9\.\-]+)(?:(?:\/[\+~%\/\.\w\-]*)?\??(?:[\-\+:=&;%@\.\w]*)#?(?:[\.\!\/\\\w]*))?`)
linkRegex, _ = xurls.StrictMatchingScheme("https?://")
)
// regexp for full links to issues/pulls

@ -104,6 +104,15 @@ func TestRender_links(t *testing.T) {
test(
"http://142.42.1.1/",
`<p><a href="http://142.42.1.1/" rel="nofollow">http://142.42.1.1/</a></p>`)
test(
"https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd",
`<p><a href="https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd" rel="nofollow">https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd</a></p>`)
test(
"https://en.wikipedia.org/wiki/URL_(disambiguation)",
`<p><a href="https://en.wikipedia.org/wiki/URL_(disambiguation)" rel="nofollow">https://en.wikipedia.org/wiki/URL_(disambiguation)</a></p>`)
test(
"https://foo_bar.example.com/",
`<p><a href="https://foo_bar.example.com/" rel="nofollow">https://foo_bar.example.com/</a></p>`)
// Test that should *not* be turned into URL
test(

@ -0,0 +1,27 @@
Copyright (c) 2015, Daniel Martí. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@ -0,0 +1,299 @@
// Generated by schemesgen
package xurls
// Schemes is a sorted list of all IANA assigned schemes.
//
// Source:
// https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv
var Schemes = []string{
`aaa`,
`aaas`,
`about`,
`acap`,
`acct`,
`acr`,
`adiumxtra`,
`afp`,
`afs`,
`aim`,
`appdata`,
`apt`,
`attachment`,
`aw`,
`barion`,
`beshare`,
`bitcoin`,
`bitcoincash`,
`blob`,
`bolo`,
`browserext`,
`callto`,
`cap`,
`chrome`,
`chrome-extension`,
`cid`,
`coap`,
`coap+tcp`,
`coap+ws`,
`coaps`,
`coaps+tcp`,
`coaps+ws`,
`com-eventbrite-attendee`,
`content`,
`conti`,
`crid`,
`cvs`,
`data`,
`dav`,
`diaspora`,
`dict`,
`did`,
`dis`,
`dlna-playcontainer`,
`dlna-playsingle`,
`dns`,
`dntp`,
`dtn`,
`dvb`,
`ed2k`,
`elsi`,
`example`,
`facetime`,
`fax`,
`feed`,
`feedready`,
`file`,
`filesystem`,
`finger`,
`fish`,
`ftp`,
`geo`,
`gg`,
`git`,
`gizmoproject`,
`go`,
`gopher`,
`graph`,
`gtalk`,
`h323`,
`ham`,
`hcap`,
`hcp`,
`http`,
`https`,
`hxxp`,
`hxxps`,
`hydrazone`,
`iax`,
`icap`,
`icon`,
`im`,
`imap`,
`info`,
`iotdisco`,
`ipn`,
`ipp`,
`ipps`,
`irc`,
`irc6`,
`ircs`,
`iris`,
`iris.beep`,
`iris.lwz`,
`iris.xpc`,
`iris.xpcs`,
`isostore`,
`itms`,
`jabber`,
`jar`,
`jms`,
`keyparc`,
`lastfm`,
`ldap`,
`ldaps`,
`lvlt`,
`magnet`,
`mailserver`,
`mailto`,
`maps`,
`market`,
`message`,
`microsoft.windows.camera`,
`microsoft.windows.camera.multipicker`,
`microsoft.windows.camera.picker`,
`mid`,
`mms`,
`modem`,
`mongodb`,
`moz`,
`ms-access`,
`ms-browser-extension`,
`ms-drive-to`,
`ms-enrollment`,
`ms-excel`,
`ms-gamebarservices`,
`ms-gamingoverlay`,
`ms-getoffice`,
`ms-help`,
`ms-infopath`,
`ms-inputapp`,
`ms-lockscreencomponent-config`,
`ms-media-stream-id`,
`ms-mixedrealitycapture`,
`ms-officeapp`,
`ms-people`,
`ms-project`,
`ms-powerpoint`,
`ms-publisher`,
`ms-restoretabcompanion`,
`ms-screenclip`,
`ms-screensketch`,
`ms-search`,
`ms-search-repair`,
`ms-secondary-screen-controller`,
`ms-secondary-screen-setup`,
`ms-settings`,
`ms-settings-airplanemode`,
`ms-settings-bluetooth`,
`ms-settings-camera`,
`ms-settings-cellular`,
`ms-settings-cloudstorage`,
`ms-settings-connectabledevices`,
`ms-settings-displays-topology`,
`ms-settings-emailandaccounts`,
`ms-settings-language`,
`ms-settings-location`,
`ms-settings-lock`,
`ms-settings-nfctransactions`,
`ms-settings-notifications`,
`ms-settings-power`,
`ms-settings-privacy`,
`ms-settings-proximity`,
`ms-settings-screenrotation`,
`ms-settings-wifi`,
`ms-settings-workplace`,
`ms-spd`,
`ms-sttoverlay`,
`ms-transit-to`,
`ms-useractivityset`,
`ms-virtualtouchpad`,
`ms-visio`,
`ms-walk-to`,
`ms-whiteboard`,
`ms-whiteboard-cmd`,
`ms-word`,
`msnim`,
`msrp`,
`msrps`,
`mtqp`,
`mumble`,
`mupdate`,
`mvn`,
`news`,
`nfs`,
`ni`,
`nih`,
`nntp`,
`notes`,
`ocf`,
`oid`,
`onenote`,
`onenote-cmd`,
`opaquelocktoken`,
`openpgp4fpr`,
`pack`,
`palm`,
`paparazzi`,
`pkcs11`,
`platform`,
`pop`,
`pres`,
`prospero`,
`proxy`,
`pwid`,
`psyc`,
`qb`,
`query`,
`redis`,
`rediss`,
`reload`,
`res`,
`resource`,
`rmi`,
`rsync`,
`rtmfp`,
`rtmp`,
`rtsp`,
`rtsps`,
`rtspu`,
`secondlife`,
`service`,
`session`,
`sftp`,
`sgn`,
`shttp`,
`sieve`,
`simpleledger`,
`sip`,
`sips`,
`skype`,
`smb`,
`sms`,
`smtp`,
`snews`,
`snmp`,
`soap.beep`,
`soap.beeps`,
`soldat`,
`spiffe`,
`spotify`,
`ssh`,
`steam`,
`stun`,
`stuns`,
`submit`,
`svn`,
`tag`,
`teamspeak`,
`tel`,
`teliaeid`,
`telnet`,
`tftp`,
`things`,
`thismessage`,
`tip`,
`tn3270`,
`tool`,
`turn`,
`turns`,
`tv`,
`udp`,
`unreal`,
`urn`,
`ut2004`,
`v-event`,
`vemmi`,
`ventrilo`,
`videotex`,
`vnc`,
`view-source`,
`wais`,
`webcal`,
`wpid`,
`ws`,
`wss`,
`wtai`,
`wyciwyg`,
`xcon`,
`xcon-userid`,
`xfire`,
`xmlrpc.beep`,
`xmlrpc.beeps`,
`xmpp`,
`xri`,
`ymsgr`,
`z39.50`,
`z39.50r`,
`z39.50s`,
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,24 @@
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
package xurls
// PseudoTLDs is a sorted list of some widely used unofficial TLDs.
//
// Sources:
// * https://en.wikipedia.org/wiki/Pseudo-top-level_domain
// * https://en.wikipedia.org/wiki/Category:Pseudo-top-level_domains
// * https://tools.ietf.org/html/draft-grothoff-iesg-special-use-p2p-names-00
// * https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml
var PseudoTLDs = []string{
`bit`, // Namecoin
`example`, // Example domain
`exit`, // Tor exit node
`gnu`, // GNS by public key
`i2p`, // I2P network
`invalid`, // Invalid domain
`local`, // Local network
`localhost`, // Local network
`test`, // Test domain
`zkey`, // GNS domain name
}

@ -0,0 +1,107 @@
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
// Package xurls extracts urls from plain text using regular expressions.
package xurls
import (
"bytes"
"regexp"
)
//go:generate go run generate/tldsgen/main.go
//go:generate go run generate/schemesgen/main.go
const (
letter = `\p{L}`
mark = `\p{M}`
number = `\p{N}`
iriChar = letter + mark + number
currency = `\p{Sc}`
otherSymb = `\p{So}`
endChar = iriChar + `/\-+_&~*%=#` + currency + otherSymb
otherPunc = `\p{Po}`
midChar = endChar + `|` + otherPunc
wellParen = `\([` + midChar + `]*(\([` + midChar + `]*\)[` + midChar + `]*)*\)`
wellBrack = `\[[` + midChar + `]*(\[[` + midChar + `]*\][` + midChar + `]*)*\]`
wellBrace = `\{[` + midChar + `]*(\{[` + midChar + `]*\}[` + midChar + `]*)*\}`
wellAll = wellParen + `|` + wellBrack + `|` + wellBrace
pathCont = `([` + midChar + `]*(` + wellAll + `|[` + endChar + `])+)+`
iri = `[` + iriChar + `]([` + iriChar + `\-]*[` + iriChar + `])?`
domain = `(` + iri + `\.)+`
octet = `(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
ipv4Addr = `\b` + octet + `\.` + octet + `\.` + octet + `\.` + octet + `\b`
ipv6Addr = `([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:[0-9a-fA-F]{0,4}|:[0-9a-fA-F]{1,4})?|(:[0-9a-fA-F]{1,4}){0,2})|(:[0-9a-fA-F]{1,4}){0,3})|(:[0-9a-fA-F]{1,4}){0,4})|:(:[0-9a-fA-F]{1,4}){0,5})((:[0-9a-fA-F]{1,4}){2}|:(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})|(([0-9a-fA-F]{1,4}:){1,6}|:):[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){7}:`
ipAddr = `(` + ipv4Addr + `|` + ipv6Addr + `)`
port = `(:[0-9]*)?`
)
// AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
// scheme, and not just the known ones.
var AnyScheme = `([a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)`
// SchemesNoAuthority is a sorted list of some well-known url schemes that are
// followed by ":" instead of "://".
var SchemesNoAuthority = []string{
`bitcoin`, // Bitcoin
`file`, // Files
`magnet`, // Torrent magnets
`mailto`, // Mail
`sms`, // SMS
`tel`, // Telephone
`xmpp`, // XMPP
}
func anyOf(strs ...string) string {
var b bytes.Buffer
b.WriteByte('(')
for i, s := range strs {
if i != 0 {
b.WriteByte('|')
}
b.WriteString(regexp.QuoteMeta(s))
}
b.WriteByte(')')
return b.String()
}
func strictExp() string {
schemes := `(` + anyOf(Schemes...) + `://|` + anyOf(SchemesNoAuthority...) + `:)`
return `(?i)` + schemes + `(?-i)` + pathCont
}
func relaxedExp() string {
site := domain + `(?i)` + anyOf(append(TLDs, PseudoTLDs...)...) + `(?-i)`
hostName := `(` + site + `|` + ipAddr + `)`
webURL := hostName + port + `(/|/` + pathCont + `?|\b|$)`
return strictExp() + `|` + webURL
}
// Strict produces a regexp that matches any URL with a scheme in either the
// Schemes or SchemesNoAuthority lists.
func Strict() *regexp.Regexp {
re := regexp.MustCompile(strictExp())
re.Longest()
return re
}
// Relaxed produces a regexp that matches any URL matched by Strict, plus any
// URL with no scheme.
func Relaxed() *regexp.Regexp {
re := regexp.MustCompile(relaxedExp())
re.Longest()
return re
}
// StrictMatchingScheme produces a regexp similar to Strict, but requiring that
// the scheme match the given regular expression. See AnyScheme too.
func StrictMatchingScheme(exp string) (*regexp.Regexp, error) {
strictMatching := `(?i)(` + exp + `)(?-i)` + pathCont
re, err := regexp.Compile(strictMatching)
if err != nil {
return nil, err
}
re.Longest()
return re, nil
}
Loading…
Cancel
Save