Skip to content

♻️ Refactor: Add linkifier rule to inline chain for full links #279

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions markdown_it/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,3 +304,15 @@ def normalizeReference(string: str) -> str:
# most notably, `__proto__`)
#
return string.lower().upper()


# Both patterns are anchored with ``^`` and compiled case-insensitively, so
# they only recognise a tag that sits at the very beginning of the text.
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
    """Return True when ``string`` begins with an opening ``<a`` tag.

    The ``<a`` must be followed by ``>`` or whitespace, so tags such as
    ``<abbr>`` are not treated as links.
    """
    return LINK_OPEN_RE.search(string) is not None


def isLinkClose(string: str) -> bool:
    """Return True when ``string`` begins with a closing ``</a>`` tag.

    Whitespace between ``a`` and ``>`` is tolerated.
    """
    return LINK_CLOSE_RE.search(string) is not None
1 change: 1 addition & 0 deletions markdown_it/parser_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# Parser rules
_rules: list[tuple[str, RuleFunc]] = [
("text", rules_inline.text),
("linkify", rules_inline.linkify),
("newline", rules_inline.newline),
("escape", rules_inline.escape),
("backticks", rules_inline.backtick),
Expand Down
2 changes: 1 addition & 1 deletion markdown_it/presets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def make() -> PresetType:
config = commonmark.make()
config["components"]["core"]["rules"].append("linkify")
config["components"]["block"]["rules"].append("table")
config["components"]["inline"]["rules"].append("strikethrough")
config["components"]["inline"]["rules"].extend(["strikethrough", "linkify"])
config["components"]["inline"]["rules2"].append("strikethrough")
config["options"]["linkify"] = True
config["options"]["html"] = True
Expand Down
66 changes: 37 additions & 29 deletions markdown_it/rules_core/linkify.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,32 @@
from __future__ import annotations

import re
from typing import Protocol

from ..common.utils import arrayReplaceAt
from ..common.utils import arrayReplaceAt, isLinkClose, isLinkOpen
from ..token import Token
from .state_core import StateCore

LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)

HTTP_RE = re.compile(r"^http://")
MAILTO_RE = re.compile(r"^mailto:")
TEST_MAILTO_RE = re.compile(r"^mailto:", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
return bool(LINK_OPEN_RE.search(string))


def isLinkClose(string: str) -> bool:
return bool(LINK_CLOSE_RE.search(string))


def linkify(state: StateCore) -> None:
blockTokens = state.tokens

"""Rule for identifying plain-text links."""
if not state.md.options.linkify:
return

if not state.md.linkify:
raise ModuleNotFoundError("Linkify enabled but not installed.")

for j in range(len(blockTokens)):
if blockTokens[j].type != "inline" or not state.md.linkify.pretest(
blockTokens[j].content
for inline_token in state.tokens:
if inline_token.type != "inline" or not state.md.linkify.pretest(
inline_token.content
):
continue

tokens = blockTokens[j].children
tokens = inline_token.children

htmlLinkLevel = 0

Expand Down Expand Up @@ -71,38 +62,47 @@ def linkify(state: StateCore) -> None:
currentToken.content
):
text = currentToken.content
links = state.md.linkify.match(text)
links: list[_LinkType] = state.md.linkify.match(text) or []

# Now split string to nodes
nodes = []
level = currentToken.level
lastPos = 0

for ln in range(len(links)):
url = links[ln].url
# forbid escape sequence at the start of the string,
# this avoids http\://example.com/ from being linkified as
# http:<a href="//example.com/">//example.com/</a>
if (
links
and links[0].index == 0
and i > 0
and tokens[i - 1].type == "text_special"
):
links = links[1:]

for link in links:
url = link.url
fullUrl = state.md.normalizeLink(url)
if not state.md.validateLink(fullUrl):
continue

urlText = links[ln].text
urlText = link.text

# Linkifier might send raw hostnames like "example.com", where url
# starts with domain name. So we prepend http:// in those cases,
# and remove it afterwards.
if not links[ln].schema:
if not link.schema:
urlText = HTTP_RE.sub(
"", state.md.normalizeLinkText("http://" + urlText)
)
elif links[ln].schema == "mailto:" and TEST_MAILTO_RE.search(
urlText
):
elif link.schema == "mailto:" and TEST_MAILTO_RE.search(urlText):
urlText = MAILTO_RE.sub(
"", state.md.normalizeLinkText("mailto:" + urlText)
)
else:
urlText = state.md.normalizeLinkText(urlText)

pos = links[ln].index
pos = link.index

if pos > lastPos:
token = Token("text", "", 0)
Expand Down Expand Up @@ -130,12 +130,20 @@ def linkify(state: StateCore) -> None:
token.info = "auto"
nodes.append(token)

lastPos = links[ln].last_index
lastPos = link.last_index

if lastPos < len(text):
token = Token("text", "", 0)
token.content = text[lastPos:]
token.level = level
nodes.append(token)

blockTokens[j].children = tokens = arrayReplaceAt(tokens, i, nodes)
inline_token.children = tokens = arrayReplaceAt(tokens, i, nodes)


class _LinkType(Protocol):
    """Structural type for the items returned by ``state.md.linkify.match``.

    Only the attributes read by the ``linkify`` core rule are declared here.
    """

    # URL of the match; it is passed through ``normalizeLink`` and
    # ``validateLink`` before a link is emitted.
    url: str
    # Matched text; used as the starting point for the visible link text.
    text: str
    # Offset in the scanned text at which the match starts.
    index: int
    # Offset in the scanned text from which the remaining text continues
    # after the match.
    last_index: int
    # Scheme of the match (compared against ``"mailto:"``); falsy when the
    # linkifier matched something without a scheme, e.g. a raw hostname.
    schema: str | None
2 changes: 2 additions & 0 deletions markdown_it/rules_inline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"text",
"fragments_join",
"link_pairs",
"linkify",
"escape",
"newline",
"backtick",
Expand All @@ -24,6 +25,7 @@
from .html_inline import html_inline
from .image import image
from .link import link
from .linkify import linkify
from .newline import newline
from .state_inline import StateInline
from .text import text
6 changes: 6 additions & 0 deletions markdown_it/rules_inline/html_inline.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Process html tags
from ..common.html_re import HTML_TAG_RE
from ..common.utils import isLinkClose, isLinkOpen
from .state_inline import StateInline


Expand Down Expand Up @@ -33,5 +34,10 @@ def html_inline(state: StateInline, silent: bool) -> bool:
token = state.push("html_inline", "", 0)
token.content = state.src[pos : pos + len(match.group(0))]

if isLinkOpen(token.content):
state.linkLevel += 1
if isLinkClose(token.content):
state.linkLevel -= 1

state.pos += len(match.group(0))
return True
2 changes: 2 additions & 0 deletions markdown_it/rules_inline/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,9 @@ def link(state: StateInline, silent: bool) -> bool:
if label and state.md.options.get("store_labels", False):
token.meta["label"] = label

state.linkLevel += 1
state.md.inline.tokenize(state)
state.linkLevel -= 1

token = state.push("link_close", "a", -1)

Expand Down
61 changes: 61 additions & 0 deletions markdown_it/rules_inline/linkify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Process links like https://example.org/"""
import re

from .state_inline import StateInline

# RFC3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
# The pattern is anchored at the END of the text: it captures the scheme-like
# run of characters that finishes the string, provided that run is preceded by
# the start of the string or by a character that cannot be part of a scheme.
SCHEME_RE = re.compile(r"(?:^|[^a-z0-9.+-])([a-z][a-z0-9.+-]*)$", re.IGNORECASE)


def linkify(state: StateInline, silent: bool) -> bool:
    """Rule for identifying plain-text links.

    The rule fires when ``state.pos`` points at ``://``. The scheme is taken
    from the tail of ``state.pending`` and the whole candidate is handed to
    ``state.md.linkify.match_at_start``.

    :param state: inline parser state; ``pos`` is advanced past the link and,
        unless ``silent``, ``pending`` is trimmed and three tokens
        (``link_open``, ``text``, ``link_close``) are pushed.
    :param silent: when true, only report/skip the link without emitting tokens.
    :return: True if a link was recognised at the current position.
    :raises ModuleNotFoundError: if the ``linkify`` option is enabled but
        ``state.md.linkify`` is not available.
    """
    if not state.md.options.linkify:
        return False
    # Do not create links inside an already open link.
    if state.linkLevel > 0:
        return False
    if not state.md.linkify:
        raise ModuleNotFoundError("Linkify enabled but not installed.")

    pos = state.pos
    maximum = state.posMax

    if (pos + 3) > maximum or not state.src.startswith("://", pos):
        return False

    # ``search`` is required here: ``re.match`` would anchor at the start of
    # ``pending`` and therefore miss a scheme that follows earlier text such
    # as "see https".  SCHEME_RE already pins the match to the end via ``$``.
    if not (match := SCHEME_RE.search(state.pending)):
        return False

    proto = match.group(1)
    if not (link := state.md.linkify.match_at_start(state.src[pos - len(proto) :])):
        return False
    url: str = link.url

    # A "link" that is no longer than its scheme would leave ``state.pos``
    # unchanged or move it backwards below; reject it.
    if len(url) <= len(proto):
        return False

    # disallow '*' at the end of the link (conflicts with emphasis)
    url = url.rstrip("*")

    full_url = state.md.normalizeLink(url)
    if not state.md.validateLink(full_url):
        return False

    if not silent:
        # The scheme is re-emitted as part of the link text, so drop it from
        # the pending plain text.
        state.pending = state.pending[: -len(proto)]

        token = state.push("link_open", "a", 1)
        token.attrs = {"href": full_url}
        token.markup = "linkify"
        token.info = "auto"

        token = state.push("text", "", 0)
        token.content = state.md.normalizeLinkText(url)

        token = state.push("link_close", "a", -1)
        token.markup = "linkify"
        token.info = "auto"

    # ``url`` starts ``len(proto)`` characters before ``pos``.
    state.pos += len(url) - len(proto)
    return True
4 changes: 4 additions & 0 deletions markdown_it/rules_inline/state_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ def __init__(
self.backticks: dict[int, int] = {}
self.backticksScanned = False

# Counter used to disable inline linkify-it execution
# inside <a> and markdown links
self.linkLevel = 0

def __repr__(self) -> str:
return (
f"{self.__class__.__name__}"
Expand Down
1 change: 1 addition & 0 deletions tests/test_api/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def test_get_rules():
],
"inline": [
"text",
"linkify",
"newline",
"escape",
"backticks",
Expand Down
Loading