Skip to content

Commit 198c11f

Browse files
authored
Fix URL sanitizer for long URLs (#16026)
f1aa699 was fundamentally incorrect as it used `IdnToAscii` and `IdnToUnicode` on the entire URL, even though these functions only work on domain names. This commit fixes the issue by using the WinRT `Url` class and its `AbsoluteUri` and `AbsoluteCanonicalUri` getters. The algorithm still works the same way though. Closes #16017 ## Validation Steps Performed * ``"`e]8;;https://www.xn--fcbook-3nf5b.com/`e\test`e]8;;`e\"`` still shows as two URLs in the popup ✅ * Shows the given URI if it's canonical and not an IDN ✅ * Works with >100 char long file:// URIs ✅
1 parent cf19385 commit 198c11f

File tree

3 files changed

+48
-48
lines changed

3 files changed

+48
-48
lines changed

.github/actions/spelling/allow/apis.txt

+2
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ UFIELD
220220
ULARGE
221221
UOI
222222
UPDATEINIFILE
223+
urlmon
223224
userenv
224225
USEROBJECTFLAGS
225226
Vcpp
@@ -231,6 +232,7 @@ wcsstr
231232
wcstoui
232233
WDJ
233234
winhttp
235+
wininet
234236
winmain
235237
winsta
236238
winstamin

src/cascadia/TerminalControl/TermControl.cpp

+37-48
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,9 @@
44
#include "pch.h"
55
#include "TermControl.h"
66

7-
#include <unicode.hpp>
87
#include <LibraryResources.h>
98

109
#include "TermControlAutomationPeer.h"
11-
#include "../../types/inc/GlyphWidth.hpp"
1210
#include "../../renderer/atlas/AtlasEngine.h"
1311

1412
#include "TermControl.g.cpp"
@@ -3208,51 +3206,6 @@ namespace winrt::Microsoft::Terminal::Control::implementation
32083206
_core.ClearHoveredCell();
32093207
}
32103208

3211-
// Attackers abuse Unicode characters that happen to look similar to ASCII characters. Cyrillic for instance has
3212-
// its own glyphs for а, с, е, о, р, х, and у that look practically identical to their ASCII counterparts.
3213-
// This is called an "IDN homoglyph attack".
3214-
//
3215-
// But outright showing Punycode URIs only is similarly flawed as they can end up looking similar to valid ASCII URIs.
3216-
// xn--cnn.com for instance looks confusingly similar to cnn.com, but actually represents U+407E.
3217-
//
3218-
// An optimal solution would detect any URI that contains homoglyphs and show them in their Punycode form.
3219-
// Such a detector however is not quite trivial and requires constant maintenance, which this project's
3220-
// maintainers aren't currently well equipped to handle. As such we do the next best thing and show the
3221-
// Punycode encoding side-by-side with the Unicode string for any IDN.
3222-
static winrt::hstring sanitizeURI(winrt::hstring uri)
3223-
{
3224-
if (uri.empty())
3225-
{
3226-
return uri;
3227-
}
3228-
3229-
wchar_t punycodeBuffer[256];
3230-
wchar_t unicodeBuffer[256];
3231-
3232-
// These functions return int, but are documented to only return positive numbers.
3233-
// Better make sure though. It allows us to pass punycodeLength right into IdnToUnicode.
3234-
const auto punycodeLength = std::max(0, IdnToAscii(0, uri.data(), gsl::narrow<int>(uri.size()), &punycodeBuffer[0], 256));
3235-
const auto unicodeLength = std::max(0, IdnToUnicode(0, &punycodeBuffer[0], punycodeLength, &unicodeBuffer[0], 256));
3236-
3237-
if (punycodeLength <= 0 || unicodeLength <= 0)
3238-
{
3239-
return RS_(L"InvalidUri");
3240-
}
3241-
3242-
const std::wstring_view punycode{ &punycodeBuffer[0], gsl::narrow_cast<size_t>(punycodeLength) };
3243-
const std::wstring_view unicode{ &unicodeBuffer[0], gsl::narrow_cast<size_t>(unicodeLength) };
3244-
3245-
// IdnToAscii/IdnToUnicode return the input string as is if it's all
3246-
// plain ASCII. But we don't know if the input URI is Punycode or not.
3247-
// --> It's non-Punycode and ASCII if it round-trips.
3248-
if (uri == punycode && uri == unicode)
3249-
{
3250-
return uri;
3251-
}
3252-
3253-
return winrt::hstring{ fmt::format(FMT_COMPILE(L"{}\n({})"), punycode, unicode) };
3254-
}
3255-
32563209
void TermControl::_hoveredHyperlinkChanged(const IInspectable& /*sender*/, const IInspectable& /*args*/)
32573210
{
32583211
const auto lastHoveredCell = _core.HoveredCell();
@@ -3261,12 +3214,48 @@ namespace winrt::Microsoft::Terminal::Control::implementation
32613214
return;
32623215
}
32633216

3264-
const auto uriText = sanitizeURI(_core.HoveredUriText());
3217+
auto uriText = _core.HoveredUriText();
32653218
if (uriText.empty())
32663219
{
32673220
return;
32683221
}
32693222

3223+
// Attackers abuse Unicode characters that happen to look similar to ASCII characters. Cyrillic for instance has
3224+
// its own glyphs for а, с, е, о, р, х, and у that look practically identical to their ASCII counterparts.
3225+
// This is called an "IDN homoglyph attack".
3226+
//
3227+
// But outright showing Punycode URIs only is similarly flawed as they can end up looking similar to valid ASCII URIs.
3228+
// xn--cnn.com for instance looks confusingly similar to cnn.com, but actually represents U+407E.
3229+
//
3230+
// An optimal solution would detect any URI that contains homoglyphs and show them in their Punycode form.
3231+
// Such a detector however is not quite trivial and requires constant maintenance, which this project's
3232+
// maintainers aren't currently well equipped to handle. As such we do the next best thing and show the
3233+
// Punycode encoding side-by-side with the Unicode string for any IDN.
3234+
try
3235+
{
3236+
// DisplayUri/Iri drop authentication credentials, which is probably great, but AbsoluteCanonicalUri()
3237+
// is the only getter that returns a punycode encoding of the URL. AbsoluteUri() is the only possible
3238+
// counterpart, but as the name indicates, we'll end up hitting the != below for any non-canonical URL.
3239+
//
3240+
// This issue can be fixed by using the IUrl API from urlmon.h directly, which the WinRT API simply wraps.
3241+
// IUrl is a very complex system with a ton of useful functionality, but we don't rely on it (neither WinRT),
3242+
// so we could alternatively use its underlying API in wininet.h (InternetCrackUrlW, etc.).
3243+
// That API however is rather difficult to use for such seldom executed code.
3244+
const Windows::Foundation::Uri uri{ uriText };
3245+
const auto unicode = uri.AbsoluteUri();
3246+
const auto punycode = uri.AbsoluteCanonicalUri();
3247+
3248+
if (punycode != unicode)
3249+
{
3250+
const auto text = fmt::format(FMT_COMPILE(L"{}\n({})"), punycode, unicode);
3251+
uriText = winrt::hstring{ text };
3252+
}
3253+
}
3254+
catch (...)
3255+
{
3256+
uriText = RS_(L"InvalidUri");
3257+
}
3258+
32703259
const auto panel = SwapChainPanel();
32713260
const auto scale = panel.CompositionScaleX();
32723261
const auto offset = panel.ActualOffset();

src/cascadia/inc/cppwinrt_utils.h

+9
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,15 @@ Revision History:
1717

1818
#pragma once
1919

20+
template<>
21+
struct fmt::formatter<winrt::hstring, wchar_t> : fmt::formatter<fmt::wstring_view, wchar_t>
22+
{
23+
auto format(const winrt::hstring& str, auto& ctx)
24+
{
25+
return fmt::formatter<fmt::wstring_view, wchar_t>::format({ str.data(), str.size() }, ctx);
26+
}
27+
};
28+
2029
// This is a helper macro for both declaring the signature of an event, and
2130
// defining the body. Winrt events need a method for adding a callback to the
2231
// event and removing the callback. This macro will both declare the method

0 commit comments

Comments
 (0)