Skip to content

<regex>: regex_traits::transform_primary should yield primary sort keys appropriate for the imbued locale #5444

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
May 10, 2025
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions stl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ set(IMPLIB_SOURCES
${CMAKE_CURRENT_LIST_DIR}/src/locale0_implib.cpp
${CMAKE_CURRENT_LIST_DIR}/src/nothrow.cpp
${CMAKE_CURRENT_LIST_DIR}/src/print.cpp
${CMAKE_CURRENT_LIST_DIR}/src/regex.cpp
${CMAKE_CURRENT_LIST_DIR}/src/sharedmutex.cpp
${CMAKE_CURRENT_LIST_DIR}/src/stacktrace.cpp
${CMAKE_CURRENT_LIST_DIR}/src/syserror_import_lib.cpp
Expand Down
5 changes: 5 additions & 0 deletions stl/inc/locale
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ inline size_t __CRTDECL _LStrxfrm(_Out_writes_(_Last1 - _First1) _Post_readable_
}
#endif // defined(_CRTBLD)

template <class _Elem>
class _Regex_traits;

_EXPORT_STD template <class _Elem>
class collate : public locale::facet { // facet for ordering sequences of elements
public:
Expand Down Expand Up @@ -189,6 +192,8 @@ protected:

private:
_Locinfo::_Collvec _Coll; // used by _LStrcoll and _XStrxfrm

friend _Regex_traits<_Elem>;
};

#ifdef __clang__
Expand Down
92 changes: 73 additions & 19 deletions stl/inc/regex
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,17 @@ _STL_DISABLE_CLANG_WARNINGS
#endif // ^^^ !defined(_DEBUG) ^^^
#endif // !defined(_ENHANCED_REGEX_VISUALIZER)

#ifdef _CPPRTTI
// Separately compiled helpers (defined in stl/src/regex.cpp) used by _Regex_traits::transform_primary.
// Each writes a primary sort key for the source range [_First2, _Last2) into the destination range
// [_First1, _Last1), using the collation data in the _Collvec argument (or the C runtime's LC_COLLATE
// settings when that pointer is null).
// Returns the number of elements the key occupies; when this exceeds the destination size the caller
// must retry with a larger buffer. Returns static_cast<size_t>(-1) on failure.
// Only declared when RTTI is available, because the sole caller inspects the collate facet with typeid.
extern "C" {
_STD size_t __stdcall __std_regex_transform_primary_char(
_Out_writes_(_Last1 - _First1) _Post_readable_size_(return) char* _First1, char* _Last1,
_In_reads_(_Last2 - _First2) const char* _First2, const char* _Last2, _In_opt_ const _Collvec*) noexcept;
_STD size_t __stdcall __std_regex_transform_primary_wchar_t(
_Out_writes_(_Last1 - _First1) _Post_readable_size_(return) wchar_t* _First1, wchar_t* _Last1,
_In_reads_(_Last2 - _First2) const wchar_t* _First2, const wchar_t* _Last2, _In_opt_ const _Collvec*) noexcept;
} // extern "C"
#endif // ^^^ defined(_CPPRTTI) ^^^

_STD_BEGIN

enum _Meta_type : int { // meta character representations for parser
Expand Down Expand Up @@ -267,6 +278,20 @@ struct _Regex_traits_base { // base of all regular expression traits
using char_class_type = ctype_base::mask;
};

#ifdef _CPPRTTI
// Overload set that lets the _Regex_traits template select the matching extern "C" helper by element type.
// Both overloads forward their arguments unchanged and return the helper's result as is.

// char overload: forwards to __std_regex_transform_primary_char.
inline size_t _Regex_transform_primary(_Out_writes_(_Last1 - _First1) _Post_readable_size_(return) char* _First1,
char* _Last1, _In_reads_(_Last2 - _First2) const char* _First2, const char* _Last2,
_In_opt_ const _Locinfo::_Collvec* _Vector) noexcept {
return __std_regex_transform_primary_char(_First1, _Last1, _First2, _Last2, _Vector);
}

// wchar_t overload: forwards to __std_regex_transform_primary_wchar_t.
inline size_t _Regex_transform_primary(_Out_writes_(_Last1 - _First1) _Post_readable_size_(return) wchar_t* _First1,
wchar_t* _Last1, _In_reads_(_Last2 - _First2) const wchar_t* _First2, const wchar_t* _Last2,
_In_opt_ const _Locinfo::_Collvec* _Vector) noexcept {
return __std_regex_transform_primary_wchar_t(_First1, _Last1, _First2, _Last2, _Vector);
}
#endif // ^^^ defined(_CPPRTTI) ^^^

template <class _Elem>
class _Regex_traits : public _Regex_traits_base { // base class for regular expression traits
public:
Expand Down Expand Up @@ -312,13 +337,38 @@ public:
string_type transform_primary(_FwdIt _First, _FwdIt _Last) const {
// apply locale-specific case-insensitive transformation
string_type _Res;

if (_First != _Last) { // non-empty string, transform it
vector<_Elem> _Temp(_First, _Last);

_Getctype()->tolower(_Temp.data(), _Temp.data() + _Temp.size());
_Res = _Getcoll()->transform(_Temp.data(), _Temp.data() + _Temp.size());
#ifdef _CPPRTTI
if (_First != _Last) {
const collate<_Elem>* _Coll = _Getcoll();
const auto& _Coll_type = typeid(*_Coll);
// TRANSITION, ABI: GH-5394: locale creates collate objects of type collate, not collate_byname.
// Depending on the resolution of LWG-2338, comparison to typeid(collate) might also become
// required by the standard.
if (_Coll_type == typeid(collate_byname<_Elem>) || _Coll_type == typeid(collate<_Elem>)) {
// non-empty string with known collate facet, transform it
const string_type _Src(_First, _Last);
const auto _Src_first = _Src.data();
const auto _Src_last = _Src_first + _Src.size();

size_t _Count = _Src.size();
while (_Res.size() < _Count) {
_Res.resize(_Count);
_Count = _STD _Regex_transform_primary(
&_Res[0], &_Res[0] + _Count, _Src_first, _Src_last, &_Coll->_Coll);

if (_Count == static_cast<size_t>(-1)) {
// return empty string in case of error
_Count = 0;
break;
}
}
_Res.resize(_Count);
}
}
#else // ^^^ defined(_CPPRTTI) / !defined(_CPPRTTI) vvv
(void) _First;
(void) _Last;
#endif // ^^^ !defined(_CPPRTTI) ^^^
return _Res;
}

Expand Down Expand Up @@ -4181,26 +4231,30 @@ _Prs_ret _Parser<_FwdIt, _Elem, _RxTraits>::_Do_ex_class2(

_Elem* const _Coll_elem_first = &_Coll_elem.front();
const _Elem* const _Coll_elem_last = _Coll_elem_first + _Size;

if (_Size == 1 && _End_arg == _Meta_dot) {
// process single-element collating elements like individual characters
_Val = *_Coll_elem_first;
return _Prs_chr;
}

if (_Flags & regex_constants::icase) {
for (auto _Current = _Coll_elem_first; _Current != _Coll_elem_last; ++_Current) {
*_Current = _Traits.translate_nocase(*_Current);
}
} else if (_Flags & regex_constants::collate) {
for (auto _Current = _Coll_elem_first; _Current != _Coll_elem_last; ++_Current) {
*_Current = _Traits.translate(*_Current);
}
}

if (_End_arg == _Meta_equal) { // process equivalence
_Nfa._Add_equiv2(_Coll_elem_first, _Coll_elem_last);
return _Prs_set;
} else { // process collating element
if (_Size == 1) {
_Val = *_Coll_elem_first;
return _Prs_chr;
}

// Character ranges with multi-character bounds cannot be represented in NFA nodes yet (see GH-5391).
// Provisionally treat multi-character collating elements as character sets.
if (_Flags & regex_constants::icase) {
for (auto _Current = _Coll_elem_first; _Current != _Coll_elem_last; ++_Current) {
*_Current = _Traits.translate_nocase(*_Current);
}
} else if (_Flags & regex_constants::collate) {
for (auto _Current = _Coll_elem_first; _Current != _Coll_elem_last; ++_Current) {
*_Current = _Traits.translate(*_Current);
}
}
_Nfa._Add_coll2(_Coll_elem_first, _Coll_elem_last);
return _Prs_set;
}
Expand Down
3 changes: 2 additions & 1 deletion stl/inc/yvals.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// This header is used to compile the import library (via locale0_implib.cpp => locale0.cpp => xfacet => yvals.h).
// This header is used to compile the import library
// (via locale0_implib.cpp => locale0.cpp => xfacet => yvals.h and regex.cpp => awint.hpp => yvals.h).
// MAJOR LIMITATIONS apply to what can be included here!
// Before editing this file, read: /docs/import_library.md

Expand Down
1 change: 1 addition & 0 deletions stl/msbuild/stl_base/stl.files.settings.targets
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
$(CrtRoot)\github\stl\src\locale0_implib.cpp;
$(CrtRoot)\github\stl\src\nothrow.cpp;
$(CrtRoot)\github\stl\src\print.cpp;
$(CrtRoot)\github\stl\src\regex.cpp;
$(CrtRoot)\github\stl\src\sharedmutex.cpp;
$(CrtRoot)\github\stl\src\stacktrace.cpp;
$(CrtRoot)\github\stl\src\syserror_import_lib.cpp;
Expand Down
5 changes: 5 additions & 0 deletions stl/src/awint.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Internal definitions for A&W Win32 wrapper routines.

// This file is compiled into the import library (via regex.cpp => awint.hpp).
// MAJOR LIMITATIONS apply to what can be included here!
// Before editing this file, read: /docs/import_library.md

#pragma once

#include <yvals.h>
Expand Down
128 changes: 128 additions & 0 deletions stl/src/regex.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// This file is compiled into the import library.
// MAJOR LIMITATIONS apply to what can be included here!
// Before editing this file, read: /docs/import_library.md

#include <__msvc_xlocinfo_types.hpp>
#include <clocale>
#include <crtdefs.h>
#include <cstdlib>
#include <cstring>
#include <internal_shared.h>

#include <Windows.h>

#undef _ENFORCE_ONLY_CORE_HEADERS
#include "awint.hpp"

extern "C" {

// derived from xstrxfrm.cpp
//
// Builds a sort key for the narrow string [string2, end2) with LCMAP_SORTKEY combined with the
// ignore-case, ignore-diacritic, ignore-kana-type and ignore-width flags, and stores it in [string1, end1).
// The locale name and code page come from *ploc, or from the C runtime's LC_COLLATE settings if ploc is null.
// When no locale name is set and the code page is CP_ACP, the source is copied through unchanged.
//
// Returns the number of chars the result occupies. If that is larger than the destination, nothing is
// written and the caller is expected to retry with a buffer of at least the returned size.
// Returns static_cast<size_t>(-1) if the key could not be computed.
size_t __stdcall __std_regex_transform_primary_char(
    _Out_writes_(end1 - string1) _Post_readable_size_(return) char* string1, char* end1,
    _In_reads_(end2 - string2) const char* string2, const char* end2, _In_opt_ const _Collvec* ploc) noexcept {
    const size_t n1 = end1 - string1;
    const size_t n2 = end2 - string2;
    size_t retval   = static_cast<size_t>(-1);
    UINT codepage;
    const wchar_t* locale_name;

    if (ploc == nullptr) {
        locale_name = ___lc_locale_name_func()[LC_COLLATE];
        codepage    = ___lc_collate_cp_func();
    } else {
        locale_name = ploc->_LocaleName;
        codepage    = ploc->_Page;
    }

    if (locale_name == nullptr && codepage == CP_ACP) {
        if (n2 <= n1) {
            memcpy(string1, string2, n2);
        }
        retval = n2;
    } else {
        // Inquire size of dst string in BYTES
        const int dstlen = __crtLCMapStringA(locale_name,
            LCMAP_SORTKEY | LINGUISTIC_IGNORECASE | LINGUISTIC_IGNOREDIACRITIC | NORM_IGNOREKANATYPE | NORM_IGNOREWIDTH,
            string2, static_cast<int>(n2), nullptr, 0, codepage, TRUE);

        if (dstlen > 0) {
            retval = static_cast<size_t>(dstlen);

            // If there is not enough room, only report the amount needed.
            // Compare as size_t so that a destination larger than INT_MAX is not misjudged as too small.
            if (retval <= n1) {
                // Map src string to dst string. dstlen is passed as the capacity because it is known to
                // fit in an int, whereas n1 might not.
                const int written = __crtLCMapStringA(locale_name,
                    LCMAP_SORTKEY | LINGUISTIC_IGNORECASE | LINGUISTIC_IGNOREDIACRITIC | NORM_IGNOREKANATYPE
                        | NORM_IGNOREWIDTH,
                    string2, static_cast<int>(n2), string1, dstlen, codepage, TRUE);

                // Do not report a key length for a buffer that was never filled.
                retval = written > 0 ? static_cast<size_t>(written) : static_cast<size_t>(-1);
            }
        }
    }

    return retval;
}

// derived from xwcsxfrm.cpp
//
// Builds a sort key for the wide string [string2, end2) with LCMAP_SORTKEY combined with the
// ignore-case, ignore-diacritic, ignore-kana-type and ignore-width flags, and stores it in [string1, end1),
// one wchar_t per sort key byte. The locale name comes from *ploc, or from the C runtime's LC_COLLATE
// settings if ploc is null. When no locale name is set, the source is copied through unchanged.
//
// Returns the number of wchar_t elements the result occupies. If that is larger than the destination,
// nothing is written and the caller is expected to retry with a buffer of at least the returned size.
// Returns static_cast<size_t>(-1) if the key could not be computed or the scratch buffer could not be allocated.
size_t __stdcall __std_regex_transform_primary_wchar_t(
    _Out_writes_(end1 - string1) _Post_readable_size_(return) wchar_t* string1, wchar_t* end1,
    _In_reads_(end2 - string2) const wchar_t* string2, const wchar_t* end2, _In_opt_ const _Collvec* ploc) noexcept {
    const size_t n1      = end1 - string1;
    const size_t n2      = end2 - string2;
    const size_t failure = static_cast<size_t>(-1);
    const wchar_t* locale_name;

    if (ploc == nullptr) {
        locale_name = ___lc_locale_name_func()[LC_COLLATE];
    } else {
        locale_name = ploc->_LocaleName;
    }

    if (locale_name == nullptr) {
        if (n2 <= n1) {
            memcpy(string1, string2, n2 * sizeof(wchar_t));
        }
        return n2;
    }

    // When using LCMAP_SORTKEY, LCMapStringW handles BYTES not wide
    // chars. We use a byte buffer to hold bytes and then convert the
    // byte string to a wide char string and return this so it can be
    // compared using wcscmp(). User's buffer is n1 wide chars, so
    // use an internal buffer of n1 bytes.
    //
    // An empty destination must skip the mapping attempt: a destination size of 0 turns the call into a
    // size query, and its non-zero result would otherwise be mistaken for n1-bounded output and copied
    // past the end of both buffers.
    if (n1 != 0) {
        auto bbuffer = _malloc_crt_t(unsigned char, n1);

        if (bbuffer) {
#pragma warning(push)
#pragma warning(disable : 6386) // PREfast doesn't understand LCMAP_SORTKEY
            const int mapped = __crtLCMapStringW(locale_name,
                LCMAP_SORTKEY | LINGUISTIC_IGNORECASE | LINGUISTIC_IGNOREDIACRITIC | NORM_IGNOREKANATYPE
                    | NORM_IGNOREWIDTH,
                string2, static_cast<int>(n2), reinterpret_cast<wchar_t*>(bbuffer.get()), static_cast<int>(n1));
#pragma warning(pop)

            if (mapped > 0) {
                // string successfully mapped, convert to wide char
                for (int i = 0; i < mapped; ++i) {
                    string1[i] = static_cast<wchar_t>(bbuffer.get()[i]);
                }

                return static_cast<size_t>(mapped);
            }
        } else {
            return failure; // no scratch buffer available
        }
    }

    // destination empty or not big enough, get size required.
    const int needed = __crtLCMapStringW(locale_name,
        LCMAP_SORTKEY | LINGUISTIC_IGNORECASE | LINGUISTIC_IGNOREDIACRITIC | NORM_IGNOREKANATYPE | NORM_IGNOREWIDTH,
        string2, static_cast<int>(n2), nullptr, 0);

    return needed > 0 ? static_cast<size_t>(needed) : failure; // 0 means the mapping itself failed
}
} // extern "C"
1 change: 0 additions & 1 deletion tests/libcxx/expected_results.txt
Original file line number Diff line number Diff line change
Expand Up @@ -830,7 +830,6 @@ std/re/re.alg/re.alg.search/basic.pass.cpp FAIL
std/re/re.alg/re.alg.search/ecma.pass.cpp FAIL
std/re/re.alg/re.alg.search/extended.pass.cpp FAIL
std/re/re.traits/lookup_collatename.pass.cpp FAIL
std/re/re.traits/transform_primary.pass.cpp FAIL

# Not analyzed, likely STL bugs. Various assertions.
std/numerics/complex.number/complex.ops/complex_divide_complex.pass.cpp FAIL
Expand Down
Loading