Skip to content

Base64 functions #3350

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,6 @@
[submodule "contrib/unixodbc"]
path = contrib/unixodbc
url = https://github.com/ClickHouse-Extras/UnixODBC.git
[submodule "contrib/base64"]
path = contrib/base64
url = https://github.com/aklomp/base64.git
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ include (cmake/find_capnp.cmake)
include (cmake/find_llvm.cmake)
include (cmake/find_cpuid.cmake)
include (cmake/find_consistent-hashing.cmake)
include (cmake/find_base64.cmake)
if (ENABLE_TESTS)
include (cmake/find_gtest.cmake)
endif ()
Expand Down
12 changes: 12 additions & 0 deletions cmake/find_base64.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Locates the bundled base64 library (git submodule contrib/base64).
#
# Input:
#   ENABLE_BASE64       - user option, ON by default.
# Output (only set when the option is ON and the submodule is checked out):
#   BASE64_INCLUDE_DIR  - directory that contains libbase64.h
#   BASE64_LIBRARY      - name of the CMake target to link against
#   USE_BASE64          - 1; consumed by config.h.in and by the CMakeLists that build/link the library
option (ENABLE_BASE64 "Enable base64" ON)

if (ENABLE_BASE64)
    # Probe for a file from the submodule rather than for the directory itself:
    # a submodule that was never initialised still leaves an (empty) directory behind,
    # and enabling USE_BASE64 in that state would only fail later, while compiling.
    if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/base64/include/libbase64.h")
        message (WARNING "submodule contrib/base64 is missing. to fix try run: \n git submodule update --init --recursive")
    else()
        set (BASE64_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/base64/include")
        set (BASE64_LIBRARY base64)
        set (USE_BASE64 1)
    endif()
endif ()

34 changes: 32 additions & 2 deletions cmake/test_cpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,38 @@ if (HAVE_SSE42)
set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
endif ()

# Probe whether the compiler accepts -mssse3 and can compile an SSSE3 intrinsic.
# The outcome is stored in HAVE_SSSE3. Unlike the SSE4.2 probe above, the flag is
# NOT appended to COMPILER_FLAGS here: the result is only recorded.
set (TEST_FLAG "-mssse3")
# -O0 is added to the flags used for the probe compilation.
set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG} -O0")
check_cxx_source_compiles("
#include <tmmintrin.h>
int main() {
__m64 a = _mm_abs_pi8(__m64());
(void)a;
return 0;
}
" HAVE_SSSE3)

# Same probe for -mavx; the outcome is stored in HAVE_AVX.
set (TEST_FLAG "-mavx")
set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG} -O0")
check_cxx_source_compiles("
#include <immintrin.h>
int main() {
auto a = _mm256_insert_epi8(__m256i(), 0, 0);
(void)a;
return 0;
}
" HAVE_AVX)

# Same probe for -mavx2; the outcome is stored in HAVE_AVX2.
set (TEST_FLAG "-mavx2")
set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG} -O0")
check_cxx_source_compiles("
#include <immintrin.h>
int main() {
auto a = _mm256_add_epi16(__m256i(), __m256i());
(void)a;
return 0;
}
" HAVE_AVX2)

# gcc -dM -E -mpopcnt - < /dev/null | sort > gcc-dump-popcnt
#define __POPCNT__ 1
Expand All @@ -65,5 +97,3 @@ if (HAVE_POPCNT AND NOT ARCH_AARCH64)
endif ()

cmake_pop_check_state ()

# TODO: add here sse3 test if you want use it
3 changes: 3 additions & 0 deletions contrib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -191,3 +191,6 @@ if (USE_INTERNAL_LLVM_LIBRARY)
add_subdirectory (llvm/llvm)
endif ()

# Build the bundled base64 library (via the wrapper in base64-cmake) only when USE_BASE64 is set.
if (USE_BASE64)
add_subdirectory (base64-cmake)
endif()
1 change: 1 addition & 0 deletions contrib/base64
Submodule base64 added at a27c56
1 change: 1 addition & 0 deletions contrib/base64-cmake/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
config.h
52 changes: 52 additions & 0 deletions contrib/base64-cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Build script for the bundled base64 library whose sources live in contrib/base64.
# Produces the `base64` target and a generated config.h describing which SIMD codecs are enabled.

set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/base64")

# The generated header is written to the build tree so that configuring never modifies the source tree.
set(BASE64_CONFIG_H "${CMAKE_CURRENT_BINARY_DIR}/config.h")

# cast_to_bool(<var> <instruction>)
#   Normalises the HAVE_<var> check result into base64_<var> (always 0 or 1) and, when the
#   instruction set is available, stores the matching compiler flag in base64_<var>_opt.
#   A macro is used on purpose: the variables must be set in the caller's scope.
#   Example: cast_to_bool(AVX2 "-mavx2")
macro(cast_to_bool var instruction)
    if (HAVE_${var})
        set(base64_${var} 1)
        set(base64_${var}_opt ${instruction})
    else()
        set(base64_${var} 0)
    endif()
endmacro()

cast_to_bool(SSSE3 "-mssse3")
cast_to_bool(SSE41 "-msse4.1")
cast_to_bool(SSE42 "-msse4.2")
cast_to_bool(AVX "-mavx")
cast_to_bool(AVX2 "-mavx2")

# Fast unaligned access is assumed as soon as any of the x86 SIMD codecs is enabled.
set(HAVE_FAST_UNALIGNED_ACCESS 0)
if (base64_SSSE3 OR base64_SSE41 OR base64_SSE42 OR base64_AVX OR base64_AVX2)
    set(HAVE_FAST_UNALIGNED_ACCESS 1)
endif ()

# Compose config.h: the static template (NEON switches) followed by the detected x86 switches.
# The content is assembled in a variable and written once; the expansion is quoted so that
# nothing in the template can be split as a CMake list.
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/config-header.tpl" base64_config_content)
set(base64_config_content "${base64_config_content}#define HAVE_SSSE3 ${base64_SSSE3}\n")
set(base64_config_content "${base64_config_content}#define HAVE_SSE41 ${base64_SSE41}\n")
set(base64_config_content "${base64_config_content}#define HAVE_SSE42 ${base64_SSE42}\n")
set(base64_config_content "${base64_config_content}#define HAVE_AVX ${base64_AVX}\n")
set(base64_config_content "${base64_config_content}#define HAVE_AVX2 ${base64_AVX2}\n")
set(base64_config_content "${base64_config_content}#define HAVE_FAST_UNALIGNED_ACCESS ${HAVE_FAST_UNALIGNED_ACCESS}\n")
file(WRITE "${BASE64_CONFIG_H}" "${base64_config_content}")

add_library(base64 ${LINK_MODE}
    ${LIBRARY_DIR}/lib/lib.c
    ${LIBRARY_DIR}/lib/codec_choose.c
    ${LIBRARY_DIR}/lib/arch/avx/codec.c
    ${LIBRARY_DIR}/lib/arch/avx2/codec.c
    ${LIBRARY_DIR}/lib/arch/generic/codec.c
    ${LIBRARY_DIR}/lib/arch/neon32/codec.c
    ${LIBRARY_DIR}/lib/arch/neon64/codec.c
    ${LIBRARY_DIR}/lib/arch/sse41/codec.c
    ${LIBRARY_DIR}/lib/arch/sse42/codec.c
    ${LIBRARY_DIR}/lib/arch/ssse3/codec.c

    ${LIBRARY_DIR}/lib/codecs.h
    "${BASE64_CONFIG_H}")

# Enable each instruction set ONLY for the codec that implements it.
# Passing -mavx2 & co. to the whole target would allow the compiler to emit those
# instructions in lib.c, codec_choose.c and the generic codec as well, i.e. in code that
# runs before / regardless of the CPU feature dispatch, and crash on CPUs lacking them.
foreach (base64_isa SSSE3 SSE41 SSE42 AVX AVX2)
    if (base64_${base64_isa})
        string(TOLOWER "${base64_isa}" base64_isa_dir)
        set_source_files_properties("${LIBRARY_DIR}/lib/arch/${base64_isa_dir}/codec.c"
            PROPERTIES COMPILE_FLAGS "${base64_${base64_isa}_opt}")
    endif ()
endforeach ()

# config.h is looked up through the include path, hence the binary dir.
target_include_directories(base64 PRIVATE ${LIBRARY_DIR}/include "${CMAKE_CURRENT_BINARY_DIR}")
2 changes: 2 additions & 0 deletions contrib/base64-cmake/config-header.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#define HAVE_NEON32 0
#define HAVE_NEON64 0
1 change: 1 addition & 0 deletions dbms/src/Common/config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
#cmakedefine01 USE_POCO_MONGODB
#cmakedefine01 USE_POCO_NETSSL
#cmakedefine01 CLICKHOUSE_SPLIT_BINARY
#cmakedefine01 USE_BASE64
6 changes: 5 additions & 1 deletion dbms/src/Functions/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h Func

add_library(clickhouse_functions ${LINK_MODE} ${clickhouse_functions_sources})

target_link_libraries(clickhouse_functions PUBLIC dbms PRIVATE ${CONSISTENT_HASHING_LIBRARY} consistent-hashing-sumbur ${FARMHASH_LIBRARIES} ${METROHASH_LIBRARIES} murmurhash)
target_link_libraries(clickhouse_functions PUBLIC dbms PRIVATE ${CONSISTENT_HASHING_LIBRARY} consistent-hashing-sumbur ${FARMHASH_LIBRARIES} ${METROHASH_LIBRARIES} murmurhash ${BASE64_LIBRARY})

target_include_directories (clickhouse_functions SYSTEM BEFORE PUBLIC ${DIVIDE_INCLUDE_DIR})

Expand Down Expand Up @@ -81,3 +81,7 @@ endif ()
if (USE_EMBEDDED_COMPILER)
target_include_directories (clickhouse_functions SYSTEM BEFORE PUBLIC ${LLVM_INCLUDE_DIRS})
endif ()

if (USE_BASE64)
target_include_directories (clickhouse_functions SYSTEM PRIVATE ${BASE64_INCLUDE_DIR})
endif()
166 changes: 166 additions & 0 deletions dbms/src/Functions/FunctionBase64Conversion.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#include <Common/config.h>
#if USE_BASE64
#include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/GatherUtils/Algorithms.h>
#include <IO/WriteHelpers.h>
#include <libbase64.h>


namespace DB
{
using namespace GatherUtils;

namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int INCORRECT_DATA;
}

/// Traits for base64Encode: the SQL function name and an upper bound for the output buffer.
struct Base64Encode
{
    static constexpr auto name = "base64Encode";

    /// string_length: size of the source character buffer (the caller passes getChars().size(),
    ///                which counts one extra byte per string);
    /// string_count:  number of strings.
    /// Returns the number of bytes to reserve for all encoded strings, one extra byte per string included.
    static size_t getBufferSize(size_t string_length, size_t string_count)
    {
        const size_t payload_bytes = string_length - string_count;
        /// Every 3 input bytes become 4 output bytes; allow one additional group per string.
        const size_t group_count = payload_bytes / 3 + string_count;
        const size_t encoded_bytes = group_count * 4;
        return encoded_bytes + string_count;
    }
};

/// Traits for base64Decode: the SQL function name and an upper bound for the output buffer.
struct Base64Decode
{
    static constexpr auto name = "base64Decode";

    /// string_length: size of the source character buffer (the caller passes getChars().size(),
    ///                which counts one extra byte per string);
    /// string_count:  number of strings.
    /// Returns the number of bytes to reserve for all decoded strings, one extra byte per string included.
    static size_t getBufferSize(size_t string_length, size_t string_count)
    {
        const size_t payload_bytes = string_length - string_count;
        /// Every 4 input characters become 3 output bytes; allow one additional group per string.
        const size_t group_count = payload_bytes / 4 + string_count;
        const size_t decoded_bytes = group_count * 3;
        return decoded_bytes + string_count;
    }
};

/// Traits for tryBase64Decode: same buffer requirements as base64Decode, different function name.
struct TryBase64Decode
{
    static constexpr auto name = "tryBase64Decode";

    /// Delegates to Base64Decode: the size bound is the same for both decode variants.
    static size_t getBufferSize(size_t string_length, size_t string_count)
    {
        const size_t decode_bound = Base64Decode::getBufferSize(string_length, string_count);
        return decode_bound;
    }
};

/** SQL function that base64-encodes or base64-decodes every value of a String column.
  *
  * Func is one of Base64Encode, Base64Decode or TryBase64Decode and supplies:
  *  - Func::name          - the name under which the function is registered;
  *  - Func::getBufferSize - an upper bound for the size of the result character buffer.
  *
  * On invalid input base64Decode throws INCORRECT_DATA, while tryBase64Decode
  * produces an empty string for that row.
  */
template <typename Func>
class FunctionBase64Conversion : public IFunction
{
public:
    static constexpr auto name = Func::name;

    static FunctionPtr create(const Context &)
    {
        return std::make_shared<FunctionBase64Conversion>();
    }

    String getName() const override
    {
        return Func::name;
    }

    size_t getNumberOfArguments() const override
    {
        return 1;
    }

    bool useDefaultImplementationForConstants() const override
    {
        return true;
    }

    /// The single argument must be a String; the result is always a String.
    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
    {
        if (!WhichDataType(arguments[0].type).isString())
            throw Exception(
                "Illegal type " + arguments[0].type->getName() + " of 1 argument of function " + getName() + ". Must be String.",
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

        return std::make_shared<DataTypeString>();
    }

    /// Converts the argument column row by row into a new ColumnString stored at position `result`.
    /// Throws ILLEGAL_COLUMN if the argument is not a ColumnString, and INCORRECT_DATA
    /// if Func is Base64Decode and a value is not valid base64.
    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
    {
        const ColumnPtr column_string = block.getByPosition(arguments[0]).column;
        const ColumnString * input = checkAndGetColumn<ColumnString>(column_string.get());

        if (!input)
            throw Exception(
                "Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of first argument of function " + getName(),
                ErrorCodes::ILLEGAL_COLUMN);

        auto dst_column = ColumnString::create();
        auto & dst_data = dst_column->getChars();
        auto & dst_offsets = dst_column->getOffsets();

        /// Allocate the upper bound once; the buffer is shrunk to the real size after the loop.
        size_t reserve = Func::getBufferSize(input->getChars().size(), input->size());
        dst_data.resize(reserve);
        dst_offsets.resize(input_rows_count);

        const ColumnString::Offsets & src_offsets = input->getOffsets();

        auto source = reinterpret_cast<const char *>(input->getChars().data());
        auto dst = reinterpret_cast<char *>(dst_data.data());
        auto dst_pos = dst;

        size_t src_offset_prev = 0;

        int codec = getCodec();
        for (size_t row = 0; row < input_rows_count; ++row)
        {
            /// Offsets point past the extra byte that follows each string, hence the -1.
            size_t srclen = src_offsets[row] - src_offset_prev - 1;
            size_t outlen = 0;

            if constexpr (std::is_same_v<Func, Base64Encode>)
            {
                base64_encode(source, srclen, dst_pos, &outlen, codec);
            }
            else if constexpr (std::is_same_v<Func, Base64Decode>)
            {
                /// Compare with 1 explicitly: only 1 is treated as success, so a negative
                /// ("codec not available") result is not mistaken for a successful decode.
                if (base64_decode(source, srclen, dst_pos, &outlen, codec) != 1)
                {
                    throw Exception("Failed to " + getName() + " input '" + String(source, srclen) + "'", ErrorCodes::INCORRECT_DATA);
                }
            }
            else
            {
                /// A failed decode may have partially filled the buffer and set outlen.
                /// Discard that: the row becomes an empty string and the garbage is
                /// overwritten by the terminator below and by the following rows.
                if (base64_decode(source, srclen, dst_pos, &outlen, codec) != 1)
                    outlen = 0;
            }

            source += srclen + 1;

            /// Terminate every result string explicitly. The buffer is not otherwise
            /// initialised here, so skipping the byte would leave garbage in place of
            /// the terminating zero.
            dst_pos += outlen;
            *dst_pos = '\0';
            ++dst_pos;

            dst_offsets[row] = dst_pos - dst;
            src_offset_prev = src_offsets[row];
        }

        dst_data.resize(dst_pos - dst);

        block.getByPosition(result).column = std::move(dst_column);
    }

private:
    /// Codec flags passed to the base64 library.
    /// NOTE(review): 0 presumably lets the library pick the best codec at runtime - confirm against libbase64.h.
    static int getCodec()
    {
        return 0;
    }
};
}
#endif
14 changes: 14 additions & 0 deletions dbms/src/Functions/base64Decode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#include <Functions/FunctionBase64Conversion.h>
#if USE_BASE64
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>

namespace DB
{

/// Registers FunctionBase64Conversion<Base64Decode> in the given function factory.
void registerFunctionBase64Decode(FunctionFactory & factory)
{
factory.registerFunction<FunctionBase64Conversion<Base64Decode>>();
}
}
#endif
14 changes: 14 additions & 0 deletions dbms/src/Functions/base64Encode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#include <Functions/FunctionBase64Conversion.h>
#if USE_BASE64
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>

namespace DB
{

/// Registers FunctionBase64Conversion<Base64Encode> in the given function factory.
void registerFunctionBase64Encode(FunctionFactory & factory)
{
factory.registerFunction<FunctionBase64Conversion<Base64Encode>>();
}
}
#endif
10 changes: 10 additions & 0 deletions dbms/src/Functions/registerFunctionsString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ void registerFunctionSubstringUTF8(FunctionFactory &);
void registerFunctionAppendTrailingCharIfAbsent(FunctionFactory &);
void registerFunctionStartsWith(FunctionFactory &);
void registerFunctionEndsWith(FunctionFactory &);
#if USE_BASE64
void registerFunctionBase64Encode(FunctionFactory &);
void registerFunctionBase64Decode(FunctionFactory &);
void registerFunctionTryBase64Decode(FunctionFactory &);
#endif

void registerFunctionsString(FunctionFactory & factory)
{
Expand All @@ -38,6 +43,11 @@ void registerFunctionsString(FunctionFactory & factory)
registerFunctionAppendTrailingCharIfAbsent(factory);
registerFunctionStartsWith(factory);
registerFunctionEndsWith(factory);
#if USE_BASE64
registerFunctionBase64Encode(factory);
registerFunctionBase64Decode(factory);
registerFunctionTryBase64Decode(factory);
#endif
}

}
Expand Down
Loading