mirror of
https://github.com/pacnpal/thrillwiki_django_no_react.git
synced 2025-12-23 03:11:08 -05:00
okay fine
This commit is contained in:
29
.venv/lib/python3.12/site-packages/autobahn/nvx/__init__.py
Normal file
29
.venv/lib/python3.12/site-packages/autobahn/nvx/__init__.py
Normal file
@@ -0,0 +1,29 @@
|
||||
###############################################################################
|
||||
#
|
||||
# The MIT License (MIT)
|
||||
#
|
||||
# Copyright (c) typedef int GmbH
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
from autobahn.nvx._utf8validator import Utf8Validator # noqa
|
||||
|
||||
__all__ = ('Utf8Validator',)
|
||||
648
.venv/lib/python3.12/site-packages/autobahn/nvx/_utf8validator.c
Normal file
648
.venv/lib/python3.12/site-packages/autobahn/nvx/_utf8validator.c
Normal file
@@ -0,0 +1,648 @@
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The MIT License (MIT)
|
||||
//
|
||||
// Copyright (c) typedef int GmbH
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// http://stackoverflow.com/questions/11228855/header-files-for-simd-intrinsics
|
||||
#if defined(__SSE2__) || defined(__SSE4_1__)
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define UTF8_ACCEPT 0
|
||||
#define UTF8_REJECT 1
|
||||
|
||||
|
||||
/* Per-stream state of an incremental UTF-8 validator. */
typedef struct {
   /* Set to (size_t) -1 by nvx_utf8vld_reset().
      NOTE(review): no validation routine visible here updates it - confirm intended use. */
   size_t current_index;

   /* Set to (size_t) -1 by nvx_utf8vld_reset().
      NOTE(review): no validation routine visible here updates it - confirm intended use. */
   size_t total_index;

   /* Current DFA state, carried across validate calls (0 = accept, 1 = reject). */
   int state;

   /* Selected implementation, one of the UTF8_VALIDATOR_* identifiers. */
   int impl;
} utf8_validator_t;
|
||||
|
||||
|
||||
#define UTF8_VALIDATOR_OPTIMAL 0
|
||||
#define UTF8_VALIDATOR_TABLE_DFA 1
|
||||
#define UTF8_VALIDATOR_UNROLLED_DFA 2
|
||||
#define UTF8_VALIDATOR_SSE2_DFA 3
|
||||
#define UTF8_VALIDATOR_SSE41_DFA 4
|
||||
|
||||
|
||||
int nvx_utf8vld_get_impl (void* utf8vld) {
|
||||
utf8_validator_t* vld = (utf8_validator_t*) utf8vld;
|
||||
|
||||
return vld->impl;
|
||||
}
|
||||
|
||||
int nvx_utf8vld_set_impl (void* utf8vld, int impl) {
|
||||
utf8_validator_t* vld = (utf8_validator_t*) utf8vld;
|
||||
|
||||
if (impl) {
|
||||
// set requested implementation
|
||||
//
|
||||
#ifndef __SSE4_1__
|
||||
# ifdef __SSE2__
|
||||
if (impl <= UTF8_VALIDATOR_SSE2_DFA) {
|
||||
vld->impl = impl;
|
||||
}
|
||||
# else
|
||||
if (impl <= UTF8_VALIDATOR_UNROLLED_DFA) {
|
||||
vld->impl = impl;
|
||||
}
|
||||
# endif
|
||||
#else
|
||||
if (impl <= UTF8_VALIDATOR_SSE41_DFA) {
|
||||
vld->impl = impl;
|
||||
}
|
||||
#endif
|
||||
|
||||
} else {
|
||||
// set optimal implementation
|
||||
//
|
||||
#ifndef __SSE4_1__
|
||||
# ifdef __SSE2__
|
||||
vld->impl = UTF8_VALIDATOR_SSE2_DFA;
|
||||
# else
|
||||
vld->impl = UTF8_VALIDATOR_UNROLLED_DFA;
|
||||
# endif
|
||||
#else
|
||||
vld->impl = UTF8_VALIDATOR_SSE41_DFA;
|
||||
#endif
|
||||
|
||||
}
|
||||
return vld->impl;
|
||||
}
|
||||
|
||||
|
||||
void nvx_utf8vld_reset (void* utf8vld) {
|
||||
utf8_validator_t* vld = (utf8_validator_t*) utf8vld;
|
||||
|
||||
vld->state = 0;
|
||||
vld->current_index = -1;
|
||||
vld->total_index = -1;
|
||||
}
|
||||
|
||||
|
||||
void* nvx_utf8vld_new () {
|
||||
void* p = malloc(sizeof(utf8_validator_t));
|
||||
nvx_utf8vld_reset(p);
|
||||
nvx_utf8vld_set_impl(p, 0);
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
/* Release a validator handle obtained from nvx_utf8vld_new().
   The pointer is handed straight to free(), so NULL is accepted. */
void nvx_utf8vld_free (void* utf8vld) {
   void* handle = utf8vld;
   free(handle);
}
|
||||
|
||||
|
||||
// unrolled DFA from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
||||
//
|
||||
// Combined lookup table for the table-driven DFA:
//   entries   0..255  map an input octet to its character class (0..11)
//   entries 256..399  hold one 16-entry row per DFA state (0..8), indexed by
//                     character class and yielding the successor state
static const uint8_t UTF8VALIDATOR_DFA[] __attribute__((aligned(64))) =
{
   // octet -> character class
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..0f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 10..1f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..2f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 30..3f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..4f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 50..5f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..6f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 70..7f
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 80..8f
   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 90..9f
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..af
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // b0..bf
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..cf
   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // d0..df
   0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
   0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff

   // (state, character class) -> next state
   0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s1
   1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s2
   1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1, // s3
   1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s4
   1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1, // s5
   1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s6
   1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s7
   1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1  // s8
};
|
||||
|
||||
|
||||
int _nvx_utf8vld_validate_table (void* utf8vld, const uint8_t* data, size_t length) {
|
||||
|
||||
utf8_validator_t* vld = (utf8_validator_t*) utf8vld;
|
||||
|
||||
int state = vld->state;
|
||||
|
||||
const uint8_t* end = data + length;
|
||||
|
||||
while (data < end && state != 1) {
|
||||
state = UTF8VALIDATOR_DFA[256 + state * 16 + UTF8VALIDATOR_DFA[*data++]];
|
||||
}
|
||||
|
||||
vld->state = state;
|
||||
|
||||
if (state == 0) {
|
||||
// UTF8 is valid and ends on codepoint
|
||||
return 0;
|
||||
} else {
|
||||
if (state == 1) {
|
||||
// UTF8 is invalid
|
||||
return -1;
|
||||
} else {
|
||||
// UTF8 is valid, but does not end on codepoint (needs more data)
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// unrolled DFA from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
||||
//
|
||||
/*
 * One step of the unrolled UTF-8 DFA: return the successor of `state` after
 * consuming `octet`.
 *
 * States: 0 = accept (code point boundary), 1 = reject (sticky), 2..8 =
 * inside a multi-octet sequence. Any state outside 0..8 is returned as is.
 */
static inline int nvx_utf8_dfa_step (int state, int octet) {
   switch (state) {
      case 0:
         if (octet >= 0x00 && octet <= 0x7f) {
            return 0;
         }
         if (octet >= 0xc2 && octet <= 0xdf) {
            return 2;
         }
         if ((octet >= 0xe1 && octet <= 0xec) || octet == 0xee || octet == 0xef) {
            return 3;
         }
         if (octet == 0xe0) {
            return 4;
         }
         if (octet == 0xed) {
            return 5;
         }
         if (octet == 0xf0) {
            return 6;
         }
         if (octet == 0xf1 || octet == 0xf2 || octet == 0xf3) {
            return 7;
         }
         if (octet == 0xf4) {
            return 8;
         }
         return 1;
      case 2:
         return (octet >= 0x80 && octet <= 0xbf) ? 0 : 1;
      case 3:
         return (octet >= 0x80 && octet <= 0xbf) ? 2 : 1;
      case 4:
         return (octet >= 0xa0 && octet <= 0xbf) ? 2 : 1;
      case 5:
         return (octet >= 0x80 && octet <= 0x9f) ? 2 : 1;
      case 6:
         return (octet >= 0x90 && octet <= 0xbf) ? 3 : 1;
      case 7:
         return (octet >= 0x80 && octet <= 0xbf) ? 3 : 1;
      case 8:
         return (octet >= 0x80 && octet <= 0x8f) ? 3 : 1;
      default:
         /* reflective state 1; other values should not arrive here */
         return state;
   }
}

/*
 * Advance the lvalue `state` by one input `octet`.
 *
 * Wrapped in do { } while (0) so it behaves as a single statement (safe in
 * unbraced if/else), and routed through a function so `octet` is evaluated
 * exactly once.
 */
#define DFA_TRANSITION(state, octet) \
   do { \
      (state) = nvx_utf8_dfa_step((state), (octet)); \
   } while (0)
|
||||
|
||||
|
||||
int _nvx_utf8vld_validate_unrolled (void* utf8vld, const uint8_t* data, size_t length) {
|
||||
|
||||
utf8_validator_t* vld = (utf8_validator_t*) utf8vld;
|
||||
|
||||
int state = vld->state;
|
||||
|
||||
const uint8_t* tail_end = data + length;
|
||||
|
||||
while (data < tail_end && state != 1) {
|
||||
|
||||
// get tail octet
|
||||
int octet = *data;
|
||||
|
||||
// do the DFA
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
++data;
|
||||
}
|
||||
|
||||
vld->state = state;
|
||||
|
||||
if (state == 0) {
|
||||
// UTF8 is valid and ends on codepoint
|
||||
return 0;
|
||||
} else {
|
||||
if (state == 1) {
|
||||
// UTF8 is invalid
|
||||
return -1;
|
||||
} else {
|
||||
// UTF8 is valid, but does not end on codepoint (needs more data)
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
__m128i _mm_load_si128 (__m128i const* mem_addr)
|
||||
#include "emmintrin.h"
|
||||
Instruction: movdqa
|
||||
CPUID Feature Flag: SSE2
|
||||
|
||||
int _mm_movemask_epi8 (__m128i a)
|
||||
#include "emmintrin.h"
|
||||
Instruction: pmovmskb
|
||||
CPUID Feature Flag: SSE2
|
||||
|
||||
__m128i _mm_srli_si128 (__m128i a, int imm)
|
||||
#include "emmintrin.h"
|
||||
Instruction: psrldq
|
||||
CPUID Feature Flag: SSE2
|
||||
|
||||
int _mm_cvtsi128_si32 (__m128i a)
|
||||
#include "emmintrin.h"
|
||||
Instruction: movd
|
||||
CPUID Feature Flag: SSE2
|
||||
|
||||
int _mm_extract_epi16 (__m128i a, int imm)
|
||||
#include "emmintrin.h"
|
||||
Instruction: pextrw
|
||||
CPUID Feature Flag: SSE2
|
||||
|
||||
int _mm_extract_epi8 (__m128i a, const int imm)
|
||||
#include "smmintrin.h"
|
||||
Instruction: pextrb
|
||||
CPUID Feature Flag: SSE4.1
|
||||
*/
|
||||
|
||||
#ifdef __SSE2__
|
||||
int _nvx_utf8vld_validate_sse2 (void* utf8vld, const uint8_t* data, size_t length) {
|
||||
|
||||
utf8_validator_t* vld = (utf8_validator_t*) utf8vld;
|
||||
|
||||
int state = vld->state;
|
||||
|
||||
const uint8_t* tail_end = data + length;
|
||||
|
||||
// process unaligned head (sub 16 octets)
|
||||
//
|
||||
size_t head_len = ((size_t) data) % sizeof(__m128i);
|
||||
if (head_len) {
|
||||
|
||||
const uint8_t* head_end = data + head_len;
|
||||
|
||||
while (data < head_end && state != UTF8_REJECT) {
|
||||
|
||||
// get head octet
|
||||
int octet = *data;
|
||||
|
||||
// do the DFA
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
++data;
|
||||
}
|
||||
}
|
||||
|
||||
// process aligned middle (16 octet chunks)
|
||||
//
|
||||
const __m128i* ptr = ((const __m128i*) data);
|
||||
const __m128i* end = ((const __m128i*) data) + ((length - head_len) / sizeof(__m128i));
|
||||
|
||||
while (ptr < end && state != UTF8_REJECT) {
|
||||
|
||||
__builtin_prefetch(ptr + 1, 0, 3);
|
||||
//__builtin_prefetch(ptr + 4, 0, 3); // 16*4=64: cache-line prefetch
|
||||
|
||||
__m128i xmm1 = _mm_load_si128(ptr);
|
||||
|
||||
if (__builtin_expect(state || _mm_movemask_epi8(xmm1), 0)) {
|
||||
|
||||
// copy to different reg - this allows the prefetching to
|
||||
// do its job in the meantime (I guess ..)
|
||||
|
||||
// SSE2 variant
|
||||
//
|
||||
int octet;
|
||||
|
||||
// octet 0
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 1
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 2
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 3
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 4
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 5
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 6
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 7
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 8
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 9
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 10
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 11
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 12
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 13
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 14
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
// octet 15
|
||||
xmm1 = _mm_srli_si128(xmm1, 1);
|
||||
octet = 0xff & _mm_cvtsi128_si32(xmm1);
|
||||
DFA_TRANSITION(state, octet);
|
||||
}
|
||||
++ptr;
|
||||
}
|
||||
|
||||
// process unaligned tail (sub 16 octets)
|
||||
//
|
||||
const uint8_t* tail_ptr = (const uint8_t*) ptr;
|
||||
|
||||
while (tail_ptr < tail_end && state != UTF8_REJECT) {
|
||||
|
||||
// get tail octet
|
||||
int octet = *tail_ptr;
|
||||
|
||||
// do the DFA
|
||||
DFA_TRANSITION(state, octet);
|
||||
|
||||
++tail_ptr;
|
||||
}
|
||||
|
||||
vld->state = state;
|
||||
|
||||
if (state == UTF8_ACCEPT) {
|
||||
// UTF8 is valid and ends on codepoint
|
||||
return 0;
|
||||
} else {
|
||||
if (state == UTF8_REJECT) {
|
||||
// UTF8 is invalid
|
||||
return -1;
|
||||
} else {
|
||||
// UTF8 is valid, but does not end on codepoint (needs more data)
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __SSE4_1__

// One DFA step on byte lane `lane` of `chunk`. The lane index has to be a
// literal because _mm_extract_epi8 takes an immediate operand.
#define NVX_SSE41_STEP(chunk, lane) \
   do { \
      int octet = _mm_extract_epi8((chunk), (lane)); \
      DFA_TRANSITION(state, octet); \
   } while (0)

/*
 * SSE4.1 validation: same scheme as the SSE2 variant (scalar head up to the
 * next 16-byte boundary, aligned 16-octet chunks, scalar tail), but the
 * octets of a chunk are read with _mm_extract_epi8.
 *
 * Continues from the state stored on the validator and stores the resulting
 * state back.
 *
 * Returns  0 if the input so far is valid and ends on a code point boundary,
 *          1 if it is valid so far but ends inside a code point (needs more data),
 *         -1 if it is invalid.
 */
int _nvx_utf8vld_validate_sse4 (void* utf8vld, const uint8_t* data, size_t length) {

   utf8_validator_t* vld = (utf8_validator_t*) utf8vld;

   int state = vld->state;

   const uint8_t* tail_end = data + length;

   // process unaligned head: the octets up to the next 16-byte boundary.
   // The head length is the distance TO the boundary (not the misalignment
   // itself) and can never exceed the input length.
   //
   size_t misalign = ((uintptr_t) data) % sizeof(__m128i);
   size_t head_len = misalign ? (sizeof(__m128i) - misalign) : 0;
   if (head_len > length) {
      head_len = length;
   }
   const uint8_t* head_end = data + head_len;

   while (data < head_end && state != UTF8_REJECT) {
      int octet = *data;
      DFA_TRANSITION(state, octet);
      ++data;
   }

   // process aligned middle (16 octet chunks); only entered when the head
   // was consumed completely, so `data` is 16-byte aligned here
   //
   if (state != UTF8_REJECT) {

      const __m128i* ptr = (const __m128i*) data;
      const __m128i* end = ptr + (((size_t) (tail_end - data)) / sizeof(__m128i));

      while (ptr < end && state != UTF8_REJECT) {

         __builtin_prefetch(ptr + 1, 0, 3);

         __m128i chunk = _mm_load_si128(ptr);

         // skip the chunk when on a code point boundary and all octets are ASCII
         if (__builtin_expect(state || _mm_movemask_epi8(chunk), 0)) {
            NVX_SSE41_STEP(chunk, 0);
            NVX_SSE41_STEP(chunk, 1);
            NVX_SSE41_STEP(chunk, 2);
            NVX_SSE41_STEP(chunk, 3);
            NVX_SSE41_STEP(chunk, 4);
            NVX_SSE41_STEP(chunk, 5);
            NVX_SSE41_STEP(chunk, 6);
            NVX_SSE41_STEP(chunk, 7);
            NVX_SSE41_STEP(chunk, 8);
            NVX_SSE41_STEP(chunk, 9);
            NVX_SSE41_STEP(chunk, 10);
            NVX_SSE41_STEP(chunk, 11);
            NVX_SSE41_STEP(chunk, 12);
            NVX_SSE41_STEP(chunk, 13);
            NVX_SSE41_STEP(chunk, 14);
            NVX_SSE41_STEP(chunk, 15);
         }
         ++ptr;
      }

      data = (const uint8_t*) ptr;
   }

   // process unaligned tail (sub 16 octets)
   //
   while (data < tail_end && state != UTF8_REJECT) {
      int octet = *data;
      DFA_TRANSITION(state, octet);
      ++data;
   }

   vld->state = state;

   if (state == UTF8_ACCEPT) {
      // UTF8 is valid and ends on codepoint
      return 0;
   }
   if (state == UTF8_REJECT) {
      // UTF8 is invalid
      return -1;
   }
   // UTF8 is valid, but does not end on codepoint (needs more data)
   return 1;
}

#undef NVX_SSE41_STEP

#endif
|
||||
|
||||
|
||||
int nvx_utf8vld_validate (void* utf8vld, const uint8_t* data, size_t length) {
|
||||
|
||||
utf8_validator_t* vld = (utf8_validator_t*) utf8vld;
|
||||
|
||||
switch (vld->impl) {
|
||||
case UTF8_VALIDATOR_TABLE_DFA:
|
||||
return _nvx_utf8vld_validate_table(utf8vld, data, length);
|
||||
case UTF8_VALIDATOR_UNROLLED_DFA:
|
||||
return _nvx_utf8vld_validate_unrolled(utf8vld, data, length);
|
||||
#ifdef __SSE2__
|
||||
case UTF8_VALIDATOR_SSE2_DFA:
|
||||
return _nvx_utf8vld_validate_table(utf8vld, data, length);
|
||||
#endif
|
||||
#ifdef __SSE4_1__
|
||||
case UTF8_VALIDATOR_SSE41_DFA:
|
||||
return _nvx_utf8vld_validate_table(utf8vld, data, length);
|
||||
#endif
|
||||
default:
|
||||
return _nvx_utf8vld_validate_table(utf8vld, data, length);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,86 @@
|
||||
###############################################################################
|
||||
#
|
||||
# The MIT License (MIT)
|
||||
#
|
||||
# Copyright (c) typedef int GmbH
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
import os
|
||||
from cffi import FFI
|
||||
|
||||
|
||||
ffi = FFI()
|
||||
|
||||
ffi.cdef("""
|
||||
void* nvx_utf8vld_new ();
|
||||
|
||||
void nvx_utf8vld_reset (void* utf8vld);
|
||||
|
||||
int nvx_utf8vld_validate (void* utf8vld, const uint8_t* data, size_t length);
|
||||
|
||||
void nvx_utf8vld_free (void* utf8vld);
|
||||
|
||||
int nvx_utf8vld_set_impl(void* utf8vld, int impl);
|
||||
|
||||
int nvx_utf8vld_get_impl(void* utf8vld);
|
||||
""")
|
||||
|
||||
# The extension build is treated as mandatory only when AUTOBAHN_USE_NVX is
# set to '1' or 'true'; in every other case it is marked optional.
optional = os.environ.get('AUTOBAHN_USE_NVX') not in ['1', 'true']  # :noindex:
|
||||
|
||||
with open(os.path.join(os.path.dirname(__file__), '_utf8validator.c')) as fd:
|
||||
c_source = fd.read()
|
||||
ffi.set_source(
|
||||
"_nvx_utf8validator",
|
||||
c_source,
|
||||
libraries=[],
|
||||
extra_compile_args=['-std=c99', '-Wall', '-Wno-strict-prototypes', '-O3', '-march=native'],
|
||||
optional=optional
|
||||
)
|
||||
|
||||
|
||||
class Utf8Validator:
    """
    :noindex:

    Incremental UTF-8 validator backed by the compiled ``_nvx_utf8validator``
    extension module.
    """

    def __init__(self):
        self.ffi = ffi

        # imported lazily so that importing this module does not require the
        # compiled extension to be present
        from _nvx_utf8validator import lib
        self.lib = lib

        # tie the native validator's lifetime to this object: the handle is
        # released through nvx_utf8vld_free when it is garbage collected
        handle = self.lib.nvx_utf8vld_new()
        self._vld = self.ffi.gc(handle, self.lib.nvx_utf8vld_free)

    def reset(self):
        """
        Reset the native validator to its initial state.
        """
        self.lib.nvx_utf8vld_reset(self._vld)

    def validate(self, ba):
        """
        Feed ``ba`` to the native validator.

        :returns: A 4-tuple ``(valid, ends_on_codepoint, None, None)``.
            ``valid`` is true when the native result is non-negative and
            ``ends_on_codepoint`` is true when it is exactly zero; the last
            two slots are always ``None``.
        """
        result = self.lib.nvx_utf8vld_validate(self._vld, ba, len(ba))
        valid = result >= 0
        ends_on_codepoint = result == 0
        return (valid, ends_on_codepoint, None, None)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ffi.compile()
|
||||
@@ -0,0 +1,25 @@
|
||||
###############################################################################
|
||||
#
|
||||
# The MIT License (MIT)
|
||||
#
|
||||
# Copyright (c) typedef int GmbH
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
#
|
||||
###############################################################################
|
||||
@@ -0,0 +1,359 @@
|
||||
# coding=utf-8
|
||||
|
||||
###############################################################################
|
||||
#
|
||||
# The MIT License (MIT)
|
||||
#
|
||||
# Copyright (c) typedef int GmbH
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
import struct
|
||||
import unittest
|
||||
|
||||
from autobahn.websocket.utf8validator import Utf8Validator as StandardUtf8Validator
|
||||
|
||||
try:
|
||||
from _nvx_utf8validator import lib # noqa
|
||||
from autobahn.nvx import Utf8Validator as NvxUtf8Validator
|
||||
except ImportError:
|
||||
HAS_NVX = False
|
||||
else:
|
||||
HAS_NVX = True
|
||||
|
||||
|
||||
def _create_utf8_test_sequences():
    """
    Create test sequences for UTF-8 decoder tests from
    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

    :returns: A list of groups. Each group is a two-element list
        ``[title, cases]`` where ``title`` is a bytes label and ``cases`` is a
        list of ``(expected_valid, payload)`` tuples.
    """
    sequences = []

    def add_group(title, cases):
        sequences.append([title, list(cases)])

    def all_of(expected, payloads):
        return [(expected, payload) for payload in payloads]

    # 1 Some correct UTF-8 text
    vss = b'\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5'
    add_group(b"Some valid UTF-8 sequences", all_of(True, [
        b'hello\x24world',  # U+0024
        b'hello\xC2\xA2world',  # U+00A2
        b'hello\xE2\x82\xACworld',  # U+20AC
        b'hello\xF0\xA4\xAD\xA2world',  # U+24B62
        vss,
    ]))

    # All prefixes of correct UTF-8 text; the expected result comes from the
    # standard validator: a prefix counts as valid only when it is valid AND
    # ends on a code point boundary
    validator = StandardUtf8Validator()
    prefixes = []
    for end in range(1, len(vss) + 1):
        validator.reset()
        res = validator.validate(vss[:end])
        prefixes.append((res[0] and res[1], vss[:end]))
    add_group(
        b"All prefixes of a valid UTF-8 string that contains multi-byte code points",
        prefixes)

    # 2.1 First possible sequence of a certain length
    add_group(b"First possible sequence of a certain length", all_of(True, [
        b'\x00',
        b'\xc2\x80',
        b'\xe0\xa0\x80',
        b'\xf0\x90\x80\x80',
    ]))

    # the following conform to the UTF-8 integer encoding scheme, but
    # valid UTF-8 only allows for Unicode code points up to U+10FFFF
    add_group(b"First possible sequence length 5/6 (invalid codepoints)", all_of(False, [
        b'\xf8\x88\x80\x80\x80',
        b'\xfc\x84\x80\x80\x80\x80',
    ]))

    # 2.2 Last possible sequence of a certain length
    add_group(b"Last possible sequence of a certain length", all_of(True, [
        b'\x7f',
        b'\xdf\xbf',
        b'\xef\xbf\xbf',
        b'\xf4\x8f\xbf\xbf',
    ]))

    # the following conform to the UTF-8 integer encoding scheme, but
    # valid UTF-8 only allows for Unicode code points up to U+10FFFF
    add_group(b"Last possible sequence length 4/5/6 (invalid codepoints)", all_of(False, [
        b'\xf7\xbf\xbf\xbf',
        b'\xfb\xbf\xbf\xbf\xbf',
        b'\xfd\xbf\xbf\xbf\xbf\xbf',
    ]))

    # 2.3 Other boundary conditions
    add_group(b"Other boundary conditions", [
        (True, b'\xed\x9f\xbf'),
        (True, b'\xee\x80\x80'),
        (True, b'\xef\xbf\xbd'),
        (True, b'\xf4\x8f\xbf\xbf'),
        (False, b'\xf4\x90\x80\x80'),
    ])

    # 3.1 Unexpected continuation bytes
    # (this group used to be built but never added to the result)
    add_group(b"Unexpected continuation bytes", all_of(False, [
        b'\x80',
        b'\xbf',
        b'\x80\xbf',
        b'\x80\xbf\x80',
        b'\x80\xbf\x80\xbf',
        b'\x80\xbf\x80\xbf\x80',
        b'\x80\xbf\x80\xbf\x80\xbf',
        bytes(range(0x80, 0xc0)),  # all 64 possible continuation bytes
    ]))

    # 3.2 Lonely start characters: every start byte of a range (bounds
    # inclusive), each followed by a space character
    lonely = []
    for first, last in [(0xc0, 0xdf), (0xe0, 0xef), (0xf0, 0xf7), (0xf8, 0xfb), (0xfc, 0xfd)]:
        payload = b''.join(struct.pack('BB', octet, 0x20) for octet in range(first, last + 1))
        lonely.append((False, payload))
    add_group(b"Lonely start characters", lonely)

    # 3.3 Sequences with last continuation byte missing
    incomplete = [
        b'\xc0', b'\xe0\x80', b'\xf0\x80\x80', b'\xf8\x80\x80\x80', b'\xfc\x80\x80\x80\x80',
        b'\xdf', b'\xef\xbf', b'\xf7\xbf\xbf', b'\xfb\xbf\xbf\xbf', b'\xfd\xbf\xbf\xbf\xbf',
    ]
    add_group(b"Sequences with last continuation byte missing", all_of(False, incomplete))

    # 3.4 Concatenation of incomplete sequences
    add_group(b"Concatenation of incomplete sequences", [(False, b''.join(incomplete))])

    # 3.5 Impossible bytes
    add_group(b"Impossible bytes", all_of(False, [
        b'\xfe',
        b'\xff',
        b'\xfe\xfe\xff\xff',
    ]))

    # 4.1 Examples of an overlong ASCII character
    add_group(b"Examples of an overlong ASCII character", all_of(False, [
        b'\xc0\xaf',
        b'\xe0\x80\xaf',
        b'\xf0\x80\x80\xaf',
        b'\xf8\x80\x80\x80\xaf',
        b'\xfc\x80\x80\x80\x80\xaf',
    ]))

    # 4.2 Maximum overlong sequences
    add_group(b"Maximum overlong sequences", all_of(False, [
        b'\xc1\xbf',
        b'\xe0\x9f\xbf',
        b'\xf0\x8f\xbf\xbf',
        b'\xf8\x87\xbf\xbf\xbf',
        b'\xfc\x83\xbf\xbf\xbf\xbf',
    ]))

    # 4.3 Overlong representation of the NUL character
    add_group(b"Overlong representation of the NUL character", all_of(False, [
        b'\xc0\x80',
        b'\xe0\x80\x80',
        b'\xf0\x80\x80\x80',
        b'\xf8\x80\x80\x80\x80',
        b'\xfc\x80\x80\x80\x80\x80',
    ]))

    # 5.1 Single UTF-16 surrogates
    add_group(b"Single UTF-16 surrogates", all_of(False, [
        b'\xed\xa0\x80',
        b'\xed\xad\xbf',
        b'\xed\xae\x80',
        b'\xed\xaf\xbf',
        b'\xed\xb0\x80',
        b'\xed\xbe\x80',
        b'\xed\xbf\xbf',
    ]))

    # 5.2 Paired UTF-16 surrogates
    add_group(b"Paired UTF-16 surrogates", all_of(False, [
        b'\xed\xa0\x80\xed\xb0\x80',
        b'\xed\xa0\x80\xed\xbf\xbf',
        b'\xed\xad\xbf\xed\xb0\x80',
        b'\xed\xad\xbf\xed\xbf\xbf',
        b'\xed\xae\x80\xed\xb0\x80',
        b'\xed\xae\x80\xed\xbf\xbf',
        b'\xed\xaf\xbf\xed\xb0\x80',
        b'\xed\xaf\xbf\xed\xbf\xbf',
    ]))

    # 5.3 Other illegal code positions
    # Those are non-character code points and valid UTF-8 by RFC 3629
    # https://bug686312.bugzilla.mozilla.org/attachment.cgi?id=561257
    # non-characters: EF BF [BE-BF]
    non_characters = [(True, b'\xef\xbf\xbe'), (True, b'\xef\xbf\xbf')]
    # non-characters: F[0-7] [89AB]F BF [BE-BF]
    overlong = [b'\xf0\x8f\xbf\xbe', b'\xf0\x8f\xbf\xbf']
    for z1 in [b'\xf0', b'\xf1', b'\xf2', b'\xf3', b'\xf4']:
        for z2 in [b'\x8f', b'\x9f', b'\xaf', b'\xbf']:
            # those encode codepoints >U+10FFFF
            if z1 == b'\xf4' and z2 != b'\x8f':
                continue
            for z3 in [b'\xbe', b'\xbf']:
                zz = z1 + z2 + b'\xbf' + z3
                if zz not in overlong:  # filter overlong sequences
                    non_characters.append((True, zz))
    add_group(b"Non-character code points (valid UTF-8)", non_characters)

    # Unicode "specials", such as replacement char etc
    # http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29
    add_group(b"Unicode specials (i.e. replacement char)", all_of(True, [
        b'\xef\xbf\xb9',
        b'\xef\xbf\xba',
        b'\xef\xbf\xbb',
        b'\xef\xbf\xbc',
        b'\xef\xbf\xbd',  # replacement char
        b'\xef\xbf\xbe',
        b'\xef\xbf\xbf',
    ]))

    return sequences
|
||||
|
||||
|
||||
def _create_valid_utf8_test_sequences():
    """
    Reduce the full UTF-8 test corpus to its valid payloads.

    Each group returned by ``_create_utf8_test_sequences()`` is filtered down
    to the byte strings flagged as valid; groups left without any valid
    payload are omitted.

    :returns: A list of ``[group_title, [valid_payload, ...]]`` entries.
    """
    # Pair every group title with the payloads whose validity flag is truthy.
    filtered_groups = (
        (group[0], [case[1] for case in group[1] if case[0]])
        for group in _create_utf8_test_sequences()
    )
    # Keep only the groups that still contain at least one payload.
    return [[title, payloads] for title, payloads in filtered_groups if payloads]
|
||||
|
||||
|
||||
@unittest.skipIf(not HAS_NVX, 'NVX native extensions not present')
class TestNvxUtf8Validator(unittest.TestCase):
    """
    Run the standard and the NVX UTF-8 validators against a shared corpus
    of valid and invalid byte sequences.
    """

    def setUp(self):
        """
        Build ``self._TEST_SEQUENCES``, a flat list of
        ``(expected_valid, payload)`` tuples consumed by ``_test_utf8()``.
        """
        # These tests verify the UTF-8 decoder/validator on the various test cases from
        # http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
        vs = []
        for k in _create_utf8_test_sequences():
            vs.extend(k[1])

        # All code points U+0000..U+FFFF. range() excludes its stop value,
        # so the stop must be 0x10000 for U+FFFF to be covered as well.
        for i in range(0, 0x10000):
            if i < 0xD800 or i > 0xDFFF:  # filter surrogate code points, which are disallowed to encode in UTF-8
                vs.append((True, chr(i).encode("utf-8")))

        def encode_surrogate(code_point):
            # The strict UTF-8 codec raises UnicodeEncodeError for surrogates;
            # "surrogatepass" emits the (invalid) 3-byte form we want to test.
            return chr(code_point).encode("utf-8", "surrogatepass")

        # 5.1 Single UTF-16 surrogates
        for i in range(0xD800, 0xDC00):  # high-surrogates U+D800..U+DBFF
            vs.append((False, encode_surrogate(i)))
        for i in range(0xDC00, 0xE000):  # low-surrogates U+DC00..U+DFFF
            vs.append((False, encode_surrogate(i)))

        # 5.2 Paired UTF-16 surrogates
        # Only the boundary values of each range are combined: the full
        # cross product is 1024 * 1024 pairs in two orderings, and setUp()
        # runs before every single test method.
        for i in (0xD800, 0xDBFF):  # high-surrogate
            for j in (0xDC00, 0xDFFF):  # low-surrogate
                ss1 = encode_surrogate(i)
                ss2 = encode_surrogate(j)
                vs.append((False, ss1 + ss2))
                vs.append((False, ss2 + ss1))

        self._TEST_SEQUENCES = vs

    def test_standard_utf8validator(self):
        """
        Test standard implementation of UTF8 validator.
        """
        validator = StandardUtf8Validator()
        self._test_utf8(validator)

    def test_nvx_utf8validator(self):
        """
        Test NVX implementation of UTF8 validator.
        """
        validator = NvxUtf8Validator()
        self._test_utf8(validator)

    def test_standard_utf8validator_incremental(self):
        """
        Test standard implementation of UTF8 validator in incremental mode.
        """
        validator = StandardUtf8Validator()
        self._test_utf8_incremental(validator)

    # FIXME
    # see also (I think ..): https://twistedmatrix.com/trac/ticket/4811
    #
    # import pytest
    #
    # @pytest.mark.xfail(reason='NVX UTF8 validator lacks incremental mode implementation')
    # @unittest.expectedFailure
    # def test_nvx_utf8validator_incremental(self):
    #     """
    #     Test NVX implementation of UTF8 validator in incremental mode.
    #     """
    #     validator = NvxUtf8Validator()
    #     return self._test_utf8_incremental(validator)

    def _test_utf8(self, validator):
        """
        Validate every payload in ``self._TEST_SEQUENCES`` from a freshly
        reset validator and compare the outcome with the expected flag.

        :param validator: Object providing ``reset()`` and ``validate(bytes)``.
        """
        for s in self._TEST_SEQUENCES:
            validator.reset()
            r = validator.validate(s[1])

            # no UTF-8 decode error _and_ everything consumed
            res = r[0] and r[1]

            # Name the offending payload so a failure among the tens of
            # thousands of sequences can actually be located.
            self.assertEqual(res, s[0], msg='unexpected validation result for {!r}'.format(s[1]))

    def _test_utf8_incremental(self, validator, withPositions=True):
        """
        Feed data to the validator in several chunks and check the result
        tuple returned after each chunk.

        :param validator: Object providing ``reset()`` and ``validate(bytes)``.
        :param withPositions: If true, compare all 4 elements of the result
            tuple; otherwise compare only the first 2.
        """
        # These tests verify that the UTF-8 decoder/validator can operate incrementally.
        if withPositions:
            # testing validator 4 on incremental detection with positions
            k = 4
        else:
            # testing validator 2 on incremental detection without positions
            k = 2

        validator.reset()
        self.assertEqual((True, True, 15, 15)[:k], validator.validate('µ@ßöäüàá'.encode('utf8'))[:k])

        validator.reset()
        self.assertEqual((False, False, 0, 0)[:k], validator.validate(b"\xF5")[:k])

        # the following 3 all fail on eating byte 7 (0xA0)
        validator.reset()
        self.assertEqual((True, True, 6, 6)[:k], validator.validate(b"\x65\x64\x69\x74\x65\x64")[:k])
        self.assertEqual((False, False, 1, 7)[:k], validator.validate(b"\xED\xA0\x80")[:k])

        validator.reset()
        self.assertEqual((True, True, 4, 4)[:k], validator.validate(b"\x65\x64\x69\x74")[:k])
        self.assertEqual((False, False, 3, 7)[:k], validator.validate(b"\x65\x64\xED\xA0\x80")[:k])

        validator.reset()
        self.assertEqual((True, False, 7, 7)[:k], validator.validate(b"\x65\x64\x69\x74\x65\x64\xED")[:k])
        self.assertEqual((False, False, 0, 7)[:k], validator.validate(b"\xA0\x80")[:k])
|
||||
Reference in New Issue
Block a user