◐ Shell
clean mode source ↗

Make b64decode with validate=True faster by compiling regex by jaysonsantos · Pull Request #11634 · python/cpython

Hi there, I was running some code that had to decode a lot of base64-encoded data, and
I was using validate=True to also make sure the data was valid, but it seemed to be way
slower than validate=False. I saw that it uses a regex without compiling it, so I
thought of sending a PR.
I didn't reference an issue number because I couldn't find an existing issue, and the docs
say that trivial changes don't need one. Let me know if one is needed.
I also didn't add a test because Lib/test/test_base64.py already covers validate=True.

With the following benchmark (10,000,000 validating calls per implementation), the runtime goes from 15.174073122 seconds to 8.906353553999999 seconds.

import timeit

import re
import binascii
from base64 import _bytes_from_decode_data

def original_base64(s, altchars=None, validate=False):
    """Decode Base64 data, passing the validation pattern to ``re.match`` per call.

    This is the baseline side of the benchmark: the pattern is handed to the
    ``re`` module as a bytes literal on every validating call instead of
    being compiled once up front.

    Args:
        s: The encoded input; normalised via
            ``base64._bytes_from_decode_data`` before use.
        altchars: Optional pair of characters that stand in for ``+`` and
            ``/`` in the input. Must have length 2 once normalised.
        validate: When true, the input is checked against the Base64
            alphabet pattern before decoding.

    Returns:
        The decoded bytes produced by ``binascii.a2b_base64``.

    Raises:
        binascii.Error: If ``validate`` is true and the input does not match
            the pattern.
        AssertionError: If ``altchars`` is given and is not of length 2.
    """
    payload = _bytes_from_decode_data(s)

    if altchars is not None:
        alt = _bytes_from_decode_data(altchars)
        assert len(alt) == 2, repr(alt)
        # Map the alternative alphabet back onto the standard '+' and '/'.
        to_standard = bytes.maketrans(alt, b'+/')
        payload = payload.translate(to_standard)

    if validate:
        well_formed = re.match(b'^[A-Za-z0-9+/]*={0,2}$', payload)
        if not well_formed:
            raise binascii.Error('Non-base64 digit found')

    return binascii.a2b_base64(payload)

# Compiled once at import time so a validating call does not have to go
# through the ``re`` module's pattern lookup on every invocation.
VALID_BASE64_REGEX = re.compile(b'^[A-Za-z0-9+/]*={0,2}$')


def changed_base64(s, altchars=None, validate=False):
    """Decode Base64 data, validating with the precompiled module-level pattern.

    Args:
        s: The encoded input; normalised via
            ``base64._bytes_from_decode_data`` before use.
        altchars: Optional pair of characters that stand in for ``+`` and
            ``/`` in the input. Must have length 2 once normalised.
        validate: When true, the whole input must consist of Base64 alphabet
            characters followed by at most two ``=`` padding characters.
            Anything else, including a trailing newline, is rejected.

    Returns:
        The decoded bytes produced by ``binascii.a2b_base64``.

    Raises:
        binascii.Error: If ``validate`` is true and the input contains a
            character outside the accepted form.
        AssertionError: If ``altchars`` is given and is not of length 2.
    """
    s = _bytes_from_decode_data(s)
    if altchars is not None:
        altchars = _bytes_from_decode_data(altchars)
        assert len(altchars) == 2, repr(altchars)
        # Map the alternative alphabet back onto the standard '+' and '/'.
        s = s.translate(bytes.maketrans(altchars, b'+/'))
    # fullmatch() rather than match(): the pattern's `$` also matches just
    # before a trailing b'\n', so match() would let b'....\n' through even
    # though the newline is not a Base64 character.
    if validate and not VALID_BASE64_REGEX.fullmatch(s):
        raise binascii.Error('Non-base64 digit found')
    return binascii.a2b_base64(s)

# Time both implementations on the same valid payload. For each one, print
# the function object followed by the total seconds timeit reports for
# 10,000,000 validating calls.
for b64 in (original_base64, changed_base64):
    print(b64)
    elapsed = timeit.timeit(
        'assert b64(b"UHl0aG9u", validate=True) == b"Python"',
        globals={'b64': b64},
        number=10_000_000,
    )
    print(elapsed)