Make b64decode with validate=True faster by compiling regex by jaysonsantos · Pull Request #11634 · python/cpython
Hi there, I was running some code that had to decode a lot of base64-encoded data, and
I was using validate=True to also make sure the data was valid, but it turned out to be much
slower than validate=False. I saw that it uses a regex without compiling it, so I
thought of sending a PR.
I didn't reference an issue number because I couldn't find a matching issue, and the docs
say that trivial changes don't need one. Let me know if one is needed.
And I also didn't create a test because Lib/test/test_base64.py already covers validate=True.
With the following benchmark (10,000,000 calls per variant), the total time reported by timeit goes from 15.174073122 seconds to 8.906353553999999 seconds.
"""Benchmark: b64decode-style validation with an inline vs. a precompiled regex.

``original_base64`` passes the pattern to ``re.match`` on every call, while
``changed_base64`` uses a module-level compiled pattern. Both are otherwise
identical, so the timing difference isolates the cost of the regex lookup.
"""
import binascii
import re
import timeit
# NOTE: private helper of the stdlib ``base64`` module; it may change between
# Python versions.
from base64 import _bytes_from_decode_data


def original_base64(s, altchars=None, validate=False):
    """Decode base64 data, validating with a pattern given inline on each call.

    Args:
        s: The base64 input; normalised by ``_bytes_from_decode_data``.
        altchars: Optional two-character alternative alphabet replacing
            ``+`` and ``/``.
        validate: If true, reject input that does not match the base64
            pattern before decoding.

    Returns:
        The decoded bytes.

    Raises:
        binascii.Error: If ``validate`` is true and the input does not match.
    """
    s = _bytes_from_decode_data(s)
    if altchars is not None:
        altchars = _bytes_from_decode_data(altchars)
        assert len(altchars) == 2, repr(altchars)
        # Map the alternative alphabet back onto the standard '+' and '/'.
        s = s.translate(bytes.maketrans(altchars, b'+/'))
    # The pattern is handed to re.match on every call: this is the variant
    # the benchmark uses as its baseline.
    if validate and not re.match(b'^[A-Za-z0-9+/]*={0,2}$', s):
        raise binascii.Error('Non-base64 digit found')
    return binascii.a2b_base64(s)


# Compiled once at import time and shared by every call to changed_base64.
VALID_BASE64_REGEX = re.compile(b'^[A-Za-z0-9+/]*={0,2}$')


def changed_base64(s, altchars=None, validate=False):
    """Decode base64 data, validating with the precompiled module-level regex.

    Same contract as ``original_base64``; only the way the validation
    pattern is obtained differs.

    Args:
        s: The base64 input; normalised by ``_bytes_from_decode_data``.
        altchars: Optional two-character alternative alphabet replacing
            ``+`` and ``/``.
        validate: If true, reject input that does not match the base64
            pattern before decoding.

    Returns:
        The decoded bytes.

    Raises:
        binascii.Error: If ``validate`` is true and the input does not match.
    """
    s = _bytes_from_decode_data(s)
    if altchars is not None:
        altchars = _bytes_from_decode_data(altchars)
        assert len(altchars) == 2, repr(altchars)
        # Map the alternative alphabet back onto the standard '+' and '/'.
        s = s.translate(bytes.maketrans(altchars, b'+/'))
    if validate and not VALID_BASE64_REGEX.match(s):
        raise binascii.Error('Non-base64 digit found')
    return binascii.a2b_base64(s)


def run_benchmark(number=10_000_000):
    """Time both decoders on a short valid input and print the results.

    For each decoder, prints the function object followed by the total
    number of seconds ``timeit`` measured for ``number`` calls.
    """
    for b64 in (original_base64, changed_base64):
        print(b64)
        print(timeit.timeit(
            'assert b64(b"UHl0aG9u", validate=True) == b"Python"',
            globals={'b64': b64},
            number=number,
        ))


# Guarded so that importing this module does not start a long benchmark run.
if __name__ == "__main__":
    run_benchmark()