Merge pull request #6182 from str4d/secp256k1-updates
Update the `libsecp256k1` subtree
This commit is contained in:
commit
d41916d26b
|
@ -1195,7 +1195,7 @@ PKGCONFIG_LIBDIR_TEMP="$PKG_CONFIG_LIBDIR"
|
||||||
unset PKG_CONFIG_LIBDIR
|
unset PKG_CONFIG_LIBDIR
|
||||||
PKG_CONFIG_LIBDIR="$PKGCONFIG_LIBDIR_TEMP"
|
PKG_CONFIG_LIBDIR="$PKGCONFIG_LIBDIR_TEMP"
|
||||||
|
|
||||||
ac_configure_args="${ac_configure_args} --disable-shared --with-pic --enable-benchmark=no --with-bignum=no --enable-module-recovery"
|
ac_configure_args="${ac_configure_args} --disable-shared --with-pic --enable-benchmark=no --enable-module-recovery --disable-openssl-tests"
|
||||||
AC_CONFIG_SUBDIRS([src/secp256k1 src/univalue])
|
AC_CONFIG_SUBDIRS([src/secp256k1 src/univalue])
|
||||||
|
|
||||||
AC_OUTPUT
|
AC_OUTPUT
|
||||||
|
|
|
@ -0,0 +1,315 @@
|
||||||
|
env:
|
||||||
|
WIDEMUL: auto
|
||||||
|
STATICPRECOMPUTATION: yes
|
||||||
|
ECMULTGENPRECISION: auto
|
||||||
|
ASM: no
|
||||||
|
BUILD: check
|
||||||
|
WITH_VALGRIND: yes
|
||||||
|
EXTRAFLAGS:
|
||||||
|
HOST:
|
||||||
|
ECDH: no
|
||||||
|
RECOVERY: no
|
||||||
|
SCHNORRSIG: no
|
||||||
|
EXPERIMENTAL: no
|
||||||
|
CTIMETEST: yes
|
||||||
|
BENCH: yes
|
||||||
|
TEST_ITERS:
|
||||||
|
BENCH_ITERS: 2
|
||||||
|
MAKEFLAGS: -j2
|
||||||
|
|
||||||
|
cat_logs_snippet: &CAT_LOGS
|
||||||
|
always:
|
||||||
|
cat_tests_log_script:
|
||||||
|
- cat tests.log || true
|
||||||
|
cat_exhaustive_tests_log_script:
|
||||||
|
- cat exhaustive_tests.log || true
|
||||||
|
cat_valgrind_ctime_test_log_script:
|
||||||
|
- cat valgrind_ctime_test.log || true
|
||||||
|
cat_bench_log_script:
|
||||||
|
- cat bench.log || true
|
||||||
|
on_failure:
|
||||||
|
cat_config_log_script:
|
||||||
|
- cat config.log || true
|
||||||
|
cat_test_env_script:
|
||||||
|
- cat test_env.log || true
|
||||||
|
cat_ci_env_script:
|
||||||
|
- env
|
||||||
|
|
||||||
|
merge_base_script_snippet: &MERGE_BASE
|
||||||
|
merge_base_script:
|
||||||
|
- if [ "$CIRRUS_PR" = "" ]; then exit 0; fi
|
||||||
|
- git fetch $CIRRUS_REPO_CLONE_URL $CIRRUS_BASE_BRANCH
|
||||||
|
- git config --global user.email "ci@ci.ci"
|
||||||
|
- git config --global user.name "ci"
|
||||||
|
- git merge FETCH_HEAD # Merge base to detect silent merge conflicts
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: "x86_64: Linux (Debian stable)"
|
||||||
|
container:
|
||||||
|
dockerfile: ci/linux-debian.Dockerfile
|
||||||
|
# Reduce number of CPUs to be able to do more builds in parallel.
|
||||||
|
cpu: 1
|
||||||
|
# More than enough for our scripts.
|
||||||
|
memory: 1G
|
||||||
|
matrix: &ENV_MATRIX
|
||||||
|
- env: {WIDEMUL: int64, RECOVERY: yes}
|
||||||
|
- env: {WIDEMUL: int64, ECDH: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes}
|
||||||
|
- env: {WIDEMUL: int128}
|
||||||
|
- env: {WIDEMUL: int128, RECOVERY: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes}
|
||||||
|
- env: {WIDEMUL: int128, ECDH: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes}
|
||||||
|
- env: {WIDEMUL: int128, ASM: x86_64}
|
||||||
|
- env: { RECOVERY: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes}
|
||||||
|
- env: { STATICPRECOMPUTATION: no}
|
||||||
|
- env: {BUILD: distcheck, WITH_VALGRIND: no, CTIMETEST: no, BENCH: no}
|
||||||
|
- env: {CPPFLAGS: -DDETERMINISTIC}
|
||||||
|
- env: {CFLAGS: -O0, CTIMETEST: no}
|
||||||
|
- env: { ECMULTGENPRECISION: 2 }
|
||||||
|
- env: { ECMULTGENPRECISION: 8 }
|
||||||
|
matrix:
|
||||||
|
- env:
|
||||||
|
CC: gcc
|
||||||
|
- env:
|
||||||
|
CC: clang
|
||||||
|
<< : *MERGE_BASE
|
||||||
|
test_script:
|
||||||
|
- ./ci/cirrus.sh
|
||||||
|
<< : *CAT_LOGS
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: "i686: Linux (Debian stable)"
|
||||||
|
container:
|
||||||
|
dockerfile: ci/linux-debian.Dockerfile
|
||||||
|
cpu: 1
|
||||||
|
memory: 1G
|
||||||
|
env:
|
||||||
|
HOST: i686-linux-gnu
|
||||||
|
ECDH: yes
|
||||||
|
RECOVERY: yes
|
||||||
|
EXPERIMENTAL: yes
|
||||||
|
SCHNORRSIG: yes
|
||||||
|
matrix:
|
||||||
|
- env:
|
||||||
|
CC: i686-linux-gnu-gcc
|
||||||
|
- env:
|
||||||
|
CC: clang --target=i686-pc-linux-gnu -isystem /usr/i686-linux-gnu/include
|
||||||
|
<< : *MERGE_BASE
|
||||||
|
test_script:
|
||||||
|
- ./ci/cirrus.sh
|
||||||
|
<< : *CAT_LOGS
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: "x86_64: macOS Catalina"
|
||||||
|
macos_instance:
|
||||||
|
image: catalina-base
|
||||||
|
env:
|
||||||
|
HOMEBREW_NO_AUTO_UPDATE: 1
|
||||||
|
HOMEBREW_NO_INSTALL_CLEANUP: 1
|
||||||
|
# Cirrus gives us a fixed number of 12 virtual CPUs. Not that we even have that many jobs at the moment...
|
||||||
|
MAKEFLAGS: -j13
|
||||||
|
matrix:
|
||||||
|
<< : *ENV_MATRIX
|
||||||
|
matrix:
|
||||||
|
- env:
|
||||||
|
CC: gcc-9
|
||||||
|
- env:
|
||||||
|
CC: clang
|
||||||
|
# Update Command Line Tools
|
||||||
|
# Uncomment this if the Command Line Tools on the CirrusCI macOS image are too old to brew valgrind.
|
||||||
|
# See https://apple.stackexchange.com/a/195963 for the implementation.
|
||||||
|
## update_clt_script:
|
||||||
|
## - system_profiler SPSoftwareDataType
|
||||||
|
## - touch /tmp/.com.apple.dt.CommandLineTools.installondemand.in-progress
|
||||||
|
## - |-
|
||||||
|
## PROD=$(softwareupdate -l | grep "*.*Command Line" | tail -n 1 | awk -F"*" '{print $2}' | sed -e 's/^ *//' | sed 's/Label: //g' | tr -d '\n')
|
||||||
|
## # For debugging
|
||||||
|
## - softwareupdate -l && echo "PROD: $PROD"
|
||||||
|
## - softwareupdate -i "$PROD" --verbose
|
||||||
|
## - rm /tmp/.com.apple.dt.CommandLineTools.installondemand.in-progress
|
||||||
|
##
|
||||||
|
brew_valgrind_pre_script:
|
||||||
|
- brew config
|
||||||
|
- brew tap --shallow LouisBrunner/valgrind
|
||||||
|
# Fetch valgrind source but don't build it yet.
|
||||||
|
- brew fetch --HEAD LouisBrunner/valgrind/valgrind
|
||||||
|
brew_valgrind_cache:
|
||||||
|
# This is $(brew --cellar valgrind) but command substition does not work here.
|
||||||
|
folder: /usr/local/Cellar/valgrind
|
||||||
|
# Rebuild cache if ...
|
||||||
|
fingerprint_script:
|
||||||
|
# ... macOS version changes:
|
||||||
|
- sw_vers
|
||||||
|
# ... brew changes:
|
||||||
|
- brew config
|
||||||
|
# ... valgrind changes:
|
||||||
|
- git -C "$(brew --cache)/valgrind--git" rev-parse HEAD
|
||||||
|
populate_script:
|
||||||
|
# If there's no hit in the cache, build and install valgrind.
|
||||||
|
- brew install --HEAD LouisBrunner/valgrind/valgrind
|
||||||
|
brew_valgrind_post_script:
|
||||||
|
# If we have restored valgrind from the cache, tell brew to create symlink to the PATH.
|
||||||
|
# If we haven't restored from cached (and just run brew install), this is a no-op.
|
||||||
|
- brew link valgrind
|
||||||
|
brew_script:
|
||||||
|
- brew install automake libtool gcc@9
|
||||||
|
<< : *MERGE_BASE
|
||||||
|
test_script:
|
||||||
|
- ./ci/cirrus.sh
|
||||||
|
<< : *CAT_LOGS
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: "s390x (big-endian): Linux (Debian stable, QEMU)"
|
||||||
|
container:
|
||||||
|
dockerfile: ci/linux-debian.Dockerfile
|
||||||
|
cpu: 1
|
||||||
|
memory: 1G
|
||||||
|
env:
|
||||||
|
WRAPPER_CMD: qemu-s390x
|
||||||
|
TEST_ITERS: 16
|
||||||
|
HOST: s390x-linux-gnu
|
||||||
|
WITH_VALGRIND: no
|
||||||
|
ECDH: yes
|
||||||
|
RECOVERY: yes
|
||||||
|
EXPERIMENTAL: yes
|
||||||
|
SCHNORRSIG: yes
|
||||||
|
CTIMETEST: no
|
||||||
|
<< : *MERGE_BASE
|
||||||
|
test_script:
|
||||||
|
# https://sourceware.org/bugzilla/show_bug.cgi?id=27008
|
||||||
|
- rm /etc/ld.so.cache
|
||||||
|
- ./ci/cirrus.sh
|
||||||
|
<< : *CAT_LOGS
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: "ARM32: Linux (Debian stable, QEMU)"
|
||||||
|
container:
|
||||||
|
dockerfile: ci/linux-debian.Dockerfile
|
||||||
|
cpu: 1
|
||||||
|
memory: 1G
|
||||||
|
env:
|
||||||
|
WRAPPER_CMD: qemu-arm
|
||||||
|
TEST_ITERS: 16
|
||||||
|
HOST: arm-linux-gnueabihf
|
||||||
|
WITH_VALGRIND: no
|
||||||
|
ECDH: yes
|
||||||
|
RECOVERY: yes
|
||||||
|
EXPERIMENTAL: yes
|
||||||
|
SCHNORRSIG: yes
|
||||||
|
CTIMETEST: no
|
||||||
|
matrix:
|
||||||
|
- env: {}
|
||||||
|
- env: {ASM: arm}
|
||||||
|
<< : *MERGE_BASE
|
||||||
|
test_script:
|
||||||
|
- ./ci/cirrus.sh
|
||||||
|
<< : *CAT_LOGS
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: "ARM64: Linux (Debian stable, QEMU)"
|
||||||
|
container:
|
||||||
|
dockerfile: ci/linux-debian.Dockerfile
|
||||||
|
cpu: 1
|
||||||
|
memory: 1G
|
||||||
|
env:
|
||||||
|
WRAPPER_CMD: qemu-aarch64
|
||||||
|
TEST_ITERS: 16
|
||||||
|
HOST: aarch64-linux-gnu
|
||||||
|
WITH_VALGRIND: no
|
||||||
|
ECDH: yes
|
||||||
|
RECOVERY: yes
|
||||||
|
EXPERIMENTAL: yes
|
||||||
|
SCHNORRSIG: yes
|
||||||
|
CTIMETEST: no
|
||||||
|
<< : *MERGE_BASE
|
||||||
|
test_script:
|
||||||
|
- ./ci/cirrus.sh
|
||||||
|
<< : *CAT_LOGS
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: "ppc64le: Linux (Debian stable, QEMU)"
|
||||||
|
container:
|
||||||
|
dockerfile: ci/linux-debian.Dockerfile
|
||||||
|
cpu: 1
|
||||||
|
memory: 1G
|
||||||
|
env:
|
||||||
|
WRAPPER_CMD: qemu-ppc64le
|
||||||
|
TEST_ITERS: 16
|
||||||
|
HOST: powerpc64le-linux-gnu
|
||||||
|
WITH_VALGRIND: no
|
||||||
|
ECDH: yes
|
||||||
|
RECOVERY: yes
|
||||||
|
EXPERIMENTAL: yes
|
||||||
|
SCHNORRSIG: yes
|
||||||
|
CTIMETEST: no
|
||||||
|
<< : *MERGE_BASE
|
||||||
|
test_script:
|
||||||
|
- ./ci/cirrus.sh
|
||||||
|
<< : *CAT_LOGS
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: "x86_64 (mingw32-w64): Windows (Debian stable, Wine)"
|
||||||
|
container:
|
||||||
|
dockerfile: ci/linux-debian.Dockerfile
|
||||||
|
cpu: 1
|
||||||
|
memory: 1G
|
||||||
|
env:
|
||||||
|
WRAPPER_CMD: wine64-stable
|
||||||
|
TEST_ITERS: 16
|
||||||
|
HOST: x86_64-w64-mingw32
|
||||||
|
WITH_VALGRIND: no
|
||||||
|
ECDH: yes
|
||||||
|
RECOVERY: yes
|
||||||
|
EXPERIMENTAL: yes
|
||||||
|
SCHNORRSIG: yes
|
||||||
|
CTIMETEST: no
|
||||||
|
<< : *MERGE_BASE
|
||||||
|
test_script:
|
||||||
|
- ./ci/cirrus.sh
|
||||||
|
<< : *CAT_LOGS
|
||||||
|
|
||||||
|
# Sanitizers
|
||||||
|
task:
|
||||||
|
container:
|
||||||
|
dockerfile: ci/linux-debian.Dockerfile
|
||||||
|
cpu: 1
|
||||||
|
memory: 1G
|
||||||
|
env:
|
||||||
|
ECDH: yes
|
||||||
|
RECOVERY: yes
|
||||||
|
EXPERIMENTAL: yes
|
||||||
|
SCHNORRSIG: yes
|
||||||
|
CTIMETEST: no
|
||||||
|
EXTRAFLAGS: "--disable-openssl-tests"
|
||||||
|
matrix:
|
||||||
|
- name: "Valgrind (memcheck)"
|
||||||
|
env:
|
||||||
|
# The `--error-exitcode` is required to make the test fail if valgrind found errors, otherwise it'll return 0 (https://www.valgrind.org/docs/manual/manual-core.html)
|
||||||
|
WRAPPER_CMD: "valgrind --error-exitcode=42"
|
||||||
|
TEST_ITERS: 16
|
||||||
|
- name: "UBSan, ASan, LSan"
|
||||||
|
env:
|
||||||
|
CFLAGS: "-fsanitize=undefined,address"
|
||||||
|
CFLAGS_FOR_BUILD: "-fsanitize=undefined,address"
|
||||||
|
UBSAN_OPTIONS: "print_stacktrace=1:halt_on_error=1"
|
||||||
|
ASAN_OPTIONS: "strict_string_checks=1:detect_stack_use_after_return=1:detect_leaks=1"
|
||||||
|
LSAN_OPTIONS: "use_unaligned=1"
|
||||||
|
TEST_ITERS: 32
|
||||||
|
# Try to cover many configurations with just a tiny matrix.
|
||||||
|
matrix:
|
||||||
|
- env:
|
||||||
|
ASM: auto
|
||||||
|
STATICPRECOMPUTATION: yes
|
||||||
|
- env:
|
||||||
|
ASM: no
|
||||||
|
STATICPRECOMPUTATION: no
|
||||||
|
ECMULTGENPRECISION: 2
|
||||||
|
matrix:
|
||||||
|
- env:
|
||||||
|
CC: clang
|
||||||
|
- env:
|
||||||
|
HOST: i686-linux-gnu
|
||||||
|
CC: i686-linux-gnu-gcc
|
||||||
|
<< : *MERGE_BASE
|
||||||
|
test_script:
|
||||||
|
- ./ci/cirrus.sh
|
||||||
|
<< : *CAT_LOGS
|
||||||
|
|
|
@ -33,6 +33,14 @@ libtool
|
||||||
*~
|
*~
|
||||||
*.log
|
*.log
|
||||||
*.trs
|
*.trs
|
||||||
|
|
||||||
|
coverage/
|
||||||
|
coverage.html
|
||||||
|
coverage.*.html
|
||||||
|
*.gcda
|
||||||
|
*.gcno
|
||||||
|
*.gcov
|
||||||
|
|
||||||
src/libsecp256k1-config.h
|
src/libsecp256k1-config.h
|
||||||
src/libsecp256k1-config.h.in
|
src/libsecp256k1-config.h.in
|
||||||
src/ecmult_static_context.h
|
src/ecmult_static_context.h
|
||||||
|
|
|
@ -1,109 +0,0 @@
|
||||||
language: c
|
|
||||||
os:
|
|
||||||
- linux
|
|
||||||
- osx
|
|
||||||
|
|
||||||
dist: bionic
|
|
||||||
# Valgrind currently supports upto macOS 10.13, the latest xcode of that version is 10.1
|
|
||||||
osx_image: xcode10.1
|
|
||||||
addons:
|
|
||||||
apt:
|
|
||||||
packages:
|
|
||||||
- libgmp-dev
|
|
||||||
- valgrind
|
|
||||||
- libtool-bin
|
|
||||||
compiler:
|
|
||||||
- clang
|
|
||||||
- gcc
|
|
||||||
env:
|
|
||||||
global:
|
|
||||||
- WIDEMUL=auto BIGNUM=auto STATICPRECOMPUTATION=yes ECMULTGENPRECISION=auto ASM=no BUILD=check WITH_VALGRIND=yes RUN_VALGRIND=no EXTRAFLAGS= HOST= ECDH=no RECOVERY=no SCHNORRSIG=no EXPERIMENTAL=no CTIMETEST=yes BENCH=yes ITERS=2
|
|
||||||
matrix:
|
|
||||||
- WIDEMUL=int64 RECOVERY=yes
|
|
||||||
- WIDEMUL=int64 ECDH=yes EXPERIMENTAL=yes SCHNORRSIG=yes
|
|
||||||
- WIDEMUL=int128
|
|
||||||
- WIDEMUL=int128 RECOVERY=yes EXPERIMENTAL=yes SCHNORRSIG=yes
|
|
||||||
- WIDEMUL=int128 ECDH=yes EXPERIMENTAL=yes SCHNORRSIG=yes
|
|
||||||
- WIDEMUL=int128 ASM=x86_64
|
|
||||||
- BIGNUM=no
|
|
||||||
- BIGNUM=no RECOVERY=yes EXPERIMENTAL=yes SCHNORRSIG=yes
|
|
||||||
- BIGNUM=no STATICPRECOMPUTATION=no
|
|
||||||
- BUILD=distcheck WITH_VALGRIND=no CTIMETEST=no BENCH=no
|
|
||||||
- CPPFLAGS=-DDETERMINISTIC
|
|
||||||
- CFLAGS=-O0 CTIMETEST=no
|
|
||||||
- CFLAGS="-fsanitize=undefined -fno-omit-frame-pointer" LDFLAGS="-fsanitize=undefined -fno-omit-frame-pointer" UBSAN_OPTIONS="print_stacktrace=1:halt_on_error=1" BIGNUM=no ASM=x86_64 ECDH=yes RECOVERY=yes EXPERIMENTAL=yes SCHNORRSIG=yes CTIMETEST=no
|
|
||||||
- ECMULTGENPRECISION=2
|
|
||||||
- ECMULTGENPRECISION=8
|
|
||||||
- RUN_VALGRIND=yes BIGNUM=no ASM=x86_64 ECDH=yes RECOVERY=yes EXPERIMENTAL=yes SCHNORRSIG=yes EXTRAFLAGS="--disable-openssl-tests" BUILD=
|
|
||||||
matrix:
|
|
||||||
fast_finish: true
|
|
||||||
include:
|
|
||||||
- compiler: clang
|
|
||||||
os: linux
|
|
||||||
env: HOST=i686-linux-gnu
|
|
||||||
addons:
|
|
||||||
apt:
|
|
||||||
packages:
|
|
||||||
- gcc-multilib
|
|
||||||
- libgmp-dev:i386
|
|
||||||
- valgrind
|
|
||||||
- libtool-bin
|
|
||||||
- libc6-dbg:i386
|
|
||||||
- compiler: clang
|
|
||||||
env: HOST=i686-linux-gnu
|
|
||||||
os: linux
|
|
||||||
addons:
|
|
||||||
apt:
|
|
||||||
packages:
|
|
||||||
- gcc-multilib
|
|
||||||
- valgrind
|
|
||||||
- libtool-bin
|
|
||||||
- libc6-dbg:i386
|
|
||||||
- compiler: gcc
|
|
||||||
env: HOST=i686-linux-gnu
|
|
||||||
os: linux
|
|
||||||
addons:
|
|
||||||
apt:
|
|
||||||
packages:
|
|
||||||
- gcc-multilib
|
|
||||||
- valgrind
|
|
||||||
- libtool-bin
|
|
||||||
- libc6-dbg:i386
|
|
||||||
- compiler: gcc
|
|
||||||
os: linux
|
|
||||||
env: HOST=i686-linux-gnu
|
|
||||||
addons:
|
|
||||||
apt:
|
|
||||||
packages:
|
|
||||||
- gcc-multilib
|
|
||||||
- libgmp-dev:i386
|
|
||||||
- valgrind
|
|
||||||
- libtool-bin
|
|
||||||
- libc6-dbg:i386
|
|
||||||
# S390x build (big endian system)
|
|
||||||
- compiler: gcc
|
|
||||||
env: HOST=s390x-unknown-linux-gnu ECDH=yes RECOVERY=yes EXPERIMENTAL=yes SCHNORRSIG=yes CTIMETEST=
|
|
||||||
arch: s390x
|
|
||||||
|
|
||||||
# We use this to install macOS dependencies instead of the built in `homebrew` plugin,
|
|
||||||
# because in xcode earlier than 11 they have a bug requiring updating the system which overall takes ~8 minutes.
|
|
||||||
# https://travis-ci.community/t/macos-build-fails-because-of-homebrew-bundle-unknown-command/7296
|
|
||||||
before_install:
|
|
||||||
- if [ "${TRAVIS_OS_NAME}" = "osx" ]; then HOMEBREW_NO_AUTO_UPDATE=1 brew install gmp valgrind gcc@9; fi
|
|
||||||
|
|
||||||
before_script: ./autogen.sh
|
|
||||||
|
|
||||||
# travis auto terminates jobs that go for 10 minutes without printing to stdout, but travis_wait doesn't work well with forking programs like valgrind (https://docs.travis-ci.com/user/common-build-problems/#build-times-out-because-no-output-was-received https://github.com/bitcoin-core/secp256k1/pull/750#issuecomment-623476860)
|
|
||||||
script:
|
|
||||||
- function keep_alive() { while true; do echo -en "\a"; sleep 60; done }
|
|
||||||
- keep_alive &
|
|
||||||
- ./contrib/travis.sh
|
|
||||||
- kill %keep_alive
|
|
||||||
|
|
||||||
after_script:
|
|
||||||
- cat ./tests.log
|
|
||||||
- cat ./exhaustive_tests.log
|
|
||||||
- cat ./valgrind_ctime_test.log
|
|
||||||
- cat ./bench.log
|
|
||||||
- $CC --version
|
|
||||||
- valgrind --version
|
|
|
@ -14,8 +14,6 @@ noinst_HEADERS += src/scalar_8x32_impl.h
|
||||||
noinst_HEADERS += src/scalar_low_impl.h
|
noinst_HEADERS += src/scalar_low_impl.h
|
||||||
noinst_HEADERS += src/group.h
|
noinst_HEADERS += src/group.h
|
||||||
noinst_HEADERS += src/group_impl.h
|
noinst_HEADERS += src/group_impl.h
|
||||||
noinst_HEADERS += src/num_gmp.h
|
|
||||||
noinst_HEADERS += src/num_gmp_impl.h
|
|
||||||
noinst_HEADERS += src/ecdsa.h
|
noinst_HEADERS += src/ecdsa.h
|
||||||
noinst_HEADERS += src/ecdsa_impl.h
|
noinst_HEADERS += src/ecdsa_impl.h
|
||||||
noinst_HEADERS += src/eckey.h
|
noinst_HEADERS += src/eckey.h
|
||||||
|
@ -26,14 +24,16 @@ noinst_HEADERS += src/ecmult_const.h
|
||||||
noinst_HEADERS += src/ecmult_const_impl.h
|
noinst_HEADERS += src/ecmult_const_impl.h
|
||||||
noinst_HEADERS += src/ecmult_gen.h
|
noinst_HEADERS += src/ecmult_gen.h
|
||||||
noinst_HEADERS += src/ecmult_gen_impl.h
|
noinst_HEADERS += src/ecmult_gen_impl.h
|
||||||
noinst_HEADERS += src/num.h
|
|
||||||
noinst_HEADERS += src/num_impl.h
|
|
||||||
noinst_HEADERS += src/field_10x26.h
|
noinst_HEADERS += src/field_10x26.h
|
||||||
noinst_HEADERS += src/field_10x26_impl.h
|
noinst_HEADERS += src/field_10x26_impl.h
|
||||||
noinst_HEADERS += src/field_5x52.h
|
noinst_HEADERS += src/field_5x52.h
|
||||||
noinst_HEADERS += src/field_5x52_impl.h
|
noinst_HEADERS += src/field_5x52_impl.h
|
||||||
noinst_HEADERS += src/field_5x52_int128_impl.h
|
noinst_HEADERS += src/field_5x52_int128_impl.h
|
||||||
noinst_HEADERS += src/field_5x52_asm_impl.h
|
noinst_HEADERS += src/field_5x52_asm_impl.h
|
||||||
|
noinst_HEADERS += src/modinv32.h
|
||||||
|
noinst_HEADERS += src/modinv32_impl.h
|
||||||
|
noinst_HEADERS += src/modinv64.h
|
||||||
|
noinst_HEADERS += src/modinv64_impl.h
|
||||||
noinst_HEADERS += src/assumptions.h
|
noinst_HEADERS += src/assumptions.h
|
||||||
noinst_HEADERS += src/util.h
|
noinst_HEADERS += src/util.h
|
||||||
noinst_HEADERS += src/scratch.h
|
noinst_HEADERS += src/scratch.h
|
||||||
|
@ -68,7 +68,7 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
libsecp256k1_la_SOURCES = src/secp256k1.c
|
libsecp256k1_la_SOURCES = src/secp256k1.c
|
||||||
libsecp256k1_la_CPPFLAGS = -DSECP256K1_BUILD -I$(top_srcdir)/include -I$(top_srcdir)/src $(SECP_INCLUDES)
|
libsecp256k1_la_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/src $(SECP_INCLUDES)
|
||||||
libsecp256k1_la_LIBADD = $(SECP_LIBS) $(COMMON_LIB)
|
libsecp256k1_la_LIBADD = $(SECP_LIBS) $(COMMON_LIB)
|
||||||
|
|
||||||
if VALGRIND_ENABLED
|
if VALGRIND_ENABLED
|
||||||
|
@ -81,27 +81,27 @@ noinst_PROGRAMS += bench_verify bench_sign bench_internal bench_ecmult
|
||||||
bench_verify_SOURCES = src/bench_verify.c
|
bench_verify_SOURCES = src/bench_verify.c
|
||||||
bench_verify_LDADD = libsecp256k1.la $(SECP_LIBS) $(SECP_TEST_LIBS) $(COMMON_LIB)
|
bench_verify_LDADD = libsecp256k1.la $(SECP_LIBS) $(SECP_TEST_LIBS) $(COMMON_LIB)
|
||||||
# SECP_TEST_INCLUDES are only used here for CRYPTO_CPPFLAGS
|
# SECP_TEST_INCLUDES are only used here for CRYPTO_CPPFLAGS
|
||||||
bench_verify_CPPFLAGS = -DSECP256K1_BUILD $(SECP_TEST_INCLUDES)
|
bench_verify_CPPFLAGS = $(SECP_TEST_INCLUDES)
|
||||||
bench_sign_SOURCES = src/bench_sign.c
|
bench_sign_SOURCES = src/bench_sign.c
|
||||||
bench_sign_LDADD = libsecp256k1.la $(SECP_LIBS) $(SECP_TEST_LIBS) $(COMMON_LIB)
|
bench_sign_LDADD = libsecp256k1.la $(SECP_LIBS) $(SECP_TEST_LIBS) $(COMMON_LIB)
|
||||||
bench_internal_SOURCES = src/bench_internal.c
|
bench_internal_SOURCES = src/bench_internal.c
|
||||||
bench_internal_LDADD = $(SECP_LIBS) $(COMMON_LIB)
|
bench_internal_LDADD = $(SECP_LIBS) $(COMMON_LIB)
|
||||||
bench_internal_CPPFLAGS = -DSECP256K1_BUILD $(SECP_INCLUDES)
|
bench_internal_CPPFLAGS = $(SECP_INCLUDES)
|
||||||
bench_ecmult_SOURCES = src/bench_ecmult.c
|
bench_ecmult_SOURCES = src/bench_ecmult.c
|
||||||
bench_ecmult_LDADD = $(SECP_LIBS) $(COMMON_LIB)
|
bench_ecmult_LDADD = $(SECP_LIBS) $(COMMON_LIB)
|
||||||
bench_ecmult_CPPFLAGS = -DSECP256K1_BUILD $(SECP_INCLUDES)
|
bench_ecmult_CPPFLAGS = $(SECP_INCLUDES)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
TESTS =
|
TESTS =
|
||||||
if USE_TESTS
|
if USE_TESTS
|
||||||
noinst_PROGRAMS += tests
|
noinst_PROGRAMS += tests
|
||||||
tests_SOURCES = src/tests.c
|
tests_SOURCES = src/tests.c
|
||||||
tests_CPPFLAGS = -DSECP256K1_BUILD -I$(top_srcdir)/src -I$(top_srcdir)/include $(SECP_INCLUDES) $(SECP_TEST_INCLUDES)
|
tests_CPPFLAGS = -I$(top_srcdir)/src -I$(top_srcdir)/include $(SECP_INCLUDES) $(SECP_TEST_INCLUDES)
|
||||||
if VALGRIND_ENABLED
|
if VALGRIND_ENABLED
|
||||||
tests_CPPFLAGS += -DVALGRIND
|
tests_CPPFLAGS += -DVALGRIND
|
||||||
noinst_PROGRAMS += valgrind_ctime_test
|
noinst_PROGRAMS += valgrind_ctime_test
|
||||||
valgrind_ctime_test_SOURCES = src/valgrind_ctime_test.c
|
valgrind_ctime_test_SOURCES = src/valgrind_ctime_test.c
|
||||||
valgrind_ctime_test_LDADD = libsecp256k1.la $(SECP_LIBS) $(SECP_LIBS) $(COMMON_LIB)
|
valgrind_ctime_test_LDADD = libsecp256k1.la $(SECP_LIBS) $(COMMON_LIB)
|
||||||
endif
|
endif
|
||||||
if !ENABLE_COVERAGE
|
if !ENABLE_COVERAGE
|
||||||
tests_CPPFLAGS += -DVERIFY
|
tests_CPPFLAGS += -DVERIFY
|
||||||
|
@ -114,7 +114,7 @@ endif
|
||||||
if USE_EXHAUSTIVE_TESTS
|
if USE_EXHAUSTIVE_TESTS
|
||||||
noinst_PROGRAMS += exhaustive_tests
|
noinst_PROGRAMS += exhaustive_tests
|
||||||
exhaustive_tests_SOURCES = src/tests_exhaustive.c
|
exhaustive_tests_SOURCES = src/tests_exhaustive.c
|
||||||
exhaustive_tests_CPPFLAGS = -DSECP256K1_BUILD -I$(top_srcdir)/src $(SECP_INCLUDES)
|
exhaustive_tests_CPPFLAGS = -I$(top_srcdir)/src $(SECP_INCLUDES)
|
||||||
if !ENABLE_COVERAGE
|
if !ENABLE_COVERAGE
|
||||||
exhaustive_tests_CPPFLAGS += -DVERIFY
|
exhaustive_tests_CPPFLAGS += -DVERIFY
|
||||||
endif
|
endif
|
||||||
|
@ -129,7 +129,7 @@ CPPFLAGS_FOR_BUILD +=-I$(top_srcdir) -I$(builddir)/src
|
||||||
gen_context_OBJECTS = gen_context.o
|
gen_context_OBJECTS = gen_context.o
|
||||||
gen_context_BIN = gen_context$(BUILD_EXEEXT)
|
gen_context_BIN = gen_context$(BUILD_EXEEXT)
|
||||||
gen_%.o: src/gen_%.c src/libsecp256k1-config.h
|
gen_%.o: src/gen_%.c src/libsecp256k1-config.h
|
||||||
$(CC_FOR_BUILD) $(CPPFLAGS_FOR_BUILD) $(CFLAGS_FOR_BUILD) -c $< -o $@
|
$(CC_FOR_BUILD) $(DEFS) $(CPPFLAGS_FOR_BUILD) $(CFLAGS_FOR_BUILD) -c $< -o $@
|
||||||
|
|
||||||
$(gen_context_BIN): $(gen_context_OBJECTS)
|
$(gen_context_BIN): $(gen_context_OBJECTS)
|
||||||
$(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) $^ -o $@
|
$(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) $^ -o $@
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
libsecp256k1
|
libsecp256k1
|
||||||
============
|
============
|
||||||
|
|
||||||
[![Build Status](https://travis-ci.org/bitcoin-core/secp256k1.svg?branch=master)](https://travis-ci.org/bitcoin-core/secp256k1)
|
[![Build Status](https://api.cirrus-ci.com/github/bitcoin-core/secp256k1.svg?branch=master)](https://cirrus-ci.com/github/bitcoin-core/secp256k1)
|
||||||
|
|
||||||
Optimized C library for ECDSA signatures and secret/public key operations on curve secp256k1.
|
Optimized C library for ECDSA signatures and secret/public key operations on curve secp256k1.
|
||||||
|
|
||||||
|
@ -34,11 +34,11 @@ Implementation details
|
||||||
* Optimized implementation of arithmetic modulo the curve's field size (2^256 - 0x1000003D1).
|
* Optimized implementation of arithmetic modulo the curve's field size (2^256 - 0x1000003D1).
|
||||||
* Using 5 52-bit limbs (including hand-optimized assembly for x86_64, by Diederik Huys).
|
* Using 5 52-bit limbs (including hand-optimized assembly for x86_64, by Diederik Huys).
|
||||||
* Using 10 26-bit limbs (including hand-optimized assembly for 32-bit ARM, by Wladimir J. van der Laan).
|
* Using 10 26-bit limbs (including hand-optimized assembly for 32-bit ARM, by Wladimir J. van der Laan).
|
||||||
* Field inverses and square roots using a sliding window over blocks of 1s (by Peter Dettman).
|
|
||||||
* Scalar operations
|
* Scalar operations
|
||||||
* Optimized implementation without data-dependent branches of arithmetic modulo the curve's order.
|
* Optimized implementation without data-dependent branches of arithmetic modulo the curve's order.
|
||||||
* Using 4 64-bit limbs (relying on __int128 support in the compiler).
|
* Using 4 64-bit limbs (relying on __int128 support in the compiler).
|
||||||
* Using 8 32-bit limbs.
|
* Using 8 32-bit limbs.
|
||||||
|
* Modular inverses (both field elements and scalars) based on [safegcd](https://gcd.cr.yp.to/index.html) with some modifications, and a variable-time variant (by Peter Dettman).
|
||||||
* Group operations
|
* Group operations
|
||||||
* Point addition formula specifically simplified for the curve equation (y^2 = x^3 + 7).
|
* Point addition formula specifically simplified for the curve equation (y^2 = x^3 + 7).
|
||||||
* Use addition between points in Jacobian and affine coordinates where possible.
|
* Use addition between points in Jacobian and affine coordinates where possible.
|
||||||
|
@ -96,7 +96,8 @@ To create a report, `gcovr` is recommended, as it includes branch coverage repor
|
||||||
|
|
||||||
To create a HTML report with coloured and annotated source code:
|
To create a HTML report with coloured and annotated source code:
|
||||||
|
|
||||||
$ gcovr --exclude 'src/bench*' --html --html-details -o coverage.html
|
$ mkdir -p coverage
|
||||||
|
$ gcovr --exclude 'src/bench*' --html --html-details -o coverage/coverage.html
|
||||||
|
|
||||||
Reporting a vulnerability
|
Reporting a vulnerability
|
||||||
------------
|
------------
|
||||||
|
|
|
@ -75,19 +75,6 @@ if test x"$has_libcrypto" = x"yes" && test x"$has_openssl_ec" = x; then
|
||||||
fi
|
fi
|
||||||
])
|
])
|
||||||
|
|
||||||
dnl
|
|
||||||
AC_DEFUN([SECP_GMP_CHECK],[
|
|
||||||
if test x"$has_gmp" != x"yes"; then
|
|
||||||
CPPFLAGS_TEMP="$CPPFLAGS"
|
|
||||||
CPPFLAGS="$GMP_CPPFLAGS $CPPFLAGS"
|
|
||||||
LIBS_TEMP="$LIBS"
|
|
||||||
LIBS="$GMP_LIBS $LIBS"
|
|
||||||
AC_CHECK_HEADER(gmp.h,[AC_CHECK_LIB(gmp, __gmpz_init,[has_gmp=yes; GMP_LIBS="$GMP_LIBS -lgmp"; AC_DEFINE(HAVE_LIBGMP,1,[Define this symbol if libgmp is installed])])])
|
|
||||||
CPPFLAGS="$CPPFLAGS_TEMP"
|
|
||||||
LIBS="$LIBS_TEMP"
|
|
||||||
fi
|
|
||||||
])
|
|
||||||
|
|
||||||
AC_DEFUN([SECP_VALGRIND_CHECK],[
|
AC_DEFUN([SECP_VALGRIND_CHECK],[
|
||||||
if test x"$has_valgrind" != x"yes"; then
|
if test x"$has_valgrind" != x"yes"; then
|
||||||
CPPFLAGS_TEMP="$CPPFLAGS"
|
CPPFLAGS_TEMP="$CPPFLAGS"
|
||||||
|
|
|
@ -3,46 +3,49 @@
|
||||||
set -e
|
set -e
|
||||||
set -x
|
set -x
|
||||||
|
|
||||||
if [ "$HOST" = "i686-linux-gnu" ]
|
export LC_ALL=C
|
||||||
then
|
|
||||||
export CC="$CC -m32"
|
env >> test_env.log
|
||||||
fi
|
|
||||||
if [ "$TRAVIS_OS_NAME" = "osx" ] && [ "$TRAVIS_COMPILER" = "gcc" ]
|
$CC -v || true
|
||||||
then
|
valgrind --version || true
|
||||||
export CC="gcc-9"
|
|
||||||
fi
|
./autogen.sh
|
||||||
|
|
||||||
./configure \
|
./configure \
|
||||||
--enable-experimental="$EXPERIMENTAL" \
|
--enable-experimental="$EXPERIMENTAL" \
|
||||||
--with-test-override-wide-multiply="$WIDEMUL" --with-bignum="$BIGNUM" --with-asm="$ASM" \
|
--with-test-override-wide-multiply="$WIDEMUL" --with-asm="$ASM" \
|
||||||
--enable-ecmult-static-precomputation="$STATICPRECOMPUTATION" --with-ecmult-gen-precision="$ECMULTGENPRECISION" \
|
--enable-ecmult-static-precomputation="$STATICPRECOMPUTATION" --with-ecmult-gen-precision="$ECMULTGENPRECISION" \
|
||||||
--enable-module-ecdh="$ECDH" --enable-module-recovery="$RECOVERY" \
|
--enable-module-ecdh="$ECDH" --enable-module-recovery="$RECOVERY" \
|
||||||
--enable-module-schnorrsig="$SCHNORRSIG" \
|
--enable-module-schnorrsig="$SCHNORRSIG" \
|
||||||
--with-valgrind="$WITH_VALGRIND" \
|
--with-valgrind="$WITH_VALGRIND" \
|
||||||
--host="$HOST" $EXTRAFLAGS
|
--host="$HOST" $EXTRAFLAGS
|
||||||
|
|
||||||
if [ -n "$BUILD" ]
|
# We have set "-j<n>" in MAKEFLAGS.
|
||||||
then
|
make
|
||||||
make -j2 "$BUILD"
|
|
||||||
fi
|
# Print information about binaries so that we can see that the architecture is correct
|
||||||
if [ "$RUN_VALGRIND" = "yes" ]
|
file *tests* || true
|
||||||
then
|
file bench_* || true
|
||||||
make -j2
|
file .libs/* || true
|
||||||
# the `--error-exitcode` is required to make the test fail if valgrind found errors, otherwise it'll return 0 (https://www.valgrind.org/docs/manual/manual-core.html)
|
|
||||||
valgrind --error-exitcode=42 ./tests 16
|
# This tells `make check` to wrap test invocations.
|
||||||
valgrind --error-exitcode=42 ./exhaustive_tests
|
export LOG_COMPILER="$WRAPPER_CMD"
|
||||||
fi
|
|
||||||
|
# This limits the iterations in the tests and benchmarks.
|
||||||
|
export SECP256K1_TEST_ITERS="$TEST_ITERS"
|
||||||
|
export SECP256K1_BENCH_ITERS="$BENCH_ITERS"
|
||||||
|
|
||||||
|
make "$BUILD"
|
||||||
|
|
||||||
if [ "$BENCH" = "yes" ]
|
if [ "$BENCH" = "yes" ]
|
||||||
then
|
then
|
||||||
if [ "$RUN_VALGRIND" = "yes" ]
|
# Using the local `libtool` because on macOS the system's libtool has nothing to do with GNU libtool
|
||||||
|
EXEC='./libtool --mode=execute'
|
||||||
|
if [ -n "$WRAPPER_CMD" ]
|
||||||
then
|
then
|
||||||
# Using the local `libtool` because on macOS the system's libtool has nothing to do with GNU libtool
|
EXEC="$EXEC $WRAPPER_CMD"
|
||||||
EXEC='./libtool --mode=execute valgrind --error-exitcode=42'
|
|
||||||
else
|
|
||||||
EXEC=
|
|
||||||
fi
|
fi
|
||||||
# This limits the iterations in the benchmarks below to ITER(set in .travis.yml) iterations.
|
|
||||||
export SECP256K1_BENCH_ITERS="$ITERS"
|
|
||||||
{
|
{
|
||||||
$EXEC ./bench_ecmult
|
$EXEC ./bench_ecmult
|
||||||
$EXEC ./bench_internal
|
$EXEC ./bench_internal
|
|
@ -0,0 +1,24 @@
|
||||||
|
FROM debian:stable
|
||||||
|
|
||||||
|
RUN dpkg --add-architecture i386
|
||||||
|
RUN dpkg --add-architecture s390x
|
||||||
|
RUN dpkg --add-architecture armhf
|
||||||
|
RUN dpkg --add-architecture arm64
|
||||||
|
RUN dpkg --add-architecture ppc64el
|
||||||
|
RUN apt-get update
|
||||||
|
|
||||||
|
# dkpg-dev: to make pkg-config work in cross-builds
|
||||||
|
# llvm: for llvm-symbolizer, which is used by clang's UBSan for symbolized stack traces
|
||||||
|
RUN apt-get install --no-install-recommends --no-upgrade -y \
|
||||||
|
git ca-certificates \
|
||||||
|
make automake libtool pkg-config dpkg-dev valgrind qemu-user \
|
||||||
|
gcc clang llvm libc6-dbg \
|
||||||
|
gcc-i686-linux-gnu libc6-dev-i386-cross libc6-dbg:i386 libubsan1:i386 libasan5:i386 \
|
||||||
|
gcc-s390x-linux-gnu libc6-dev-s390x-cross libc6-dbg:s390x \
|
||||||
|
gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6-dbg:armhf \
|
||||||
|
gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6-dbg:arm64 \
|
||||||
|
gcc-powerpc64le-linux-gnu libc6-dev-ppc64el-cross libc6-dbg:ppc64el \
|
||||||
|
wine gcc-mingw-w64-x86-64
|
||||||
|
|
||||||
|
# Run a dummy command in wine to make it set up configuration
|
||||||
|
RUN wine64-stable xcopy || true
|
|
@ -23,7 +23,15 @@ AC_PATH_TOOL(AR, ar)
|
||||||
AC_PATH_TOOL(RANLIB, ranlib)
|
AC_PATH_TOOL(RANLIB, ranlib)
|
||||||
AC_PATH_TOOL(STRIP, strip)
|
AC_PATH_TOOL(STRIP, strip)
|
||||||
|
|
||||||
|
# Save definition of AC_PROG_CC because AM_PROG_CC_C_O in automake<=1.13 will
|
||||||
|
# redefine AC_PROG_CC to exit with an error, which avoids the user calling it
|
||||||
|
# accidently and screwing up the effect of AM_PROG_CC_C_O. However, we'll need
|
||||||
|
# AC_PROG_CC later on in AX_PROG_CC_FOR_BUILD, where its usage is fine, and
|
||||||
|
# we'll carefully make sure not to call AC_PROG_CC anywhere else.
|
||||||
|
m4_copy([AC_PROG_CC], [saved_AC_PROG_CC])
|
||||||
AM_PROG_CC_C_O
|
AM_PROG_CC_C_O
|
||||||
|
# Restore AC_PROG_CC
|
||||||
|
m4_rename_force([saved_AC_PROG_CC], [AC_PROG_CC])
|
||||||
|
|
||||||
AC_PROG_CC_C99
|
AC_PROG_CC_C99
|
||||||
if test x"$ac_cv_prog_cc_c99" = x"no"; then
|
if test x"$ac_cv_prog_cc_c99" = x"no"; then
|
||||||
|
@ -40,17 +48,12 @@ case $host_os in
|
||||||
# in expected paths because they may conflict with system files. Ask
|
# in expected paths because they may conflict with system files. Ask
|
||||||
# Homebrew where each one is located, then adjust paths accordingly.
|
# Homebrew where each one is located, then adjust paths accordingly.
|
||||||
openssl_prefix=`$BREW --prefix openssl 2>/dev/null`
|
openssl_prefix=`$BREW --prefix openssl 2>/dev/null`
|
||||||
gmp_prefix=`$BREW --prefix gmp 2>/dev/null`
|
|
||||||
valgrind_prefix=`$BREW --prefix valgrind 2>/dev/null`
|
valgrind_prefix=`$BREW --prefix valgrind 2>/dev/null`
|
||||||
if test x$openssl_prefix != x; then
|
if test x$openssl_prefix != x; then
|
||||||
PKG_CONFIG_PATH="$openssl_prefix/lib/pkgconfig:$PKG_CONFIG_PATH"
|
PKG_CONFIG_PATH="$openssl_prefix/lib/pkgconfig:$PKG_CONFIG_PATH"
|
||||||
export PKG_CONFIG_PATH
|
export PKG_CONFIG_PATH
|
||||||
CRYPTO_CPPFLAGS="-I$openssl_prefix/include"
|
CRYPTO_CPPFLAGS="-I$openssl_prefix/include"
|
||||||
fi
|
fi
|
||||||
if test x$gmp_prefix != x; then
|
|
||||||
GMP_CPPFLAGS="-I$gmp_prefix/include"
|
|
||||||
GMP_LIBS="-L$gmp_prefix/lib"
|
|
||||||
fi
|
|
||||||
if test x$valgrind_prefix != x; then
|
if test x$valgrind_prefix != x; then
|
||||||
VALGRIND_CPPFLAGS="-I$valgrind_prefix/include"
|
VALGRIND_CPPFLAGS="-I$valgrind_prefix/include"
|
||||||
fi
|
fi
|
||||||
|
@ -79,6 +82,15 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([[char foo;]])],
|
||||||
CFLAGS="$saved_CFLAGS"
|
CFLAGS="$saved_CFLAGS"
|
||||||
])
|
])
|
||||||
|
|
||||||
|
saved_CFLAGS="$CFLAGS"
|
||||||
|
CFLAGS="-Wconditional-uninitialized $CFLAGS"
|
||||||
|
AC_MSG_CHECKING([if ${CC} supports -Wconditional-uninitialized])
|
||||||
|
AC_COMPILE_IFELSE([AC_LANG_SOURCE([[char foo;]])],
|
||||||
|
[ AC_MSG_RESULT([yes]) ],
|
||||||
|
[ AC_MSG_RESULT([no])
|
||||||
|
CFLAGS="$saved_CFLAGS"
|
||||||
|
])
|
||||||
|
|
||||||
saved_CFLAGS="$CFLAGS"
|
saved_CFLAGS="$CFLAGS"
|
||||||
CFLAGS="-fvisibility=hidden $CFLAGS"
|
CFLAGS="-fvisibility=hidden $CFLAGS"
|
||||||
AC_MSG_CHECKING([if ${CC} supports -fvisibility=hidden])
|
AC_MSG_CHECKING([if ${CC} supports -fvisibility=hidden])
|
||||||
|
@ -156,9 +168,6 @@ AC_ARG_ENABLE(external_default_callbacks,
|
||||||
# Legal values are int64 (for [u]int64_t), int128 (for [unsigned] __int128), and auto (the default).
|
# Legal values are int64 (for [u]int64_t), int128 (for [unsigned] __int128), and auto (the default).
|
||||||
AC_ARG_WITH([test-override-wide-multiply], [] ,[set_widemul=$withval], [set_widemul=auto])
|
AC_ARG_WITH([test-override-wide-multiply], [] ,[set_widemul=$withval], [set_widemul=auto])
|
||||||
|
|
||||||
AC_ARG_WITH([bignum], [AS_HELP_STRING([--with-bignum=gmp|no|auto],
|
|
||||||
[bignum implementation to use [default=auto]])],[req_bignum=$withval], [req_bignum=auto])
|
|
||||||
|
|
||||||
AC_ARG_WITH([asm], [AS_HELP_STRING([--with-asm=x86_64|arm|no|auto],
|
AC_ARG_WITH([asm], [AS_HELP_STRING([--with-asm=x86_64|arm|no|auto],
|
||||||
[assembly optimizations to use (experimental: arm) [default=auto]])],[req_asm=$withval], [req_asm=auto])
|
[assembly optimizations to use (experimental: arm) [default=auto]])],[req_asm=$withval], [req_asm=auto])
|
||||||
|
|
||||||
|
@ -237,32 +246,6 @@ else
|
||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if test x"$req_bignum" = x"auto"; then
|
|
||||||
SECP_GMP_CHECK
|
|
||||||
if test x"$has_gmp" = x"yes"; then
|
|
||||||
set_bignum=gmp
|
|
||||||
fi
|
|
||||||
|
|
||||||
if test x"$set_bignum" = x; then
|
|
||||||
set_bignum=no
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
set_bignum=$req_bignum
|
|
||||||
case $set_bignum in
|
|
||||||
gmp)
|
|
||||||
SECP_GMP_CHECK
|
|
||||||
if test x"$has_gmp" != x"yes"; then
|
|
||||||
AC_MSG_ERROR([gmp bignum explicitly requested but libgmp not available])
|
|
||||||
fi
|
|
||||||
;;
|
|
||||||
no)
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
AC_MSG_ERROR([invalid bignum implementation selection])
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Select assembly optimization
|
# Select assembly optimization
|
||||||
use_external_asm=no
|
use_external_asm=no
|
||||||
|
|
||||||
|
@ -300,24 +283,6 @@ auto)
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
# Select bignum implementation
|
|
||||||
case $set_bignum in
|
|
||||||
gmp)
|
|
||||||
AC_DEFINE(HAVE_LIBGMP, 1, [Define this symbol if libgmp is installed])
|
|
||||||
AC_DEFINE(USE_NUM_GMP, 1, [Define this symbol to use the gmp implementation for num])
|
|
||||||
AC_DEFINE(USE_FIELD_INV_NUM, 1, [Define this symbol to use the num-based field inverse implementation])
|
|
||||||
AC_DEFINE(USE_SCALAR_INV_NUM, 1, [Define this symbol to use the num-based scalar inverse implementation])
|
|
||||||
;;
|
|
||||||
no)
|
|
||||||
AC_DEFINE(USE_NUM_NONE, 1, [Define this symbol to use no num implementation])
|
|
||||||
AC_DEFINE(USE_FIELD_INV_BUILTIN, 1, [Define this symbol to use the native field inverse implementation])
|
|
||||||
AC_DEFINE(USE_SCALAR_INV_BUILTIN, 1, [Define this symbol to use the native scalar inverse implementation])
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
AC_MSG_ERROR([invalid bignum implementation])
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
# Set ecmult window size
|
# Set ecmult window size
|
||||||
if test x"$req_ecmult_window" = x"auto"; then
|
if test x"$req_ecmult_window" = x"auto"; then
|
||||||
set_ecmult_window=15
|
set_ecmult_window=15
|
||||||
|
@ -382,11 +347,6 @@ else
|
||||||
enable_openssl_tests=no
|
enable_openssl_tests=no
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if test x"$set_bignum" = x"gmp"; then
|
|
||||||
SECP_LIBS="$SECP_LIBS $GMP_LIBS"
|
|
||||||
SECP_INCLUDES="$SECP_INCLUDES $GMP_CPPFLAGS"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if test x"$enable_valgrind" = x"yes"; then
|
if test x"$enable_valgrind" = x"yes"; then
|
||||||
SECP_INCLUDES="$SECP_INCLUDES $VALGRIND_CPPFLAGS"
|
SECP_INCLUDES="$SECP_INCLUDES $VALGRIND_CPPFLAGS"
|
||||||
fi
|
fi
|
||||||
|
@ -563,7 +523,6 @@ echo " module extrakeys = $enable_module_extrakeys"
|
||||||
echo " module schnorrsig = $enable_module_schnorrsig"
|
echo " module schnorrsig = $enable_module_schnorrsig"
|
||||||
echo
|
echo
|
||||||
echo " asm = $set_asm"
|
echo " asm = $set_asm"
|
||||||
echo " bignum = $set_bignum"
|
|
||||||
echo " ecmult window size = $set_ecmult_window"
|
echo " ecmult window size = $set_ecmult_window"
|
||||||
echo " ecmult gen prec. bits = $set_ecmult_gen_precision"
|
echo " ecmult gen prec. bits = $set_ecmult_gen_precision"
|
||||||
# Hide test-only options unless they're used.
|
# Hide test-only options unless they're used.
|
||||||
|
|
|
@ -5,7 +5,6 @@
|
||||||
***********************************************************************/
|
***********************************************************************/
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <secp256k1.h>
|
|
||||||
|
|
||||||
#include "lax_der_parsing.h"
|
#include "lax_der_parsing.h"
|
||||||
|
|
||||||
|
|
|
@ -51,7 +51,13 @@
|
||||||
#ifndef SECP256K1_CONTRIB_LAX_DER_PARSING_H
|
#ifndef SECP256K1_CONTRIB_LAX_DER_PARSING_H
|
||||||
#define SECP256K1_CONTRIB_LAX_DER_PARSING_H
|
#define SECP256K1_CONTRIB_LAX_DER_PARSING_H
|
||||||
|
|
||||||
|
/* #include secp256k1.h only when it hasn't been included yet.
|
||||||
|
This enables this file to be #included directly in other project
|
||||||
|
files (such as tests.c) without the need to set an explicit -I flag,
|
||||||
|
which would be necessary to locate secp256k1.h. */
|
||||||
|
#ifndef SECP256K1_H
|
||||||
#include <secp256k1.h>
|
#include <secp256k1.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
|
@ -5,7 +5,6 @@
|
||||||
***********************************************************************/
|
***********************************************************************/
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <secp256k1.h>
|
|
||||||
|
|
||||||
#include "lax_der_privatekey_parsing.h"
|
#include "lax_der_privatekey_parsing.h"
|
||||||
|
|
||||||
|
|
|
@ -28,7 +28,13 @@
|
||||||
#ifndef SECP256K1_CONTRIB_BER_PRIVATEKEY_H
|
#ifndef SECP256K1_CONTRIB_BER_PRIVATEKEY_H
|
||||||
#define SECP256K1_CONTRIB_BER_PRIVATEKEY_H
|
#define SECP256K1_CONTRIB_BER_PRIVATEKEY_H
|
||||||
|
|
||||||
|
/* #include secp256k1.h only when it hasn't been included yet.
|
||||||
|
This enables this file to be #included directly in other project
|
||||||
|
files (such as tests.c) without the need to set an explicit -I flag,
|
||||||
|
which would be necessary to locate secp256k1.h. */
|
||||||
|
#ifndef SECP256K1_H
|
||||||
#include <secp256k1.h>
|
#include <secp256k1.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
|
@ -0,0 +1,765 @@
|
||||||
|
# The safegcd implementation in libsecp256k1 explained
|
||||||
|
|
||||||
|
This document explains the modular inverse implementation in the `src/modinv*.h` files. It is based
|
||||||
|
on the paper
|
||||||
|
["Fast constant-time gcd computation and modular inversion"](https://gcd.cr.yp.to/papers.html#safegcd)
|
||||||
|
by Daniel J. Bernstein and Bo-Yin Yang. The references below are for the Date: 2019.04.13 version.
|
||||||
|
|
||||||
|
The actual implementation is in C of course, but for demonstration purposes Python3 is used here.
|
||||||
|
Most implementation aspects and optimizations are explained, except those that depend on the specific
|
||||||
|
number representation used in the C code.
|
||||||
|
|
||||||
|
## 1. Computing the Greatest Common Divisor (GCD) using divsteps
|
||||||
|
|
||||||
|
The algorithm from the paper (section 11), at a very high level, is this:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def gcd(f, g):
|
||||||
|
"""Compute the GCD of an odd integer f and another integer g."""
|
||||||
|
assert f & 1 # require f to be odd
|
||||||
|
delta = 1 # additional state variable
|
||||||
|
while g != 0:
|
||||||
|
assert f & 1 # f will be odd in every iteration
|
||||||
|
if delta > 0 and g & 1:
|
||||||
|
delta, f, g = 1 - delta, g, (g - f) // 2
|
||||||
|
elif g & 1:
|
||||||
|
delta, f, g = 1 + delta, f, (g + f) // 2
|
||||||
|
else:
|
||||||
|
delta, f, g = 1 + delta, f, (g ) // 2
|
||||||
|
return abs(f)
|
||||||
|
```
|
||||||
|
|
||||||
|
It computes the greatest common divisor of an odd integer *f* and any integer *g*. Its inner loop
|
||||||
|
keeps rewriting the variables *f* and *g* alongside a state variable *δ* that starts at *1*, until
|
||||||
|
*g=0* is reached. At that point, *|f|* gives the GCD. Each of the transitions in the loop is called a
|
||||||
|
"division step" (referred to as divstep in what follows).
|
||||||
|
|
||||||
|
For example, *gcd(21, 14)* would be computed as:
|
||||||
|
- Start with *δ=1 f=21 g=14*
|
||||||
|
- Take the third branch: *δ=2 f=21 g=7*
|
||||||
|
- Take the first branch: *δ=-1 f=7 g=-7*
|
||||||
|
- Take the second branch: *δ=0 f=7 g=0*
|
||||||
|
- The answer *|f| = 7*.
|
||||||
|
|
||||||
|
Why it works:
|
||||||
|
- Divsteps can be decomposed into two steps (see paragraph 8.2 in the paper):
|
||||||
|
- (a) If *g* is odd, replace *(f,g)* with *(g,g-f)* or *(f,g+f)*, resulting in an even *g*.
|
||||||
|
- (b) Replace *(f,g)* with *(f,g/2)* (where *g* is guaranteed to be even).
|
||||||
|
- Neither of those two operations change the GCD:
|
||||||
|
- For (a), assume *gcd(f,g)=c*, then it must be the case that *f=a c* and *g=b c* for some integers *a*
|
||||||
|
and *b*. As *(g,g-f)=(b c,(b-a)c)* and *(f,f+g)=(a c,(a+b)c)*, the result clearly still has
|
||||||
|
common factor *c*. Reasoning in the other direction shows that no common factor can be added by
|
||||||
|
doing so either.
|
||||||
|
- For (b), we know that *f* is odd, so *gcd(f,g)* clearly has no factor *2*, and we can remove
|
||||||
|
it from *g*.
|
||||||
|
- The algorithm will eventually converge to *g=0*. This is proven in the paper (see theorem G.3).
|
||||||
|
- It follows that eventually we find a final value *f'* for which *gcd(f,g) = gcd(f',0)*. As the
|
||||||
|
gcd of *f'* and *0* is *|f'|* by definition, that is our answer.
|
||||||
|
|
||||||
|
Compared to more [traditional GCD algorithms](https://en.wikipedia.org/wiki/Euclidean_algorithm), this one has the property of only ever looking at
|
||||||
|
the low-order bits of the variables to decide the next steps, and being easy to make
|
||||||
|
constant-time (in more low-level languages than Python). The *δ* parameter is necessary to
|
||||||
|
guide the algorithm towards shrinking the numbers' magnitudes without explicitly needing to look
|
||||||
|
at high order bits.
|
||||||
|
|
||||||
|
Properties that will become important later:
|
||||||
|
- Performing more divsteps than needed is not a problem, as *f* does not change anymore after *g=0*.
|
||||||
|
- Only even numbers are divided by *2*. This means that when reasoning about it algebraically we
|
||||||
|
do not need to worry about rounding.
|
||||||
|
- At every point during the algorithm's execution the next *N* steps only depend on the bottom *N*
|
||||||
|
bits of *f* and *g*, and on *δ*.
|
||||||
|
|
||||||
|
|
||||||
|
## 2. From GCDs to modular inverses
|
||||||
|
|
||||||
|
We want an algorithm to compute the inverse *a* of *x* modulo *M*, i.e. the number a such that *a x=1
|
||||||
|
mod M*. This inverse only exists if the GCD of *x* and *M* is *1*, but that is always the case if *M* is
|
||||||
|
prime and *0 < x < M*. In what follows, assume that the modular inverse exists.
|
||||||
|
It turns out this inverse can be computed as a side effect of computing the GCD by keeping track
|
||||||
|
of how the internal variables can be written as linear combinations of the inputs at every step
|
||||||
|
(see the [extended Euclidean algorithm](https://en.wikipedia.org/wiki/Extended_Euclidean_algorithm)).
|
||||||
|
Since the GCD is *1*, such an algorithm will compute numbers *a* and *b* such that *a x + b M = 1*.
|
||||||
|
Taking that expression *mod M* gives *a x mod M = 1*, and we see that *a* is the modular inverse of *x
|
||||||
|
mod M*.
|
||||||
|
|
||||||
|
A similar approach can be used to calculate modular inverses using the divsteps-based GCD
|
||||||
|
algorithm shown above, if the modulus *M* is odd. To do so, compute *gcd(f=M,g=x)*, while keeping
|
||||||
|
track of extra variables *d* and *e*, for which at every step *d = f/x (mod M)* and *e = g/x (mod M)*.
|
||||||
|
*f/x* here means the number which multiplied with *x* gives *f mod M*. As *f* and *g* are initialized to *M*
|
||||||
|
and *x* respectively, *d* and *e* just start off being *0* (*M/x mod M = 0/x mod M = 0*) and *1* (*x/x mod M
|
||||||
|
= 1*).
|
||||||
|
|
||||||
|
```python
|
||||||
|
def div2(M, x):
|
||||||
|
"""Helper routine to compute x/2 mod M (where M is odd)."""
|
||||||
|
assert M & 1
|
||||||
|
if x & 1: # If x is odd, make it even by adding M.
|
||||||
|
x += M
|
||||||
|
# x must be even now, so a clean division by 2 is possible.
|
||||||
|
return x // 2
|
||||||
|
|
||||||
|
def modinv(M, x):
|
||||||
|
"""Compute the inverse of x mod M (given that it exists, and M is odd)."""
|
||||||
|
assert M & 1
|
||||||
|
delta, f, g, d, e = 1, M, x, 0, 1
|
||||||
|
while g != 0:
|
||||||
|
# Note that while division by two for f and g is only ever done on even inputs, this is
|
||||||
|
# not true for d and e, so we need the div2 helper function.
|
||||||
|
if delta > 0 and g & 1:
|
||||||
|
delta, f, g, d, e = 1 - delta, g, (g - f) // 2, e, div2(M, e - d)
|
||||||
|
elif g & 1:
|
||||||
|
delta, f, g, d, e = 1 + delta, f, (g + f) // 2, d, div2(M, e + d)
|
||||||
|
else:
|
||||||
|
delta, f, g, d, e = 1 + delta, f, (g ) // 2, d, div2(M, e )
|
||||||
|
# Verify that the invariants d=f/x mod M, e=g/x mod M are maintained.
|
||||||
|
assert f % M == (d * x) % M
|
||||||
|
assert g % M == (e * x) % M
|
||||||
|
assert f == 1 or f == -1 # |f| is the GCD, it must be 1
|
||||||
|
# Because of invariant d = f/x (mod M), 1/x = d/f (mod M). As |f|=1, d/f = d*f.
|
||||||
|
return (d * f) % M
|
||||||
|
```
|
||||||
|
|
||||||
|
Also note that this approach to track *d* and *e* throughout the computation to determine the inverse
|
||||||
|
is different from the paper. There (see paragraph 12.1 in the paper) a transition matrix for the
|
||||||
|
entire computation is determined (see section 3 below) and the inverse is computed from that.
|
||||||
|
The approach here avoids the need for 2x2 matrix multiplications of various sizes, and appears to
|
||||||
|
be faster at the level of optimization we're able to do in C.
|
||||||
|
|
||||||
|
|
||||||
|
## 3. Batching multiple divsteps
|
||||||
|
|
||||||
|
Every divstep can be expressed as a matrix multiplication, applying a transition matrix *(1/2 t)*
|
||||||
|
to both vectors *[f, g]* and *[d, e]* (see paragraph 8.1 in the paper):
|
||||||
|
|
||||||
|
```
|
||||||
|
t = [ u, v ]
|
||||||
|
[ q, r ]
|
||||||
|
|
||||||
|
[ out_f ] = (1/2 * t) * [ in_f ]
|
||||||
|
[ out_g ] = [ in_g ]
|
||||||
|
|
||||||
|
[ out_d ] = (1/2 * t) * [ in_d ] (mod M)
|
||||||
|
[ out_e ] [ in_e ]
|
||||||
|
```
|
||||||
|
|
||||||
|
where *(u, v, q, r)* is *(0, 2, -1, 1)*, *(2, 0, 1, 1)*, or *(2, 0, 0, 1)*, depending on which branch is
|
||||||
|
taken. As above, the resulting *f* and *g* are always integers.
|
||||||
|
|
||||||
|
Performing multiple divsteps corresponds to a multiplication with the product of all the
|
||||||
|
individual divsteps' transition matrices. As each transition matrix consists of integers
|
||||||
|
divided by *2*, the product of these matrices will consist of integers divided by *2<sup>N</sup>* (see also
|
||||||
|
theorem 9.2 in the paper). These divisions are expensive when updating *d* and *e*, so we delay
|
||||||
|
them: we compute the integer coefficients of the combined transition matrix scaled by *2<sup>N</sup>*, and
|
||||||
|
do one division by *2<sup>N</sup>* as a final step:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def divsteps_n_matrix(delta, f, g):
|
||||||
|
"""Compute delta and transition matrix t after N divsteps (multiplied by 2^N)."""
|
||||||
|
u, v, q, r = 1, 0, 0, 1 # start with identity matrix
|
||||||
|
for _ in range(N):
|
||||||
|
if delta > 0 and g & 1:
|
||||||
|
delta, f, g, u, v, q, r = 1 - delta, g, (g - f) // 2, 2*q, 2*r, q-u, r-v
|
||||||
|
elif g & 1:
|
||||||
|
delta, f, g, u, v, q, r = 1 + delta, f, (g + f) // 2, 2*u, 2*v, q+u, r+v
|
||||||
|
else:
|
||||||
|
delta, f, g, u, v, q, r = 1 + delta, f, (g ) // 2, 2*u, 2*v, q , r
|
||||||
|
return delta, (u, v, q, r)
|
||||||
|
```
|
||||||
|
|
||||||
|
As the branches in the divsteps are completely determined by the bottom *N* bits of *f* and *g*, this
|
||||||
|
function to compute the transition matrix only needs to see those bottom bits. Furthermore all
|
||||||
|
intermediate results and outputs fit in *(N+1)*-bit numbers (unsigned for *f* and *g*; signed for *u*, *v*,
|
||||||
|
*q*, and *r*) (see also paragraph 8.3 in the paper). This means that an implementation using 64-bit
|
||||||
|
integers could set *N=62* and compute the full transition matrix for 62 steps at once without any
|
||||||
|
big integer arithmetic at all. This is the reason why this algorithm is efficient: it only needs
|
||||||
|
to update the full-size *f*, *g*, *d*, and *e* numbers once every *N* steps.
|
||||||
|
|
||||||
|
We still need functions to compute:
|
||||||
|
|
||||||
|
```
|
||||||
|
[ out_f ] = (1/2^N * [ u, v ]) * [ in_f ]
|
||||||
|
[ out_g ] ( [ q, r ]) [ in_g ]
|
||||||
|
|
||||||
|
[ out_d ] = (1/2^N * [ u, v ]) * [ in_d ] (mod M)
|
||||||
|
[ out_e ] ( [ q, r ]) [ in_e ]
|
||||||
|
```
|
||||||
|
|
||||||
|
Because the divsteps transformation only ever divides even numbers by two, the result of *t [f,g]* is always even. When *t* is a composition of *N* divsteps, it follows that the resulting *f*
|
||||||
|
and *g* will be multiples of *2<sup>N</sup>*, and division by *2<sup>N</sup>* is simply shifting them down:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def update_fg(f, g, t):
|
||||||
|
"""Multiply matrix t/2^N with [f, g]."""
|
||||||
|
u, v, q, r = t
|
||||||
|
cf, cg = u*f + v*g, q*f + r*g
|
||||||
|
# (t / 2^N) should cleanly apply to [f,g] so the result of t*[f,g] should have N zero
|
||||||
|
# bottom bits.
|
||||||
|
assert cf % 2**N == 0
|
||||||
|
assert cg % 2**N == 0
|
||||||
|
return cf >> N, cg >> N
|
||||||
|
```
|
||||||
|
|
||||||
|
The same is not true for *d* and *e*, and we need an equivalent of the `div2` function for division by *2<sup>N</sup> mod M*.
|
||||||
|
This is easy if we have precomputed *1/M mod 2<sup>N</sup>* (which always exists for odd *M*):
|
||||||
|
|
||||||
|
```python
|
||||||
|
def div2n(M, Mi, x):
|
||||||
|
"""Compute x/2^N mod M, given Mi = 1/M mod 2^N."""
|
||||||
|
assert (M * Mi) % 2**N == 1
|
||||||
|
# Find a factor m such that m*M has the same bottom N bits as x. We want:
|
||||||
|
# (m * M) mod 2^N = x mod 2^N
|
||||||
|
# <=> m mod 2^N = (x / M) mod 2^N
|
||||||
|
# <=> m mod 2^N = (x * Mi) mod 2^N
|
||||||
|
m = (Mi * x) % 2**N
|
||||||
|
# Subtract that multiple from x, cancelling its bottom N bits.
|
||||||
|
x -= m * M
|
||||||
|
# Now a clean division by 2^N is possible.
|
||||||
|
assert x % 2**N == 0
|
||||||
|
return (x >> N) % M
|
||||||
|
|
||||||
|
def update_de(d, e, t, M, Mi):
|
||||||
|
"""Multiply matrix t/2^N with [d, e], modulo M."""
|
||||||
|
u, v, q, r = t
|
||||||
|
cd, ce = u*d + v*e, q*d + r*e
|
||||||
|
return div2n(M, Mi, cd), div2n(M, Mi, ce)
|
||||||
|
```
|
||||||
|
|
||||||
|
With all of those, we can write a version of `modinv` that performs *N* divsteps at once:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def modinv(M, Mi, x):
|
||||||
|
"""Compute the modular inverse of x mod M, given Mi=1/M mod 2^N."""
|
||||||
|
assert M & 1
|
||||||
|
delta, f, g, d, e = 1, M, x, 0, 1
|
||||||
|
while g != 0:
|
||||||
|
# Compute the delta and transition matrix t for the next N divsteps (this only needs
|
||||||
|
# (N+1)-bit signed integer arithmetic).
|
||||||
|
delta, t = divsteps_n_matrix(delta, f % 2**N, g % 2**N)
|
||||||
|
# Apply the transition matrix t to [f, g]:
|
||||||
|
f, g = update_fg(f, g, t)
|
||||||
|
# Apply the transition matrix t to [d, e]:
|
||||||
|
d, e = update_de(d, e, t, M, Mi)
|
||||||
|
return (d * f) % M
|
||||||
|
```
|
||||||
|
|
||||||
|
This means that in practice we'll always perform a multiple of *N* divsteps. This is not a problem
|
||||||
|
because once *g=0*, further divsteps do not affect *f*, *g*, *d*, or *e* anymore (only *δ* keeps
|
||||||
|
increasing). For variable time code such excess iterations will be mostly optimized away in later
|
||||||
|
sections.
|
||||||
|
|
||||||
|
|
||||||
|
## 4. Avoiding modulus operations
|
||||||
|
|
||||||
|
So far, there are two places where we compute a remainder of big numbers modulo *M*: at the end of
|
||||||
|
`div2n` in every `update_de`, and at the very end of `modinv` after potentially negating *d* due to the
|
||||||
|
sign of *f*. These are relatively expensive operations when done generically.
|
||||||
|
|
||||||
|
To deal with the modulus operation in `div2n`, we simply stop requiring *d* and *e* to be in range
|
||||||
|
*[0,M)* all the time. Let's start by inlining `div2n` into `update_de`, and dropping the modulus
|
||||||
|
operation at the end:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def update_de(d, e, t, M, Mi):
|
||||||
|
"""Multiply matrix t/2^N with [d, e] mod M, given Mi=1/M mod 2^N."""
|
||||||
|
u, v, q, r = t
|
||||||
|
cd, ce = u*d + v*e, q*d + r*e
|
||||||
|
# Cancel out bottom N bits of cd and ce.
|
||||||
|
md = -((Mi * cd) % 2**N)
|
||||||
|
me = -((Mi * ce) % 2**N)
|
||||||
|
cd += md * M
|
||||||
|
ce += me * M
|
||||||
|
# And cleanly divide by 2**N.
|
||||||
|
return cd >> N, ce >> N
|
||||||
|
```
|
||||||
|
|
||||||
|
Let's look at bounds on the ranges of these numbers. It can be shown that *|u|+|v|* and *|q|+|r|*
|
||||||
|
never exceed *2<sup>N</sup>* (see paragraph 8.3 in the paper), and thus a multiplication with *t* will have
|
||||||
|
outputs whose absolute values are at most *2<sup>N</sup>* times the maximum absolute input value. In case the
|
||||||
|
inputs *d* and *e* are in *(-M,M)*, which is certainly true for the initial values *d=0* and *e=1* assuming
|
||||||
|
*M > 1*, the multiplication results in numbers in range *(-2<sup>N</sup>M,2<sup>N</sup>M)*. Subtracting less than *2<sup>N</sup>*
|
||||||
|
times *M* to cancel out *N* bits brings that up to *(-2<sup>N+1</sup>M,2<sup>N</sup>M)*, and
|
||||||
|
dividing by *2<sup>N</sup>* at the end takes it to *(-2M,M)*. Another application of `update_de` would take that
|
||||||
|
to *(-3M,2M)*, and so forth. This progressive expansion of the variables' ranges can be
|
||||||
|
counteracted by incrementing *d* and *e* by *M* whenever they're negative:
|
||||||
|
|
||||||
|
```python
|
||||||
|
...
|
||||||
|
if d < 0:
|
||||||
|
d += M
|
||||||
|
if e < 0:
|
||||||
|
e += M
|
||||||
|
cd, ce = u*d + v*e, q*d + r*e
|
||||||
|
# Cancel out bottom N bits of cd and ce.
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
With inputs in *(-2M,M)*, they will first be shifted into range *(-M,M)*, which means that the
|
||||||
|
output will again be in *(-2M,M)*, and this remains the case regardless of how many `update_de`
|
||||||
|
invocations there are. In what follows, we will try to make this more efficient.
|
||||||
|
|
||||||
|
Note that increasing *d* by *M* is equal to incrementing *cd* by *u M* and *ce* by *q M*. Similarly,
|
||||||
|
increasing *e* by *M* is equal to incrementing *cd* by *v M* and *ce* by *r M*. So we could instead write:
|
||||||
|
|
||||||
|
```python
|
||||||
|
...
|
||||||
|
cd, ce = u*d + v*e, q*d + r*e
|
||||||
|
# Perform the equivalent of incrementing d, e by M when they're negative.
|
||||||
|
if d < 0:
|
||||||
|
cd += u*M
|
||||||
|
ce += q*M
|
||||||
|
if e < 0:
|
||||||
|
cd += v*M
|
||||||
|
ce += r*M
|
||||||
|
# Cancel out bottom N bits of cd and ce.
|
||||||
|
md = -((Mi * cd) % 2**N)
|
||||||
|
me = -((Mi * ce) % 2**N)
|
||||||
|
cd += md * M
|
||||||
|
ce += me * M
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
Now note that we have two steps of corrections to *cd* and *ce* that add multiples of *M*: this
|
||||||
|
increment, and the decrement that cancels out bottom bits. The second one depends on the first
|
||||||
|
one, but they can still be efficiently combined by only computing the bottom bits of *cd* and *ce*
|
||||||
|
at first, and using that to compute the final *md*, *me* values:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def update_de(d, e, t, M, Mi):
|
||||||
|
"""Multiply matrix t/2^N with [d, e], modulo M."""
|
||||||
|
u, v, q, r = t
|
||||||
|
md, me = 0, 0
|
||||||
|
# Compute what multiples of M to add to cd and ce.
|
||||||
|
if d < 0:
|
||||||
|
md += u
|
||||||
|
me += q
|
||||||
|
if e < 0:
|
||||||
|
md += v
|
||||||
|
me += r
|
||||||
|
# Compute bottom N bits of t*[d,e] + M*[md,me].
|
||||||
|
cd, ce = (u*d + v*e + md*M) % 2**N, (q*d + r*e + me*M) % 2**N
|
||||||
|
# Correct md and me such that the bottom N bits of t*[d,e] + M*[md,me] are zero.
|
||||||
|
md -= (Mi * cd) % 2**N
|
||||||
|
me -= (Mi * ce) % 2**N
|
||||||
|
# Do the full computation.
|
||||||
|
cd, ce = u*d + v*e + md*M, q*d + r*e + me*M
|
||||||
|
# And cleanly divide by 2**N.
|
||||||
|
return cd >> N, ce >> N
|
||||||
|
```
|
||||||
|
|
||||||
|
One last optimization: we can avoid the *md M* and *me M* multiplications in the bottom bits of *cd*
|
||||||
|
and *ce* by moving them to the *md* and *me* correction:
|
||||||
|
|
||||||
|
```python
|
||||||
|
...
|
||||||
|
# Compute bottom N bits of t*[d,e].
|
||||||
|
cd, ce = (u*d + v*e) % 2**N, (q*d + r*e) % 2**N
|
||||||
|
# Correct md and me such that the bottom N bits of t*[d,e]+M*[md,me] are zero.
|
||||||
|
# Note that this is not the same as {md = (-Mi * cd) % 2**N} etc. That would also result in N
|
||||||
|
# zero bottom bits, but isn't guaranteed to be a reduction of [0,2^N) compared to the
|
||||||
|
# previous md and me values, and thus would violate our bounds analysis.
|
||||||
|
md -= (Mi*cd + md) % 2**N
|
||||||
|
me -= (Mi*ce + me) % 2**N
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
The resulting function takes *d* and *e* in range *(-2M,M)* as inputs, and outputs values in the same
|
||||||
|
range. That also means that the *d* value at the end of `modinv` will be in that range, while we want
|
||||||
|
a result in *[0,M)*. To do that, we need a normalization function. It's easy to integrate the
|
||||||
|
conditional negation of *d* (based on the sign of *f*) into it as well:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def normalize(sign, v, M):
|
||||||
|
"""Compute sign*v mod M, where v is in range (-2*M,M); output in [0,M)."""
|
||||||
|
assert sign == 1 or sign == -1
|
||||||
|
# v in (-2*M,M)
|
||||||
|
if v < 0:
|
||||||
|
v += M
|
||||||
|
# v in (-M,M). Now multiply v with sign (which can only be 1 or -1).
|
||||||
|
if sign == -1:
|
||||||
|
v = -v
|
||||||
|
# v in (-M,M)
|
||||||
|
if v < 0:
|
||||||
|
v += M
|
||||||
|
# v in [0,M)
|
||||||
|
return v
|
||||||
|
```
|
||||||
|
|
||||||
|
And calling it in `modinv` is simply:
|
||||||
|
|
||||||
|
```python
|
||||||
|
...
|
||||||
|
return normalize(f, d, M)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## 5. Constant-time operation
|
||||||
|
|
||||||
|
The primary selling point of the algorithm is fast constant-time operation. What code flow still
|
||||||
|
depends on the input data so far?
|
||||||
|
|
||||||
|
- the number of iterations of the while *g ≠ 0* loop in `modinv`
|
||||||
|
- the branches inside `divsteps_n_matrix`
|
||||||
|
- the sign checks in `update_de`
|
||||||
|
- the sign checks in `normalize`
|
||||||
|
|
||||||
|
To make the while loop in `modinv` constant time it can be replaced with a constant number of
|
||||||
|
iterations. The paper proves (Theorem 11.2) that *741* divsteps are sufficient for any *256*-bit
|
||||||
|
inputs, and [safegcd-bounds](https://github.com/sipa/safegcd-bounds) shows that the slightly better bound *724* is
|
||||||
|
sufficient. Given that every loop iteration performs *N* divsteps, it will run a total of
|
||||||
|
*⌈724/N⌉* times.
|
||||||
|
|
||||||
|
To deal with the branches in `divsteps_n_matrix` we will replace them with constant-time bitwise
|
||||||
|
operations (and hope the C compiler isn't smart enough to turn them back into branches; see
|
||||||
|
`valgrind_ctime_test.c` for automated tests that this isn't the case). To do so, observe that a
|
||||||
|
divstep can be written instead as (compare to the inner loop of `gcd` in section 1).
|
||||||
|
|
||||||
|
```python
|
||||||
|
x = -f if delta > 0 else f # set x equal to (input) -f or f
|
||||||
|
if g & 1:
|
||||||
|
g += x # set g to (input) g-f or g+f
|
||||||
|
if delta > 0:
|
||||||
|
delta = -delta
|
||||||
|
f += g # set f to (input) g (note that g was set to g-f before)
|
||||||
|
delta += 1
|
||||||
|
g >>= 1
|
||||||
|
```
|
||||||
|
|
||||||
|
To convert the above to bitwise operations, we rely on a trick to negate conditionally: per the
|
||||||
|
definition of negative numbers in two's complement, (*-v == ~v + 1*) holds for every number *v*. As
|
||||||
|
*-1* in two's complement is all *1* bits, bitflipping can be expressed as xor with *-1*. It follows
|
||||||
|
that *-v == (v ^ -1) - (-1)*. Thus, if we have a variable *c* that takes on values *0* or *-1*, then
|
||||||
|
*(v ^ c) - c* is *v* if *c=0* and *-v* if *c=-1*.
|
||||||
|
|
||||||
|
Using this we can write:
|
||||||
|
|
||||||
|
```python
|
||||||
|
x = -f if delta > 0 else f
|
||||||
|
```
|
||||||
|
|
||||||
|
in constant-time form as:
|
||||||
|
|
||||||
|
```python
|
||||||
|
c1 = (-delta) >> 63
|
||||||
|
# Conditionally negate f based on c1:
|
||||||
|
x = (f ^ c1) - c1
|
||||||
|
```
|
||||||
|
|
||||||
|
To use that trick, we need a helper mask variable *c1* that resolves the condition *δ>0* to *-1*
|
||||||
|
(if true) or *0* (if false). We compute *c1* using right shifting, which is equivalent to dividing by
|
||||||
|
the specified power of *2* and rounding down (in Python, and also in C under the assumption of a typical two's complement system; see
|
||||||
|
`assumptions.h` for tests that this is the case). Right shifting by *63* thus maps all
|
||||||
|
numbers in range *[-2<sup>63</sup>,0)* to *-1*, and numbers in range *[0,2<sup>63</sup>)* to *0*.
|
||||||
|
|
||||||
|
Using the facts that *x&0=0* and *x&(-1)=x* (on two's complement systems again), we can write:
|
||||||
|
|
||||||
|
```python
|
||||||
|
if g & 1:
|
||||||
|
g += x
|
||||||
|
```
|
||||||
|
|
||||||
|
as:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Compute c2=0 if g is even and c2=-1 if g is odd.
|
||||||
|
c2 = -(g & 1)
|
||||||
|
# This masks out x if g is even, and leaves x be if g is odd.
|
||||||
|
g += x & c2
|
||||||
|
```
|
||||||
|
|
||||||
|
Using the conditional negation trick again we can write:
|
||||||
|
|
||||||
|
```python
|
||||||
|
if g & 1:
|
||||||
|
if delta > 0:
|
||||||
|
delta = -delta
|
||||||
|
```
|
||||||
|
|
||||||
|
as:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Compute c3=-1 if g is odd and delta>0, and 0 otherwise.
|
||||||
|
c3 = c1 & c2
|
||||||
|
# Conditionally negate delta based on c3:
|
||||||
|
delta = (delta ^ c3) - c3
|
||||||
|
```
|
||||||
|
|
||||||
|
Finally:
|
||||||
|
|
||||||
|
```python
|
||||||
|
if g & 1:
|
||||||
|
if delta > 0:
|
||||||
|
f += g
|
||||||
|
```
|
||||||
|
|
||||||
|
becomes:
|
||||||
|
|
||||||
|
```python
|
||||||
|
f += g & c3
|
||||||
|
```
|
||||||
|
|
||||||
|
It turns out that this can be implemented more efficiently by applying the substitution
|
||||||
|
*η=-δ*. In this representation, negating *δ* corresponds to negating *η*, and incrementing
|
||||||
|
*δ* corresponds to decrementing *η*. This allows us to remove the negation in the *c1*
|
||||||
|
computation:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Compute a mask c1 for eta < 0, and compute the conditional negation x of f:
|
||||||
|
c1 = eta >> 63
|
||||||
|
x = (f ^ c1) - c1
|
||||||
|
# Compute a mask c2 for odd g, and conditionally add x to g:
|
||||||
|
c2 = -(g & 1)
|
||||||
|
g += x & c2
|
||||||
|
# Compute a mask c for (eta < 0) and odd (input) g, and use it to conditionally negate eta,
|
||||||
|
# and add g to f:
|
||||||
|
c3 = c1 & c2
|
||||||
|
eta = (eta ^ c3) - c3
|
||||||
|
f += g & c3
|
||||||
|
# Incrementing delta corresponds to decrementing eta.
|
||||||
|
eta -= 1
|
||||||
|
g >>= 1
|
||||||
|
```
|
||||||
|
|
||||||
|
A variant of divsteps with better worst-case performance can be used instead: starting *δ* at
|
||||||
|
*1/2* instead of *1*. This reduces the worst case number of iterations to *590* for *256*-bit inputs
|
||||||
|
(which can be shown using convex hull analysis). In this case, the substitution *ζ=-(δ+1/2)*
|
||||||
|
is used instead to keep the variable integral. Incrementing *δ* by *1* still translates to
|
||||||
|
decrementing *ζ* by *1*, but negating *δ* now corresponds to going from *ζ* to *-(ζ+1)*, or
|
||||||
|
*~ζ*. Doing that conditionally based on *c3* is simply:
|
||||||
|
|
||||||
|
```python
|
||||||
|
...
|
||||||
|
c3 = c1 & c2
|
||||||
|
zeta ^= c3
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
By replacing the loop in `divsteps_n_matrix` with a variant of the divstep code above (extended to
|
||||||
|
also apply all *f* operations to *u*, *v* and all *g* operations to *q*, *r*), a constant-time version of
|
||||||
|
`divsteps_n_matrix` is obtained. The full code will be in section 7.
|
||||||
|
|
||||||
|
These bit fiddling tricks can also be used to make the conditional negations and additions in
|
||||||
|
`update_de` and `normalize` constant-time.
|
||||||
|
|
||||||
|
|
||||||
|
## 6. Variable-time optimizations
|
||||||
|
|
||||||
|
In section 5, we modified the `divsteps_n_matrix` function (and a few others) to be constant time.
|
||||||
|
Constant time operations are only necessary when computing modular inverses of secret data. In
|
||||||
|
other cases, it slows down calculations unnecessarily. In this section, we will construct a
|
||||||
|
faster non-constant time `divsteps_n_matrix` function.
|
||||||
|
|
||||||
|
To do so, first consider yet another way of writing the inner loop of divstep operations in
|
||||||
|
`gcd` from section 1. This decomposition is also explained in the paper in section 8.2. We use
|
||||||
|
the original version with initial *δ=1* and *η=-δ* here.
|
||||||
|
|
||||||
|
```python
|
||||||
|
for _ in range(N):
|
||||||
|
if g & 1 and eta < 0:
|
||||||
|
eta, f, g = -eta, g, -f
|
||||||
|
if g & 1:
|
||||||
|
g += f
|
||||||
|
eta -= 1
|
||||||
|
g >>= 1
|
||||||
|
```
|
||||||
|
|
||||||
|
Whenever *g* is even, the loop only shifts *g* down and decreases *η*. When *g* ends in multiple zero
|
||||||
|
bits, these iterations can be consolidated into one step. This requires counting the bottom zero
|
||||||
|
bits efficiently, which is possible on most platforms; it is abstracted here as the function
|
||||||
|
`count_trailing_zeros`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
def count_trailing_zeros(v):
|
||||||
|
"""For a non-zero value v, find z such that v=(d<<z) for some odd d."""
|
||||||
|
return (v & -v).bit_length() - 1
|
||||||
|
|
||||||
|
i = N # divsteps left to do
|
||||||
|
while True:
|
||||||
|
# Get rid of all bottom zeros at once. In the first iteration, g may be odd and the following
|
||||||
|
# lines have no effect (until "if eta < 0").
|
||||||
|
zeros = min(i, count_trailing_zeros(g))
|
||||||
|
eta -= zeros
|
||||||
|
g >>= zeros
|
||||||
|
i -= zeros
|
||||||
|
if i == 0:
|
||||||
|
break
|
||||||
|
# We know g is odd now
|
||||||
|
if eta < 0:
|
||||||
|
eta, f, g = -eta, g, -f
|
||||||
|
g += f
|
||||||
|
# g is even now, and the eta decrement and g shift will happen in the next loop.
|
||||||
|
```
|
||||||
|
|
||||||
|
We can now remove multiple bottom *0* bits from *g* at once, but still need a full iteration whenever
|
||||||
|
there is a bottom *1* bit. In what follows, we will get rid of multiple *1* bits simultaneously as
|
||||||
|
well.
|
||||||
|
|
||||||
|
Observe that as long as *η ≥ 0*, the loop does not modify *f*. Instead, it cancels out bottom
|
||||||
|
bits of *g* and shifts them out, and decreases *η* and *i* accordingly - interrupting only when *η*
|
||||||
|
becomes negative, or when *i* reaches *0*. Combined, this is equivalent to adding a multiple of *f* to
|
||||||
|
*g* to cancel out multiple bottom bits, and then shifting them out.
|
||||||
|
|
||||||
|
It is easy to find what that multiple is: we want a number *w* such that *g+w f* has a few bottom
|
||||||
|
zero bits. If that number of bits is *L*, we want *g+w f mod 2<sup>L</sup> = 0*, or *w = -g/f mod 2<sup>L</sup>*. Since *f*
|
||||||
|
is odd, such a *w* exists for any *L*. *L* cannot be more than *i* steps (as we'd finish the loop before
|
||||||
|
doing more) or more than *η+1* steps (as we'd run `eta, f, g = -eta, g, -f` at that point), but
|
||||||
|
apart from that, we're only limited by the complexity of computing *w*.
|
||||||
|
|
||||||
|
This code demonstrates how to cancel up to 4 bits per step:
|
||||||
|
|
||||||
|
```python
|
||||||
|
NEGINV16 = [15, 5, 3, 9, 7, 13, 11, 1] # NEGINV16[n//2] = (-n)^-1 mod 16, for odd n
|
||||||
|
i = N
|
||||||
|
while True:
|
||||||
|
zeros = min(i, count_trailing_zeros(g))
|
||||||
|
eta -= zeros
|
||||||
|
g >>= zeros
|
||||||
|
i -= zeros
|
||||||
|
if i == 0:
|
||||||
|
break
|
||||||
|
# We know g is odd now
|
||||||
|
if eta < 0:
|
||||||
|
        eta, f, g = -eta, g, -f
|
||||||
|
# Compute limit on number of bits to cancel
|
||||||
|
limit = min(min(eta + 1, i), 4)
|
||||||
|
# Compute w = -g/f mod 2**limit, using the table value for -1/f mod 2**4. Note that f is
|
||||||
|
# always odd, so its inverse modulo a power of two always exists.
|
||||||
|
w = (g * NEGINV16[(f & 15) // 2]) % (2**limit)
|
||||||
|
# As w = -g/f mod (2**limit), g+w*f mod 2**limit = 0 mod 2**limit.
|
||||||
|
g += w * f
|
||||||
|
assert g % (2**limit) == 0
|
||||||
|
# The next iteration will now shift out at least limit bottom zero bits from g.
|
||||||
|
```
|
||||||
|
|
||||||
|
By using a bigger table more bits can be cancelled at once. The table can also be implemented
|
||||||
|
as a formula. Several formulas are known for computing modular inverses modulo powers of two;
|
||||||
|
some can be found in Hacker's Delight second edition by Henry S. Warren, Jr. pages 245-247.
|
||||||
|
Here we need the negated modular inverse, which is a simple transformation of those:
|
||||||
|
|
||||||
|
- Instead of a 3-bit table:
|
||||||
|
- *-f* or *f ^ 6*
|
||||||
|
- Instead of a 4-bit table:
|
||||||
|
- *1 - f(f + 1)*
|
||||||
|
- *-(f + (((f + 1) & 4) << 1))*
|
||||||
|
- For larger tables the following technique can be used: if *w=-1/f mod 2<sup>L</sup>*, then *w(w f+2)* is
|
||||||
|
*-1/f mod 2<sup>2L</sup>*. This allows extending the previous formulas (or tables). In particular we
|
||||||
|
have this 6-bit function (based on the 3-bit function above):
|
||||||
|
- *f(f<sup>2</sup> - 2)*
|
||||||
|
|
||||||
|
This loop, again extended to also handle *u*, *v*, *q*, and *r* alongside *f* and *g*, placed in
|
||||||
|
`divsteps_n_matrix`, gives a significantly faster, but non-constant time version.
|
||||||
|
|
||||||
|
|
||||||
|
## 7. Final Python version
|
||||||
|
|
||||||
|
All together we need the following functions:
|
||||||
|
|
||||||
|
- A way to compute the transition matrix in constant time, using the `divsteps_n_matrix` function
|
||||||
|
from section 2, but with its loop replaced by a variant of the constant-time divstep from
|
||||||
|
section 5, extended to handle *u*, *v*, *q*, *r*:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def divsteps_n_matrix(zeta, f, g):
|
||||||
|
"""Compute zeta and transition matrix t after N divsteps (multiplied by 2^N)."""
|
||||||
|
u, v, q, r = 1, 0, 0, 1 # start with identity matrix
|
||||||
|
for _ in range(N):
|
||||||
|
c1 = zeta >> 63
|
||||||
|
# Compute x, y, z as conditionally-negated versions of f, u, v.
|
||||||
|
x, y, z = (f ^ c1) - c1, (u ^ c1) - c1, (v ^ c1) - c1
|
||||||
|
c2 = -(g & 1)
|
||||||
|
# Conditionally add x, y, z to g, q, r.
|
||||||
|
g, q, r = g + (x & c2), q + (y & c2), r + (z & c2)
|
||||||
|
c1 &= c2 # reusing c1 here for the earlier c3 variable
|
||||||
|
zeta = (zeta ^ c1) - 1 # inlining the unconditional zeta decrement here
|
||||||
|
# Conditionally add g, q, r to f, u, v.
|
||||||
|
f, u, v = f + (g & c1), u + (q & c1), v + (r & c1)
|
||||||
|
# When shifting g down, don't shift q, r, as we construct a transition matrix multiplied
|
||||||
|
# by 2^N. Instead, shift f's coefficients u and v up.
|
||||||
|
g, u, v = g >> 1, u << 1, v << 1
|
||||||
|
return zeta, (u, v, q, r)
|
||||||
|
```
|
||||||
|
|
||||||
|
- The functions to update *f* and *g*, and *d* and *e*, from section 2 and section 4, with the constant-time
|
||||||
|
changes to `update_de` from section 5:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def update_fg(f, g, t):
|
||||||
|
"""Multiply matrix t/2^N with [f, g]."""
|
||||||
|
u, v, q, r = t
|
||||||
|
cf, cg = u*f + v*g, q*f + r*g
|
||||||
|
return cf >> N, cg >> N
|
||||||
|
|
||||||
|
def update_de(d, e, t, M, Mi):
|
||||||
|
"""Multiply matrix t/2^N with [d, e], modulo M."""
|
||||||
|
u, v, q, r = t
|
||||||
|
d_sign, e_sign = d >> 257, e >> 257
|
||||||
|
md, me = (u & d_sign) + (v & e_sign), (q & d_sign) + (r & e_sign)
|
||||||
|
cd, ce = (u*d + v*e) % 2**N, (q*d + r*e) % 2**N
|
||||||
|
md -= (Mi*cd + md) % 2**N
|
||||||
|
me -= (Mi*ce + me) % 2**N
|
||||||
|
cd, ce = u*d + v*e + M*md, q*d + r*e + M*me
|
||||||
|
return cd >> N, ce >> N
|
||||||
|
```
|
||||||
|
|
||||||
|
- The `normalize` function from section 4, made constant time as well:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def normalize(sign, v, M):
|
||||||
|
"""Compute sign*v mod M, where v in (-2*M,M); output in [0,M)."""
|
||||||
|
v_sign = v >> 257
|
||||||
|
# Conditionally add M to v.
|
||||||
|
v += M & v_sign
|
||||||
|
c = (sign - 1) >> 1
|
||||||
|
# Conditionally negate v.
|
||||||
|
v = (v ^ c) - c
|
||||||
|
v_sign = v >> 257
|
||||||
|
# Conditionally add M to v again.
|
||||||
|
v += M & v_sign
|
||||||
|
return v
|
||||||
|
```
|
||||||
|
|
||||||
|
- And finally the `modinv` function too, adapted to use *ζ* instead of *δ*, and using the fixed
|
||||||
|
iteration count from section 5:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def modinv(M, Mi, x):
|
||||||
|
"""Compute the modular inverse of x mod M, given Mi=1/M mod 2^N."""
|
||||||
|
zeta, f, g, d, e = -1, M, x, 0, 1
|
||||||
|
for _ in range((590 + N - 1) // N):
|
||||||
|
zeta, t = divsteps_n_matrix(zeta, f % 2**N, g % 2**N)
|
||||||
|
f, g = update_fg(f, g, t)
|
||||||
|
d, e = update_de(d, e, t, M, Mi)
|
||||||
|
return normalize(f, d, M)
|
||||||
|
```
|
||||||
|
|
||||||
|
- To get a variable time version, replace the `divsteps_n_matrix` function with one that uses the
|
||||||
|
divsteps loop from section 5, and a `modinv` version that calls it without the fixed iteration
|
||||||
|
count:
|
||||||
|
|
||||||
|
```python
|
||||||
|
NEGINV16 = [15, 5, 3, 9, 7, 13, 11, 1] # NEGINV16[n//2] = (-n)^-1 mod 16, for odd n
|
||||||
|
def divsteps_n_matrix_var(eta, f, g):
|
||||||
|
"""Compute eta and transition matrix t after N divsteps (multiplied by 2^N)."""
|
||||||
|
u, v, q, r = 1, 0, 0, 1
|
||||||
|
i = N
|
||||||
|
while True:
|
||||||
|
zeros = min(i, count_trailing_zeros(g))
|
||||||
|
eta, i = eta - zeros, i - zeros
|
||||||
|
g, u, v = g >> zeros, u << zeros, v << zeros
|
||||||
|
if i == 0:
|
||||||
|
break
|
||||||
|
if eta < 0:
|
||||||
|
eta, f, u, v, g, q, r = -eta, g, q, r, -f, -u, -v
|
||||||
|
limit = min(min(eta + 1, i), 4)
|
||||||
|
w = (g * NEGINV16[(f & 15) // 2]) % (2**limit)
|
||||||
|
g, q, r = g + w*f, q + w*u, r + w*v
|
||||||
|
return eta, (u, v, q, r)
|
||||||
|
|
||||||
|
def modinv_var(M, Mi, x):
|
||||||
|
"""Compute the modular inverse of x mod M, given Mi = 1/M mod 2^N."""
|
||||||
|
eta, f, g, d, e = -1, M, x, 0, 1
|
||||||
|
while g != 0:
|
||||||
|
eta, t = divsteps_n_matrix_var(eta, f % 2**N, g % 2**N)
|
||||||
|
f, g = update_fg(f, g, t)
|
||||||
|
d, e = update_de(d, e, t, M, Mi)
|
||||||
|
    return normalize(f, d, M)
|
||||||
|
```
|
|
@ -7,7 +7,9 @@ extern "C" {
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
|
||||||
/* These rules specify the order of arguments in API calls:
|
/* Unless explicitly stated all pointer arguments must not be NULL.
|
||||||
|
*
|
||||||
|
* The following rules specify the order of arguments in API calls:
|
||||||
*
|
*
|
||||||
* 1. Context pointers go first, followed by output arguments, combined
|
* 1. Context pointers go first, followed by output arguments, combined
|
||||||
* output/input arguments, and finally input-only arguments.
|
* output/input arguments, and finally input-only arguments.
|
||||||
|
@ -61,8 +63,9 @@ typedef struct secp256k1_scratch_space_struct secp256k1_scratch_space;
|
||||||
* The exact representation of data inside is implementation defined and not
|
* The exact representation of data inside is implementation defined and not
|
||||||
* guaranteed to be portable between different platforms or versions. It is
|
* guaranteed to be portable between different platforms or versions. It is
|
||||||
* however guaranteed to be 64 bytes in size, and can be safely copied/moved.
|
* however guaranteed to be 64 bytes in size, and can be safely copied/moved.
|
||||||
* If you need to convert to a format suitable for storage, transmission, or
|
* If you need to convert to a format suitable for storage or transmission,
|
||||||
* comparison, use secp256k1_ec_pubkey_serialize and secp256k1_ec_pubkey_parse.
|
* use secp256k1_ec_pubkey_serialize and secp256k1_ec_pubkey_parse. To
|
||||||
|
* compare keys, use secp256k1_ec_pubkey_cmp.
|
||||||
*/
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
unsigned char data[64];
|
unsigned char data[64];
|
||||||
|
@ -127,6 +130,17 @@ typedef int (*secp256k1_nonce_function)(
|
||||||
# define SECP256K1_INLINE inline
|
# define SECP256K1_INLINE inline
|
||||||
# endif
|
# endif
|
||||||
|
|
||||||
|
/** When this header is used at build-time the SECP256K1_BUILD define needs to be set
|
||||||
|
* to correctly setup export attributes and nullness checks. This is normally done
|
||||||
|
* by secp256k1.c but to guard against this header being included before secp256k1.c
|
||||||
|
* has had a chance to set the define (e.g. via test harnesses that just includes
|
||||||
|
* secp256k1.c) we set SECP256K1_NO_BUILD when this header is processed without the
|
||||||
|
* BUILD define so this condition can be caught.
|
||||||
|
*/
|
||||||
|
#ifndef SECP256K1_BUILD
|
||||||
|
# define SECP256K1_NO_BUILD
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef SECP256K1_API
|
#ifndef SECP256K1_API
|
||||||
# if defined(_WIN32)
|
# if defined(_WIN32)
|
||||||
# ifdef SECP256K1_BUILD
|
# ifdef SECP256K1_BUILD
|
||||||
|
@ -370,6 +384,21 @@ SECP256K1_API int secp256k1_ec_pubkey_serialize(
|
||||||
unsigned int flags
|
unsigned int flags
|
||||||
) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4);
|
) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4);
|
||||||
|
|
||||||
|
/** Compare two public keys using lexicographic (of compressed serialization) order
|
||||||
|
*
|
||||||
|
* Returns: <0 if the first public key is less than the second
|
||||||
|
* >0 if the first public key is greater than the second
|
||||||
|
* 0 if the two public keys are equal
|
||||||
|
* Args: ctx: a secp256k1 context object.
|
||||||
|
* In: pubkey1: first public key to compare
|
||||||
|
* pubkey2: second public key to compare
|
||||||
|
*/
|
||||||
|
SECP256K1_API SECP256K1_WARN_UNUSED_RESULT int secp256k1_ec_pubkey_cmp(
|
||||||
|
const secp256k1_context* ctx,
|
||||||
|
const secp256k1_pubkey* pubkey1,
|
||||||
|
const secp256k1_pubkey* pubkey2
|
||||||
|
) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3);
|
||||||
|
|
||||||
/** Parse an ECDSA signature in compact (64 bytes) format.
|
/** Parse an ECDSA signature in compact (64 bytes) format.
|
||||||
*
|
*
|
||||||
* Returns: 1 when the signature could be parsed, 0 otherwise.
|
* Returns: 1 when the signature could be parsed, 0 otherwise.
|
||||||
|
|
|
@ -15,9 +15,9 @@ extern "C" {
|
||||||
* The exact representation of data inside is implementation defined and not
|
* The exact representation of data inside is implementation defined and not
|
||||||
* guaranteed to be portable between different platforms or versions. It is
|
* guaranteed to be portable between different platforms or versions. It is
|
||||||
* however guaranteed to be 64 bytes in size, and can be safely copied/moved.
|
* however guaranteed to be 64 bytes in size, and can be safely copied/moved.
|
||||||
* If you need to convert to a format suitable for storage, transmission, or
|
* If you need to convert to a format suitable for storage or transmission,
|
||||||
* comparison, use secp256k1_xonly_pubkey_serialize and
|
* use secp256k1_xonly_pubkey_serialize and secp256k1_xonly_pubkey_parse. To
|
||||||
* secp256k1_xonly_pubkey_parse.
|
* compare keys, use secp256k1_xonly_pubkey_cmp.
|
||||||
*/
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
unsigned char data[64];
|
unsigned char data[64];
|
||||||
|
@ -67,6 +67,21 @@ SECP256K1_API int secp256k1_xonly_pubkey_serialize(
|
||||||
const secp256k1_xonly_pubkey* pubkey
|
const secp256k1_xonly_pubkey* pubkey
|
||||||
) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3);
|
) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3);
|
||||||
|
|
||||||
|
/** Compare two x-only public keys using lexicographic order
|
||||||
|
*
|
||||||
|
* Returns: <0 if the first public key is less than the second
|
||||||
|
* >0 if the first public key is greater than the second
|
||||||
|
* 0 if the two public keys are equal
|
||||||
|
* Args: ctx: a secp256k1 context object.
|
||||||
|
* In: pubkey1: first public key to compare
|
||||||
|
* pubkey2: second public key to compare
|
||||||
|
*/
|
||||||
|
SECP256K1_API int secp256k1_xonly_pubkey_cmp(
|
||||||
|
const secp256k1_context* ctx,
|
||||||
|
const secp256k1_xonly_pubkey* pk1,
|
||||||
|
const secp256k1_xonly_pubkey* pk2
|
||||||
|
) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3);
|
||||||
|
|
||||||
/** Converts a secp256k1_pubkey into a secp256k1_xonly_pubkey.
|
/** Converts a secp256k1_pubkey into a secp256k1_xonly_pubkey.
|
||||||
*
|
*
|
||||||
* Returns: 1 if the public key was successfully converted
|
* Returns: 1 if the public key was successfully converted
|
||||||
|
|
|
@ -9,25 +9,8 @@
|
||||||
|
|
||||||
#ifdef USE_BASIC_CONFIG
|
#ifdef USE_BASIC_CONFIG
|
||||||
|
|
||||||
#undef USE_ASM_X86_64
|
|
||||||
#undef USE_ECMULT_STATIC_PRECOMPUTATION
|
|
||||||
#undef USE_EXTERNAL_ASM
|
|
||||||
#undef USE_EXTERNAL_DEFAULT_CALLBACKS
|
|
||||||
#undef USE_FIELD_INV_BUILTIN
|
|
||||||
#undef USE_FIELD_INV_NUM
|
|
||||||
#undef USE_NUM_GMP
|
|
||||||
#undef USE_NUM_NONE
|
|
||||||
#undef USE_SCALAR_INV_BUILTIN
|
|
||||||
#undef USE_SCALAR_INV_NUM
|
|
||||||
#undef USE_FORCE_WIDEMUL_INT64
|
|
||||||
#undef USE_FORCE_WIDEMUL_INT128
|
|
||||||
#undef ECMULT_WINDOW_SIZE
|
|
||||||
|
|
||||||
#define USE_NUM_NONE 1
|
|
||||||
#define USE_FIELD_INV_BUILTIN 1
|
|
||||||
#define USE_SCALAR_INV_BUILTIN 1
|
|
||||||
#define USE_WIDEMUL_64 1
|
|
||||||
#define ECMULT_WINDOW_SIZE 15
|
#define ECMULT_WINDOW_SIZE 15
|
||||||
|
#define ECMULT_GEN_PREC_BITS 4
|
||||||
|
|
||||||
#endif /* USE_BASIC_CONFIG */
|
#endif /* USE_BASIC_CONFIG */
|
||||||
|
|
||||||
|
|
|
@ -6,8 +6,8 @@
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "include/secp256k1.h"
|
#include "../include/secp256k1.h"
|
||||||
#include "include/secp256k1_ecdh.h"
|
#include "../include/secp256k1_ecdh.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "bench.h"
|
#include "bench.h"
|
||||||
|
|
||||||
|
|
|
@ -5,43 +5,187 @@
|
||||||
***********************************************************************/
|
***********************************************************************/
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "include/secp256k1.h"
|
#include "secp256k1.c"
|
||||||
|
#include "../include/secp256k1.h"
|
||||||
|
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "hash_impl.h"
|
#include "hash_impl.h"
|
||||||
#include "num_impl.h"
|
|
||||||
#include "field_impl.h"
|
#include "field_impl.h"
|
||||||
#include "group_impl.h"
|
#include "group_impl.h"
|
||||||
#include "scalar_impl.h"
|
#include "scalar_impl.h"
|
||||||
#include "ecmult_impl.h"
|
#include "ecmult_impl.h"
|
||||||
#include "bench.h"
|
#include "bench.h"
|
||||||
#include "secp256k1.c"
|
|
||||||
|
|
||||||
#define POINTS 32768
|
#define POINTS 32768
|
||||||
|
|
||||||
|
void help(char **argv) {
|
||||||
|
printf("Benchmark EC multiplication algorithms\n");
|
||||||
|
printf("\n");
|
||||||
|
printf("Usage: %s <help|pippenger_wnaf|strauss_wnaf|simple>\n", argv[0]);
|
||||||
|
printf("The output shows the number of multiplied and summed points right after the\n");
|
||||||
|
printf("function name. The letter 'g' indicates that one of the points is the generator.\n");
|
||||||
|
printf("The benchmarks are divided by the number of points.\n");
|
||||||
|
printf("\n");
|
||||||
|
printf("default (ecmult_multi): picks pippenger_wnaf or strauss_wnaf depending on the\n");
|
||||||
|
printf(" batch size\n");
|
||||||
|
printf("pippenger_wnaf: for all batch sizes\n");
|
||||||
|
printf("strauss_wnaf: for all batch sizes\n");
|
||||||
|
printf("simple: multiply and sum each point individually\n");
|
||||||
|
}
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
/* Setup once in advance */
|
/* Setup once in advance */
|
||||||
secp256k1_context* ctx;
|
secp256k1_context* ctx;
|
||||||
secp256k1_scratch_space* scratch;
|
secp256k1_scratch_space* scratch;
|
||||||
secp256k1_scalar* scalars;
|
secp256k1_scalar* scalars;
|
||||||
secp256k1_ge* pubkeys;
|
secp256k1_ge* pubkeys;
|
||||||
|
secp256k1_gej* pubkeys_gej;
|
||||||
secp256k1_scalar* seckeys;
|
secp256k1_scalar* seckeys;
|
||||||
secp256k1_gej* expected_output;
|
secp256k1_gej* expected_output;
|
||||||
secp256k1_ecmult_multi_func ecmult_multi;
|
secp256k1_ecmult_multi_func ecmult_multi;
|
||||||
|
|
||||||
/* Changes per test */
|
/* Changes per benchmark */
|
||||||
size_t count;
|
size_t count;
|
||||||
int includes_g;
|
int includes_g;
|
||||||
|
|
||||||
/* Changes per test iteration */
|
/* Changes per benchmark iteration, used to pick different scalars and pubkeys
|
||||||
|
* in each run. */
|
||||||
size_t offset1;
|
size_t offset1;
|
||||||
size_t offset2;
|
size_t offset2;
|
||||||
|
|
||||||
/* Test output. */
|
/* Benchmark output. */
|
||||||
secp256k1_gej* output;
|
secp256k1_gej* output;
|
||||||
} bench_data;
|
} bench_data;
|
||||||
|
|
||||||
static int bench_callback(secp256k1_scalar* sc, secp256k1_ge* ge, size_t idx, void* arg) {
|
/* Hashes x into [0, POINTS) twice and store the result in offset1 and offset2. */
|
||||||
|
static void hash_into_offset(bench_data* data, size_t x) {
|
||||||
|
data->offset1 = (x * 0x537b7f6f + 0x8f66a481) % POINTS;
|
||||||
|
data->offset2 = (x * 0x7f6f537b + 0x6a1a8f49) % POINTS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check correctness of the benchmark by computing
|
||||||
|
* sum(outputs) ?= (sum(scalars_gen) + sum(seckeys)*sum(scalars))*G */
|
||||||
|
static void bench_ecmult_teardown_helper(bench_data* data, size_t* seckey_offset, size_t* scalar_offset, size_t* scalar_gen_offset, int iters) {
|
||||||
|
int i;
|
||||||
|
secp256k1_gej sum_output, tmp;
|
||||||
|
secp256k1_scalar sum_scalars;
|
||||||
|
|
||||||
|
secp256k1_gej_set_infinity(&sum_output);
|
||||||
|
secp256k1_scalar_clear(&sum_scalars);
|
||||||
|
for (i = 0; i < iters; ++i) {
|
||||||
|
secp256k1_gej_add_var(&sum_output, &sum_output, &data->output[i], NULL);
|
||||||
|
if (scalar_gen_offset != NULL) {
|
||||||
|
secp256k1_scalar_add(&sum_scalars, &sum_scalars, &data->scalars[(*scalar_gen_offset+i) % POINTS]);
|
||||||
|
}
|
||||||
|
if (seckey_offset != NULL) {
|
||||||
|
secp256k1_scalar s = data->seckeys[(*seckey_offset+i) % POINTS];
|
||||||
|
secp256k1_scalar_mul(&s, &s, &data->scalars[(*scalar_offset+i) % POINTS]);
|
||||||
|
secp256k1_scalar_add(&sum_scalars, &sum_scalars, &s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
secp256k1_ecmult_gen(&data->ctx->ecmult_gen_ctx, &tmp, &sum_scalars);
|
||||||
|
secp256k1_gej_neg(&tmp, &tmp);
|
||||||
|
secp256k1_gej_add_var(&tmp, &tmp, &sum_output, NULL);
|
||||||
|
CHECK(secp256k1_gej_is_infinity(&tmp));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bench_ecmult_setup(void* arg) {
|
||||||
|
bench_data* data = (bench_data*)arg;
|
||||||
|
/* Re-randomize offset to ensure that we're using different scalars and
|
||||||
|
* group elements in each run. */
|
||||||
|
hash_into_offset(data, data->offset1);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bench_ecmult_gen(void* arg, int iters) {
|
||||||
|
bench_data* data = (bench_data*)arg;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < iters; ++i) {
|
||||||
|
secp256k1_ecmult_gen(&data->ctx->ecmult_gen_ctx, &data->output[i], &data->scalars[(data->offset1+i) % POINTS]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bench_ecmult_gen_teardown(void* arg, int iters) {
|
||||||
|
bench_data* data = (bench_data*)arg;
|
||||||
|
bench_ecmult_teardown_helper(data, NULL, NULL, &data->offset1, iters);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bench_ecmult_const(void* arg, int iters) {
|
||||||
|
bench_data* data = (bench_data*)arg;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < iters; ++i) {
|
||||||
|
secp256k1_ecmult_const(&data->output[i], &data->pubkeys[(data->offset1+i) % POINTS], &data->scalars[(data->offset2+i) % POINTS], 256);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bench_ecmult_const_teardown(void* arg, int iters) {
|
||||||
|
bench_data* data = (bench_data*)arg;
|
||||||
|
bench_ecmult_teardown_helper(data, &data->offset1, &data->offset2, NULL, iters);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bench_ecmult_1(void* arg, int iters) {
|
||||||
|
bench_data* data = (bench_data*)arg;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < iters; ++i) {
|
||||||
|
secp256k1_ecmult(&data->ctx->ecmult_ctx, &data->output[i], &data->pubkeys_gej[(data->offset1+i) % POINTS], &data->scalars[(data->offset2+i) % POINTS], NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bench_ecmult_1_teardown(void* arg, int iters) {
|
||||||
|
bench_data* data = (bench_data*)arg;
|
||||||
|
bench_ecmult_teardown_helper(data, &data->offset1, &data->offset2, NULL, iters);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bench_ecmult_1g(void* arg, int iters) {
|
||||||
|
bench_data* data = (bench_data*)arg;
|
||||||
|
secp256k1_scalar zero;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
secp256k1_scalar_set_int(&zero, 0);
|
||||||
|
for (i = 0; i < iters; ++i) {
|
||||||
|
secp256k1_ecmult(&data->ctx->ecmult_ctx, &data->output[i], NULL, &zero, &data->scalars[(data->offset1+i) % POINTS]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bench_ecmult_1g_teardown(void* arg, int iters) {
|
||||||
|
bench_data* data = (bench_data*)arg;
|
||||||
|
bench_ecmult_teardown_helper(data, NULL, NULL, &data->offset1, iters);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bench_ecmult_2g(void* arg, int iters) {
|
||||||
|
bench_data* data = (bench_data*)arg;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < iters/2; ++i) {
|
||||||
|
secp256k1_ecmult(&data->ctx->ecmult_ctx, &data->output[i], &data->pubkeys_gej[(data->offset1+i) % POINTS], &data->scalars[(data->offset2+i) % POINTS], &data->scalars[(data->offset1+i) % POINTS]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bench_ecmult_2g_teardown(void* arg, int iters) {
|
||||||
|
bench_data* data = (bench_data*)arg;
|
||||||
|
bench_ecmult_teardown_helper(data, &data->offset1, &data->offset2, &data->offset1, iters/2);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void run_ecmult_bench(bench_data* data, int iters) {
|
||||||
|
char str[32];
|
||||||
|
sprintf(str, "ecmult_gen");
|
||||||
|
run_benchmark(str, bench_ecmult_gen, bench_ecmult_setup, bench_ecmult_gen_teardown, data, 10, iters);
|
||||||
|
sprintf(str, "ecmult_const");
|
||||||
|
run_benchmark(str, bench_ecmult_const, bench_ecmult_setup, bench_ecmult_const_teardown, data, 10, iters);
|
||||||
|
/* ecmult with non generator point */
|
||||||
|
sprintf(str, "ecmult 1");
|
||||||
|
run_benchmark(str, bench_ecmult_1, bench_ecmult_setup, bench_ecmult_1_teardown, data, 10, iters);
|
||||||
|
/* ecmult with generator point */
|
||||||
|
sprintf(str, "ecmult 1g");
|
||||||
|
run_benchmark(str, bench_ecmult_1g, bench_ecmult_setup, bench_ecmult_1g_teardown, data, 10, iters);
|
||||||
|
/* ecmult with generator and non-generator point. The reported time is per point. */
|
||||||
|
sprintf(str, "ecmult 2g");
|
||||||
|
run_benchmark(str, bench_ecmult_2g, bench_ecmult_setup, bench_ecmult_2g_teardown, data, 10, 2*iters);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int bench_ecmult_multi_callback(secp256k1_scalar* sc, secp256k1_ge* ge, size_t idx, void* arg) {
|
||||||
bench_data* data = (bench_data*)arg;
|
bench_data* data = (bench_data*)arg;
|
||||||
if (data->includes_g) ++idx;
|
if (data->includes_g) ++idx;
|
||||||
if (idx == 0) {
|
if (idx == 0) {
|
||||||
|
@ -54,7 +198,7 @@ static int bench_callback(secp256k1_scalar* sc, secp256k1_ge* ge, size_t idx, vo
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void bench_ecmult(void* arg, int iters) {
|
static void bench_ecmult_multi(void* arg, int iters) {
|
||||||
bench_data* data = (bench_data*)arg;
|
bench_data* data = (bench_data*)arg;
|
||||||
|
|
||||||
int includes_g = data->includes_g;
|
int includes_g = data->includes_g;
|
||||||
|
@ -63,19 +207,18 @@ static void bench_ecmult(void* arg, int iters) {
|
||||||
iters = iters / data->count;
|
iters = iters / data->count;
|
||||||
|
|
||||||
for (iter = 0; iter < iters; ++iter) {
|
for (iter = 0; iter < iters; ++iter) {
|
||||||
data->ecmult_multi(&data->ctx->error_callback, &data->ctx->ecmult_ctx, data->scratch, &data->output[iter], data->includes_g ? &data->scalars[data->offset1] : NULL, bench_callback, arg, count - includes_g);
|
data->ecmult_multi(&data->ctx->error_callback, &data->ctx->ecmult_ctx, data->scratch, &data->output[iter], data->includes_g ? &data->scalars[data->offset1] : NULL, bench_ecmult_multi_callback, arg, count - includes_g);
|
||||||
data->offset1 = (data->offset1 + count) % POINTS;
|
data->offset1 = (data->offset1 + count) % POINTS;
|
||||||
data->offset2 = (data->offset2 + count - 1) % POINTS;
|
data->offset2 = (data->offset2 + count - 1) % POINTS;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void bench_ecmult_setup(void* arg) {
|
static void bench_ecmult_multi_setup(void* arg) {
|
||||||
bench_data* data = (bench_data*)arg;
|
bench_data* data = (bench_data*)arg;
|
||||||
data->offset1 = (data->count * 0x537b7f6f + 0x8f66a481) % POINTS;
|
hash_into_offset(data, data->count);
|
||||||
data->offset2 = (data->count * 0x7f6f537b + 0x6a1a8f49) % POINTS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void bench_ecmult_teardown(void* arg, int iters) {
|
static void bench_ecmult_multi_teardown(void* arg, int iters) {
|
||||||
bench_data* data = (bench_data*)arg;
|
bench_data* data = (bench_data*)arg;
|
||||||
int iter;
|
int iter;
|
||||||
iters = iters / data->count;
|
iters = iters / data->count;
|
||||||
|
@ -89,7 +232,7 @@ static void bench_ecmult_teardown(void* arg, int iters) {
|
||||||
|
|
||||||
static void generate_scalar(uint32_t num, secp256k1_scalar* scalar) {
|
static void generate_scalar(uint32_t num, secp256k1_scalar* scalar) {
|
||||||
secp256k1_sha256 sha256;
|
secp256k1_sha256 sha256;
|
||||||
unsigned char c[11] = {'e', 'c', 'm', 'u', 'l', 't', 0, 0, 0, 0};
|
unsigned char c[10] = {'e', 'c', 'm', 'u', 'l', 't', 0, 0, 0, 0};
|
||||||
unsigned char buf[32];
|
unsigned char buf[32];
|
||||||
int overflow = 0;
|
int overflow = 0;
|
||||||
c[6] = num;
|
c[6] = num;
|
||||||
|
@ -103,7 +246,7 @@ static void generate_scalar(uint32_t num, secp256k1_scalar* scalar) {
|
||||||
CHECK(!overflow);
|
CHECK(!overflow);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void run_test(bench_data* data, size_t count, int includes_g, int num_iters) {
|
static void run_ecmult_multi_bench(bench_data* data, size_t count, int includes_g, int num_iters) {
|
||||||
char str[32];
|
char str[32];
|
||||||
static const secp256k1_scalar zero = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0);
|
static const secp256k1_scalar zero = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0);
|
||||||
size_t iters = 1 + num_iters / count;
|
size_t iters = 1 + num_iters / count;
|
||||||
|
@ -113,8 +256,7 @@ static void run_test(bench_data* data, size_t count, int includes_g, int num_ite
|
||||||
data->includes_g = includes_g;
|
data->includes_g = includes_g;
|
||||||
|
|
||||||
/* Compute (the negation of) the expected results directly. */
|
/* Compute (the negation of) the expected results directly. */
|
||||||
data->offset1 = (data->count * 0x537b7f6f + 0x8f66a481) % POINTS;
|
hash_into_offset(data, data->count);
|
||||||
data->offset2 = (data->count * 0x7f6f537b + 0x6a1a8f49) % POINTS;
|
|
||||||
for (iter = 0; iter < iters; ++iter) {
|
for (iter = 0; iter < iters; ++iter) {
|
||||||
secp256k1_scalar tmp;
|
secp256k1_scalar tmp;
|
||||||
secp256k1_scalar total = data->scalars[(data->offset1++) % POINTS];
|
secp256k1_scalar total = data->scalars[(data->offset1++) % POINTS];
|
||||||
|
@ -128,25 +270,26 @@ static void run_test(bench_data* data, size_t count, int includes_g, int num_ite
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Run the benchmark. */
|
/* Run the benchmark. */
|
||||||
sprintf(str, includes_g ? "ecmult_%ig" : "ecmult_%i", (int)count);
|
sprintf(str, includes_g ? "ecmult_multi %ig" : "ecmult_multi %i", (int)count);
|
||||||
run_benchmark(str, bench_ecmult, bench_ecmult_setup, bench_ecmult_teardown, data, 10, count * iters);
|
run_benchmark(str, bench_ecmult_multi, bench_ecmult_multi_setup, bench_ecmult_multi_teardown, data, 10, count * iters);
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
bench_data data;
|
bench_data data;
|
||||||
int i, p;
|
int i, p;
|
||||||
secp256k1_gej* pubkeys_gej;
|
|
||||||
size_t scratch_size;
|
size_t scratch_size;
|
||||||
|
|
||||||
int iters = get_iters(10000);
|
int iters = get_iters(10000);
|
||||||
|
|
||||||
data.ctx = secp256k1_context_create(SECP256K1_CONTEXT_SIGN | SECP256K1_CONTEXT_VERIFY);
|
|
||||||
scratch_size = secp256k1_strauss_scratch_size(POINTS) + STRAUSS_SCRATCH_OBJECTS*16;
|
|
||||||
data.scratch = secp256k1_scratch_space_create(data.ctx, scratch_size);
|
|
||||||
data.ecmult_multi = secp256k1_ecmult_multi_var;
|
data.ecmult_multi = secp256k1_ecmult_multi_var;
|
||||||
|
|
||||||
if (argc > 1) {
|
if (argc > 1) {
|
||||||
if(have_flag(argc, argv, "pippenger_wnaf")) {
|
if(have_flag(argc, argv, "-h")
|
||||||
|
|| have_flag(argc, argv, "--help")
|
||||||
|
|| have_flag(argc, argv, "help")) {
|
||||||
|
help(argv);
|
||||||
|
return 1;
|
||||||
|
} else if(have_flag(argc, argv, "pippenger_wnaf")) {
|
||||||
printf("Using pippenger_wnaf:\n");
|
printf("Using pippenger_wnaf:\n");
|
||||||
data.ecmult_multi = secp256k1_ecmult_pippenger_batch_single;
|
data.ecmult_multi = secp256k1_ecmult_pippenger_batch_single;
|
||||||
} else if(have_flag(argc, argv, "strauss_wnaf")) {
|
} else if(have_flag(argc, argv, "strauss_wnaf")) {
|
||||||
|
@ -154,39 +297,48 @@ int main(int argc, char **argv) {
|
||||||
data.ecmult_multi = secp256k1_ecmult_strauss_batch_single;
|
data.ecmult_multi = secp256k1_ecmult_strauss_batch_single;
|
||||||
} else if(have_flag(argc, argv, "simple")) {
|
} else if(have_flag(argc, argv, "simple")) {
|
||||||
printf("Using simple algorithm:\n");
|
printf("Using simple algorithm:\n");
|
||||||
data.ecmult_multi = secp256k1_ecmult_multi_var;
|
|
||||||
secp256k1_scratch_space_destroy(data.ctx, data.scratch);
|
|
||||||
data.scratch = NULL;
|
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: unrecognized argument '%s'.\n", argv[0], argv[1]);
|
fprintf(stderr, "%s: unrecognized argument '%s'.\n\n", argv[0], argv[1]);
|
||||||
fprintf(stderr, "Use 'pippenger_wnaf', 'strauss_wnaf', 'simple' or no argument to benchmark a combined algorithm.\n");
|
help(argv);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
data.ctx = secp256k1_context_create(SECP256K1_CONTEXT_SIGN | SECP256K1_CONTEXT_VERIFY);
|
||||||
|
scratch_size = secp256k1_strauss_scratch_size(POINTS) + STRAUSS_SCRATCH_OBJECTS*16;
|
||||||
|
if (!have_flag(argc, argv, "simple")) {
|
||||||
|
data.scratch = secp256k1_scratch_space_create(data.ctx, scratch_size);
|
||||||
|
} else {
|
||||||
|
data.scratch = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/* Allocate stuff */
|
/* Allocate stuff */
|
||||||
data.scalars = malloc(sizeof(secp256k1_scalar) * POINTS);
|
data.scalars = malloc(sizeof(secp256k1_scalar) * POINTS);
|
||||||
data.seckeys = malloc(sizeof(secp256k1_scalar) * POINTS);
|
data.seckeys = malloc(sizeof(secp256k1_scalar) * POINTS);
|
||||||
data.pubkeys = malloc(sizeof(secp256k1_ge) * POINTS);
|
data.pubkeys = malloc(sizeof(secp256k1_ge) * POINTS);
|
||||||
|
data.pubkeys_gej = malloc(sizeof(secp256k1_gej) * POINTS);
|
||||||
data.expected_output = malloc(sizeof(secp256k1_gej) * (iters + 1));
|
data.expected_output = malloc(sizeof(secp256k1_gej) * (iters + 1));
|
||||||
data.output = malloc(sizeof(secp256k1_gej) * (iters + 1));
|
data.output = malloc(sizeof(secp256k1_gej) * (iters + 1));
|
||||||
|
|
||||||
/* Generate a set of scalars, and private/public keypairs. */
|
/* Generate a set of scalars, and private/public keypairs. */
|
||||||
pubkeys_gej = malloc(sizeof(secp256k1_gej) * POINTS);
|
secp256k1_gej_set_ge(&data.pubkeys_gej[0], &secp256k1_ge_const_g);
|
||||||
secp256k1_gej_set_ge(&pubkeys_gej[0], &secp256k1_ge_const_g);
|
|
||||||
secp256k1_scalar_set_int(&data.seckeys[0], 1);
|
secp256k1_scalar_set_int(&data.seckeys[0], 1);
|
||||||
for (i = 0; i < POINTS; ++i) {
|
for (i = 0; i < POINTS; ++i) {
|
||||||
generate_scalar(i, &data.scalars[i]);
|
generate_scalar(i, &data.scalars[i]);
|
||||||
if (i) {
|
if (i) {
|
||||||
secp256k1_gej_double_var(&pubkeys_gej[i], &pubkeys_gej[i - 1], NULL);
|
secp256k1_gej_double_var(&data.pubkeys_gej[i], &data.pubkeys_gej[i - 1], NULL);
|
||||||
secp256k1_scalar_add(&data.seckeys[i], &data.seckeys[i - 1], &data.seckeys[i - 1]);
|
secp256k1_scalar_add(&data.seckeys[i], &data.seckeys[i - 1], &data.seckeys[i - 1]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
secp256k1_ge_set_all_gej_var(data.pubkeys, pubkeys_gej, POINTS);
|
secp256k1_ge_set_all_gej_var(data.pubkeys, data.pubkeys_gej, POINTS);
|
||||||
free(pubkeys_gej);
|
|
||||||
|
|
||||||
|
/* Initialize offset1 and offset2 */
|
||||||
|
hash_into_offset(&data, 0);
|
||||||
|
run_ecmult_bench(&data, iters);
|
||||||
|
|
||||||
for (i = 1; i <= 8; ++i) {
|
for (i = 1; i <= 8; ++i) {
|
||||||
run_test(&data, i, 1, iters);
|
run_ecmult_multi_bench(&data, i, 1, iters);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* This is disabled with low count of iterations because the loop runs 77 times even with iters=1
|
/* This is disabled with low count of iterations because the loop runs 77 times even with iters=1
|
||||||
|
@ -195,7 +347,7 @@ int main(int argc, char **argv) {
|
||||||
if (iters > 2) {
|
if (iters > 2) {
|
||||||
for (p = 0; p <= 11; ++p) {
|
for (p = 0; p <= 11; ++p) {
|
||||||
for (i = 9; i <= 16; ++i) {
|
for (i = 9; i <= 16; ++i) {
|
||||||
run_test(&data, i << p, 1, iters);
|
run_ecmult_multi_bench(&data, i << p, 1, iters);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -206,6 +358,7 @@ int main(int argc, char **argv) {
|
||||||
secp256k1_context_destroy(data.ctx);
|
secp256k1_context_destroy(data.ctx);
|
||||||
free(data.scalars);
|
free(data.scalars);
|
||||||
free(data.pubkeys);
|
free(data.pubkeys);
|
||||||
|
free(data.pubkeys_gej);
|
||||||
free(data.seckeys);
|
free(data.seckeys);
|
||||||
free(data.output);
|
free(data.output);
|
||||||
free(data.expected_output);
|
free(data.expected_output);
|
||||||
|
|
|
@ -5,19 +5,18 @@
|
||||||
***********************************************************************/
|
***********************************************************************/
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "include/secp256k1.h"
|
#include "secp256k1.c"
|
||||||
|
#include "../include/secp256k1.h"
|
||||||
|
|
||||||
#include "assumptions.h"
|
#include "assumptions.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "hash_impl.h"
|
#include "hash_impl.h"
|
||||||
#include "num_impl.h"
|
|
||||||
#include "field_impl.h"
|
#include "field_impl.h"
|
||||||
#include "group_impl.h"
|
#include "group_impl.h"
|
||||||
#include "scalar_impl.h"
|
#include "scalar_impl.h"
|
||||||
#include "ecmult_const_impl.h"
|
#include "ecmult_const_impl.h"
|
||||||
#include "ecmult_impl.h"
|
#include "ecmult_impl.h"
|
||||||
#include "bench.h"
|
#include "bench.h"
|
||||||
#include "secp256k1.c"
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
secp256k1_scalar scalar[2];
|
secp256k1_scalar scalar[2];
|
||||||
|
@ -99,15 +98,6 @@ void bench_scalar_negate(void* arg, int iters) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void bench_scalar_sqr(void* arg, int iters) {
|
|
||||||
int i;
|
|
||||||
bench_inv *data = (bench_inv*)arg;
|
|
||||||
|
|
||||||
for (i = 0; i < iters; i++) {
|
|
||||||
secp256k1_scalar_sqr(&data->scalar[0], &data->scalar[0]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void bench_scalar_mul(void* arg, int iters) {
|
void bench_scalar_mul(void* arg, int iters) {
|
||||||
int i;
|
int i;
|
||||||
bench_inv *data = (bench_inv*)arg;
|
bench_inv *data = (bench_inv*)arg;
|
||||||
|
@ -255,26 +245,6 @@ void bench_group_add_affine_var(void* arg, int iters) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void bench_group_jacobi_var(void* arg, int iters) {
|
|
||||||
int i, j = 0;
|
|
||||||
bench_inv *data = (bench_inv*)arg;
|
|
||||||
|
|
||||||
for (i = 0; i < iters; i++) {
|
|
||||||
j += secp256k1_gej_has_quad_y_var(&data->gej[0]);
|
|
||||||
/* Vary the Y and Z coordinates of the input (the X coordinate doesn't matter to
|
|
||||||
secp256k1_gej_has_quad_y_var). Note that the resulting coordinates will
|
|
||||||
generally not correspond to a point on the curve, but this is not a problem
|
|
||||||
for the code being benchmarked here. Adding and normalizing have less
|
|
||||||
overhead than EC operations (which could guarantee the point remains on the
|
|
||||||
curve). */
|
|
||||||
secp256k1_fe_add(&data->gej[0].y, &data->fe[1]);
|
|
||||||
secp256k1_fe_add(&data->gej[0].z, &data->fe[2]);
|
|
||||||
secp256k1_fe_normalize_var(&data->gej[0].y);
|
|
||||||
secp256k1_fe_normalize_var(&data->gej[0].z);
|
|
||||||
}
|
|
||||||
CHECK(j <= iters);
|
|
||||||
}
|
|
||||||
|
|
||||||
void bench_group_to_affine_var(void* arg, int iters) {
|
void bench_group_to_affine_var(void* arg, int iters) {
|
||||||
int i;
|
int i;
|
||||||
bench_inv *data = (bench_inv*)arg;
|
bench_inv *data = (bench_inv*)arg;
|
||||||
|
@ -282,8 +252,10 @@ void bench_group_to_affine_var(void* arg, int iters) {
|
||||||
for (i = 0; i < iters; ++i) {
|
for (i = 0; i < iters; ++i) {
|
||||||
secp256k1_ge_set_gej_var(&data->ge[1], &data->gej[0]);
|
secp256k1_ge_set_gej_var(&data->ge[1], &data->gej[0]);
|
||||||
/* Use the output affine X/Y coordinates to vary the input X/Y/Z coordinates.
|
/* Use the output affine X/Y coordinates to vary the input X/Y/Z coordinates.
|
||||||
Similar to bench_group_jacobi_var, this approach does not result in
|
Note that the resulting coordinates will generally not correspond to a point
|
||||||
coordinates of points on the curve. */
|
on the curve, but this is not a problem for the code being benchmarked here.
|
||||||
|
Adding and normalizing have less overhead than EC operations (which could
|
||||||
|
guarantee the point remains on the curve). */
|
||||||
secp256k1_fe_add(&data->gej[0].x, &data->ge[1].y);
|
secp256k1_fe_add(&data->gej[0].x, &data->ge[1].y);
|
||||||
secp256k1_fe_add(&data->gej[0].y, &data->fe[2]);
|
secp256k1_fe_add(&data->gej[0].y, &data->fe[2]);
|
||||||
secp256k1_fe_add(&data->gej[0].z, &data->ge[1].x);
|
secp256k1_fe_add(&data->gej[0].z, &data->ge[1].x);
|
||||||
|
@ -369,35 +341,16 @@ void bench_context_sign(void* arg, int iters) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef USE_NUM_NONE
|
|
||||||
void bench_num_jacobi(void* arg, int iters) {
|
|
||||||
int i, j = 0;
|
|
||||||
bench_inv *data = (bench_inv*)arg;
|
|
||||||
secp256k1_num nx, na, norder;
|
|
||||||
|
|
||||||
secp256k1_scalar_get_num(&nx, &data->scalar[0]);
|
|
||||||
secp256k1_scalar_order_get_num(&norder);
|
|
||||||
secp256k1_scalar_get_num(&na, &data->scalar[1]);
|
|
||||||
|
|
||||||
for (i = 0; i < iters; i++) {
|
|
||||||
j += secp256k1_num_jacobi(&nx, &norder);
|
|
||||||
secp256k1_num_add(&nx, &nx, &na);
|
|
||||||
}
|
|
||||||
CHECK(j <= iters);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
bench_inv data;
|
bench_inv data;
|
||||||
int iters = get_iters(20000);
|
int iters = get_iters(20000);
|
||||||
|
|
||||||
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "add")) run_benchmark("scalar_add", bench_scalar_add, bench_setup, NULL, &data, 10, iters*100);
|
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "add")) run_benchmark("scalar_add", bench_scalar_add, bench_setup, NULL, &data, 10, iters*100);
|
||||||
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "negate")) run_benchmark("scalar_negate", bench_scalar_negate, bench_setup, NULL, &data, 10, iters*100);
|
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "negate")) run_benchmark("scalar_negate", bench_scalar_negate, bench_setup, NULL, &data, 10, iters*100);
|
||||||
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "sqr")) run_benchmark("scalar_sqr", bench_scalar_sqr, bench_setup, NULL, &data, 10, iters*10);
|
|
||||||
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "mul")) run_benchmark("scalar_mul", bench_scalar_mul, bench_setup, NULL, &data, 10, iters*10);
|
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "mul")) run_benchmark("scalar_mul", bench_scalar_mul, bench_setup, NULL, &data, 10, iters*10);
|
||||||
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "split")) run_benchmark("scalar_split", bench_scalar_split, bench_setup, NULL, &data, 10, iters);
|
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "split")) run_benchmark("scalar_split", bench_scalar_split, bench_setup, NULL, &data, 10, iters);
|
||||||
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse", bench_scalar_inverse, bench_setup, NULL, &data, 10, 2000);
|
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse", bench_scalar_inverse, bench_setup, NULL, &data, 10, iters);
|
||||||
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse_var", bench_scalar_inverse_var, bench_setup, NULL, &data, 10, 2000);
|
if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse_var", bench_scalar_inverse_var, bench_setup, NULL, &data, 10, iters);
|
||||||
|
|
||||||
if (have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize", bench_field_normalize, bench_setup, NULL, &data, 10, iters*100);
|
if (have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize", bench_field_normalize, bench_setup, NULL, &data, 10, iters*100);
|
||||||
if (have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize_weak", bench_field_normalize_weak, bench_setup, NULL, &data, 10, iters*100);
|
if (have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize_weak", bench_field_normalize_weak, bench_setup, NULL, &data, 10, iters*100);
|
||||||
|
@ -411,7 +364,6 @@ int main(int argc, char **argv) {
|
||||||
if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_var", bench_group_add_var, bench_setup, NULL, &data, 10, iters*10);
|
if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_var", bench_group_add_var, bench_setup, NULL, &data, 10, iters*10);
|
||||||
if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine", bench_group_add_affine, bench_setup, NULL, &data, 10, iters*10);
|
if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine", bench_group_add_affine, bench_setup, NULL, &data, 10, iters*10);
|
||||||
if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine_var", bench_group_add_affine_var, bench_setup, NULL, &data, 10, iters*10);
|
if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine_var", bench_group_add_affine_var, bench_setup, NULL, &data, 10, iters*10);
|
||||||
if (have_flag(argc, argv, "group") || have_flag(argc, argv, "jacobi")) run_benchmark("group_jacobi_var", bench_group_jacobi_var, bench_setup, NULL, &data, 10, iters);
|
|
||||||
if (have_flag(argc, argv, "group") || have_flag(argc, argv, "to_affine")) run_benchmark("group_to_affine_var", bench_group_to_affine_var, bench_setup, NULL, &data, 10, iters);
|
if (have_flag(argc, argv, "group") || have_flag(argc, argv, "to_affine")) run_benchmark("group_to_affine_var", bench_group_to_affine_var, bench_setup, NULL, &data, 10, iters);
|
||||||
|
|
||||||
if (have_flag(argc, argv, "ecmult") || have_flag(argc, argv, "wnaf")) run_benchmark("wnaf_const", bench_wnaf_const, bench_setup, NULL, &data, 10, iters);
|
if (have_flag(argc, argv, "ecmult") || have_flag(argc, argv, "wnaf")) run_benchmark("wnaf_const", bench_wnaf_const, bench_setup, NULL, &data, 10, iters);
|
||||||
|
@ -424,8 +376,5 @@ int main(int argc, char **argv) {
|
||||||
if (have_flag(argc, argv, "context") || have_flag(argc, argv, "verify")) run_benchmark("context_verify", bench_context_verify, bench_setup, NULL, &data, 10, 1 + iters/1000);
|
if (have_flag(argc, argv, "context") || have_flag(argc, argv, "verify")) run_benchmark("context_verify", bench_context_verify, bench_setup, NULL, &data, 10, 1 + iters/1000);
|
||||||
if (have_flag(argc, argv, "context") || have_flag(argc, argv, "sign")) run_benchmark("context_sign", bench_context_sign, bench_setup, NULL, &data, 10, 1 + iters/100);
|
if (have_flag(argc, argv, "context") || have_flag(argc, argv, "sign")) run_benchmark("context_sign", bench_context_sign, bench_setup, NULL, &data, 10, 1 + iters/100);
|
||||||
|
|
||||||
#ifndef USE_NUM_NONE
|
|
||||||
if (have_flag(argc, argv, "num") || have_flag(argc, argv, "jacobi")) run_benchmark("num_jacobi", bench_num_jacobi, bench_setup, NULL, &data, 10, iters*10);
|
|
||||||
#endif
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,8 +4,8 @@
|
||||||
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
||||||
***********************************************************************/
|
***********************************************************************/
|
||||||
|
|
||||||
#include "include/secp256k1.h"
|
#include "../include/secp256k1.h"
|
||||||
#include "include/secp256k1_recovery.h"
|
#include "../include/secp256k1_recovery.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "bench.h"
|
#include "bench.h"
|
||||||
|
|
||||||
|
|
|
@ -8,8 +8,8 @@
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
|
||||||
#include "include/secp256k1.h"
|
#include "../include/secp256k1.h"
|
||||||
#include "include/secp256k1_schnorrsig.h"
|
#include "../include/secp256k1_schnorrsig.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "bench.h"
|
#include "bench.h"
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
||||||
***********************************************************************/
|
***********************************************************************/
|
||||||
|
|
||||||
#include "include/secp256k1.h"
|
#include "../include/secp256k1.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "bench.h"
|
#include "bench.h"
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "include/secp256k1.h"
|
#include "../include/secp256k1.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "bench.h"
|
#include "bench.h"
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,6 @@
|
||||||
#ifndef SECP256K1_ECMULT_H
|
#ifndef SECP256K1_ECMULT_H
|
||||||
#define SECP256K1_ECMULT_H
|
#define SECP256K1_ECMULT_H
|
||||||
|
|
||||||
#include "num.h"
|
|
||||||
#include "group.h"
|
#include "group.h"
|
||||||
#include "scalar.h"
|
#include "scalar.h"
|
||||||
#include "scratch.h"
|
#include "scratch.h"
|
||||||
|
|
|
@ -43,13 +43,12 @@ static void secp256k1_fe_normalize_weak(secp256k1_fe *r);
|
||||||
/** Normalize a field element, without constant-time guarantee. */
|
/** Normalize a field element, without constant-time guarantee. */
|
||||||
static void secp256k1_fe_normalize_var(secp256k1_fe *r);
|
static void secp256k1_fe_normalize_var(secp256k1_fe *r);
|
||||||
|
|
||||||
/** Verify whether a field element represents zero i.e. would normalize to a zero value. The field
|
/** Verify whether a field element represents zero i.e. would normalize to a zero value. */
|
||||||
* implementation may optionally normalize the input, but this should not be relied upon. */
|
static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r);
|
||||||
static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r);
|
|
||||||
|
|
||||||
/** Verify whether a field element represents zero i.e. would normalize to a zero value. The field
|
/** Verify whether a field element represents zero i.e. would normalize to a zero value,
|
||||||
* implementation may optionally normalize the input, but this should not be relied upon. */
|
* without constant-time guarantee. */
|
||||||
static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r);
|
static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r);
|
||||||
|
|
||||||
/** Set a field element equal to a small integer. Resulting field element is normalized. */
|
/** Set a field element equal to a small integer. Resulting field element is normalized. */
|
||||||
static void secp256k1_fe_set_int(secp256k1_fe *r, int a);
|
static void secp256k1_fe_set_int(secp256k1_fe *r, int a);
|
||||||
|
@ -104,9 +103,6 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a);
|
||||||
* itself. */
|
* itself. */
|
||||||
static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a);
|
static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a);
|
||||||
|
|
||||||
/** Checks whether a field element is a quadratic residue. */
|
|
||||||
static int secp256k1_fe_is_quad_var(const secp256k1_fe *a);
|
|
||||||
|
|
||||||
/** Sets a field element to be the (modular) inverse of another. Requires the input's magnitude to be
|
/** Sets a field element to be the (modular) inverse of another. Requires the input's magnitude to be
|
||||||
* at most 8. The output magnitude is 1 (but not guaranteed to be normalized). */
|
* at most 8. The output magnitude is 1 (but not guaranteed to be normalized). */
|
||||||
static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a);
|
static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a);
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
|
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "field.h"
|
#include "field.h"
|
||||||
|
#include "modinv32_impl.h"
|
||||||
|
|
||||||
#ifdef VERIFY
|
#ifdef VERIFY
|
||||||
static void secp256k1_fe_verify(const secp256k1_fe *a) {
|
static void secp256k1_fe_verify(const secp256k1_fe *a) {
|
||||||
|
@ -181,7 +182,7 @@ static void secp256k1_fe_normalize_var(secp256k1_fe *r) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {
|
static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r) {
|
||||||
uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
|
uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
|
||||||
t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];
|
t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];
|
||||||
|
|
||||||
|
@ -210,7 +211,7 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {
|
||||||
return (z0 == 0) | (z1 == 0x3FFFFFFUL);
|
return (z0 == 0) | (z1 == 0x3FFFFFFUL);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r) {
|
static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r) {
|
||||||
uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
|
uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
|
||||||
uint32_t z0, z1;
|
uint32_t z0, z1;
|
||||||
uint32_t x;
|
uint32_t x;
|
||||||
|
@ -1164,4 +1165,92 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void secp256k1_fe_from_signed30(secp256k1_fe *r, const secp256k1_modinv32_signed30 *a) {
|
||||||
|
const uint32_t M26 = UINT32_MAX >> 6;
|
||||||
|
const uint32_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4],
|
||||||
|
a5 = a->v[5], a6 = a->v[6], a7 = a->v[7], a8 = a->v[8];
|
||||||
|
|
||||||
|
/* The output from secp256k1_modinv32{_var} should be normalized to range [0,modulus), and
|
||||||
|
* have limbs in [0,2^30). The modulus is < 2^256, so the top limb must be below 2^(256-30*8).
|
||||||
|
*/
|
||||||
|
VERIFY_CHECK(a0 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a1 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a2 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a3 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a4 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a5 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a6 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a7 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a8 >> 16 == 0);
|
||||||
|
|
||||||
|
r->n[0] = a0 & M26;
|
||||||
|
r->n[1] = (a0 >> 26 | a1 << 4) & M26;
|
||||||
|
r->n[2] = (a1 >> 22 | a2 << 8) & M26;
|
||||||
|
r->n[3] = (a2 >> 18 | a3 << 12) & M26;
|
||||||
|
r->n[4] = (a3 >> 14 | a4 << 16) & M26;
|
||||||
|
r->n[5] = (a4 >> 10 | a5 << 20) & M26;
|
||||||
|
r->n[6] = (a5 >> 6 | a6 << 24) & M26;
|
||||||
|
r->n[7] = (a6 >> 2 ) & M26;
|
||||||
|
r->n[8] = (a6 >> 28 | a7 << 2) & M26;
|
||||||
|
r->n[9] = (a7 >> 24 | a8 << 6);
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
r->magnitude = 1;
|
||||||
|
r->normalized = 1;
|
||||||
|
secp256k1_fe_verify(r);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void secp256k1_fe_to_signed30(secp256k1_modinv32_signed30 *r, const secp256k1_fe *a) {
|
||||||
|
const uint32_t M30 = UINT32_MAX >> 2;
|
||||||
|
const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4],
|
||||||
|
a5 = a->n[5], a6 = a->n[6], a7 = a->n[7], a8 = a->n[8], a9 = a->n[9];
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
VERIFY_CHECK(a->normalized);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
r->v[0] = (a0 | a1 << 26) & M30;
|
||||||
|
r->v[1] = (a1 >> 4 | a2 << 22) & M30;
|
||||||
|
r->v[2] = (a2 >> 8 | a3 << 18) & M30;
|
||||||
|
r->v[3] = (a3 >> 12 | a4 << 14) & M30;
|
||||||
|
r->v[4] = (a4 >> 16 | a5 << 10) & M30;
|
||||||
|
r->v[5] = (a5 >> 20 | a6 << 6) & M30;
|
||||||
|
r->v[6] = (a6 >> 24 | a7 << 2
|
||||||
|
| a8 << 28) & M30;
|
||||||
|
r->v[7] = (a8 >> 2 | a9 << 24) & M30;
|
||||||
|
r->v[8] = a9 >> 6;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const secp256k1_modinv32_modinfo secp256k1_const_modinfo_fe = {
|
||||||
|
{{-0x3D1, -4, 0, 0, 0, 0, 0, 0, 65536}},
|
||||||
|
0x2DDACACFL
|
||||||
|
};
|
||||||
|
|
||||||
|
static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *x) {
|
||||||
|
secp256k1_fe tmp;
|
||||||
|
secp256k1_modinv32_signed30 s;
|
||||||
|
|
||||||
|
tmp = *x;
|
||||||
|
secp256k1_fe_normalize(&tmp);
|
||||||
|
secp256k1_fe_to_signed30(&s, &tmp);
|
||||||
|
secp256k1_modinv32(&s, &secp256k1_const_modinfo_fe);
|
||||||
|
secp256k1_fe_from_signed30(r, &s);
|
||||||
|
|
||||||
|
VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *x) {
|
||||||
|
secp256k1_fe tmp;
|
||||||
|
secp256k1_modinv32_signed30 s;
|
||||||
|
|
||||||
|
tmp = *x;
|
||||||
|
secp256k1_fe_normalize_var(&tmp);
|
||||||
|
secp256k1_fe_to_signed30(&s, &tmp);
|
||||||
|
secp256k1_modinv32_var(&s, &secp256k1_const_modinfo_fe);
|
||||||
|
secp256k1_fe_from_signed30(r, &s);
|
||||||
|
|
||||||
|
VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp));
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* SECP256K1_FIELD_REPR_IMPL_H */
|
#endif /* SECP256K1_FIELD_REPR_IMPL_H */
|
||||||
|
|
|
@ -13,6 +13,7 @@
|
||||||
|
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "field.h"
|
#include "field.h"
|
||||||
|
#include "modinv64_impl.h"
|
||||||
|
|
||||||
#if defined(USE_ASM_X86_64)
|
#if defined(USE_ASM_X86_64)
|
||||||
#include "field_5x52_asm_impl.h"
|
#include "field_5x52_asm_impl.h"
|
||||||
|
@ -161,7 +162,7 @@ static void secp256k1_fe_normalize_var(secp256k1_fe *r) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {
|
static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r) {
|
||||||
uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];
|
uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];
|
||||||
|
|
||||||
/* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */
|
/* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */
|
||||||
|
@ -184,7 +185,7 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {
|
||||||
return (z0 == 0) | (z1 == 0xFFFFFFFFFFFFFULL);
|
return (z0 == 0) | (z1 == 0xFFFFFFFFFFFFFULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r) {
|
static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r) {
|
||||||
uint64_t t0, t1, t2, t3, t4;
|
uint64_t t0, t1, t2, t3, t4;
|
||||||
uint64_t z0, z1;
|
uint64_t z0, z1;
|
||||||
uint64_t x;
|
uint64_t x;
|
||||||
|
@ -498,4 +499,80 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void secp256k1_fe_from_signed62(secp256k1_fe *r, const secp256k1_modinv64_signed62 *a) {
|
||||||
|
const uint64_t M52 = UINT64_MAX >> 12;
|
||||||
|
const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4];
|
||||||
|
|
||||||
|
/* The output from secp256k1_modinv64{_var} should be normalized to range [0,modulus), and
|
||||||
|
* have limbs in [0,2^62). The modulus is < 2^256, so the top limb must be below 2^(256-62*4).
|
||||||
|
*/
|
||||||
|
VERIFY_CHECK(a0 >> 62 == 0);
|
||||||
|
VERIFY_CHECK(a1 >> 62 == 0);
|
||||||
|
VERIFY_CHECK(a2 >> 62 == 0);
|
||||||
|
VERIFY_CHECK(a3 >> 62 == 0);
|
||||||
|
VERIFY_CHECK(a4 >> 8 == 0);
|
||||||
|
|
||||||
|
r->n[0] = a0 & M52;
|
||||||
|
r->n[1] = (a0 >> 52 | a1 << 10) & M52;
|
||||||
|
r->n[2] = (a1 >> 42 | a2 << 20) & M52;
|
||||||
|
r->n[3] = (a2 >> 32 | a3 << 30) & M52;
|
||||||
|
r->n[4] = (a3 >> 22 | a4 << 40);
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
r->magnitude = 1;
|
||||||
|
r->normalized = 1;
|
||||||
|
secp256k1_fe_verify(r);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void secp256k1_fe_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_fe *a) {
|
||||||
|
const uint64_t M62 = UINT64_MAX >> 2;
|
||||||
|
const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4];
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
VERIFY_CHECK(a->normalized);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
r->v[0] = (a0 | a1 << 52) & M62;
|
||||||
|
r->v[1] = (a1 >> 10 | a2 << 42) & M62;
|
||||||
|
r->v[2] = (a2 >> 20 | a3 << 32) & M62;
|
||||||
|
r->v[3] = (a3 >> 30 | a4 << 22) & M62;
|
||||||
|
r->v[4] = a4 >> 40;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const secp256k1_modinv64_modinfo secp256k1_const_modinfo_fe = {
|
||||||
|
{{-0x1000003D1LL, 0, 0, 0, 256}},
|
||||||
|
0x27C7F6E22DDACACFLL
|
||||||
|
};
|
||||||
|
|
||||||
|
static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *x) {
|
||||||
|
secp256k1_fe tmp;
|
||||||
|
secp256k1_modinv64_signed62 s;
|
||||||
|
|
||||||
|
tmp = *x;
|
||||||
|
secp256k1_fe_normalize(&tmp);
|
||||||
|
secp256k1_fe_to_signed62(&s, &tmp);
|
||||||
|
secp256k1_modinv64(&s, &secp256k1_const_modinfo_fe);
|
||||||
|
secp256k1_fe_from_signed62(r, &s);
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *x) {
|
||||||
|
secp256k1_fe tmp;
|
||||||
|
secp256k1_modinv64_signed62 s;
|
||||||
|
|
||||||
|
tmp = *x;
|
||||||
|
secp256k1_fe_normalize_var(&tmp);
|
||||||
|
secp256k1_fe_to_signed62(&s, &tmp);
|
||||||
|
secp256k1_modinv64_var(&s, &secp256k1_const_modinfo_fe);
|
||||||
|
secp256k1_fe_from_signed62(r, &s);
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* SECP256K1_FIELD_REPR_IMPL_H */
|
#endif /* SECP256K1_FIELD_REPR_IMPL_H */
|
||||||
|
|
|
@ -12,7 +12,6 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "num.h"
|
|
||||||
|
|
||||||
#if defined(SECP256K1_WIDEMUL_INT128)
|
#if defined(SECP256K1_WIDEMUL_INT128)
|
||||||
#include "field_5x52_impl.h"
|
#include "field_5x52_impl.h"
|
||||||
|
@ -136,158 +135,6 @@ static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a) {
|
||||||
return secp256k1_fe_equal(&t1, a);
|
return secp256k1_fe_equal(&t1, a);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) {
|
|
||||||
secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1;
|
|
||||||
int j;
|
|
||||||
|
|
||||||
/** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in
|
|
||||||
* { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
|
|
||||||
* [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
|
|
||||||
*/
|
|
||||||
|
|
||||||
secp256k1_fe_sqr(&x2, a);
|
|
||||||
secp256k1_fe_mul(&x2, &x2, a);
|
|
||||||
|
|
||||||
secp256k1_fe_sqr(&x3, &x2);
|
|
||||||
secp256k1_fe_mul(&x3, &x3, a);
|
|
||||||
|
|
||||||
x6 = x3;
|
|
||||||
for (j=0; j<3; j++) {
|
|
||||||
secp256k1_fe_sqr(&x6, &x6);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(&x6, &x6, &x3);
|
|
||||||
|
|
||||||
x9 = x6;
|
|
||||||
for (j=0; j<3; j++) {
|
|
||||||
secp256k1_fe_sqr(&x9, &x9);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(&x9, &x9, &x3);
|
|
||||||
|
|
||||||
x11 = x9;
|
|
||||||
for (j=0; j<2; j++) {
|
|
||||||
secp256k1_fe_sqr(&x11, &x11);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(&x11, &x11, &x2);
|
|
||||||
|
|
||||||
x22 = x11;
|
|
||||||
for (j=0; j<11; j++) {
|
|
||||||
secp256k1_fe_sqr(&x22, &x22);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(&x22, &x22, &x11);
|
|
||||||
|
|
||||||
x44 = x22;
|
|
||||||
for (j=0; j<22; j++) {
|
|
||||||
secp256k1_fe_sqr(&x44, &x44);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(&x44, &x44, &x22);
|
|
||||||
|
|
||||||
x88 = x44;
|
|
||||||
for (j=0; j<44; j++) {
|
|
||||||
secp256k1_fe_sqr(&x88, &x88);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(&x88, &x88, &x44);
|
|
||||||
|
|
||||||
x176 = x88;
|
|
||||||
for (j=0; j<88; j++) {
|
|
||||||
secp256k1_fe_sqr(&x176, &x176);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(&x176, &x176, &x88);
|
|
||||||
|
|
||||||
x220 = x176;
|
|
||||||
for (j=0; j<44; j++) {
|
|
||||||
secp256k1_fe_sqr(&x220, &x220);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(&x220, &x220, &x44);
|
|
||||||
|
|
||||||
x223 = x220;
|
|
||||||
for (j=0; j<3; j++) {
|
|
||||||
secp256k1_fe_sqr(&x223, &x223);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(&x223, &x223, &x3);
|
|
||||||
|
|
||||||
/* The final result is then assembled using a sliding window over the blocks. */
|
|
||||||
|
|
||||||
t1 = x223;
|
|
||||||
for (j=0; j<23; j++) {
|
|
||||||
secp256k1_fe_sqr(&t1, &t1);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(&t1, &t1, &x22);
|
|
||||||
for (j=0; j<5; j++) {
|
|
||||||
secp256k1_fe_sqr(&t1, &t1);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(&t1, &t1, a);
|
|
||||||
for (j=0; j<3; j++) {
|
|
||||||
secp256k1_fe_sqr(&t1, &t1);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(&t1, &t1, &x2);
|
|
||||||
for (j=0; j<2; j++) {
|
|
||||||
secp256k1_fe_sqr(&t1, &t1);
|
|
||||||
}
|
|
||||||
secp256k1_fe_mul(r, a, &t1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) {
|
|
||||||
#if defined(USE_FIELD_INV_BUILTIN)
|
|
||||||
secp256k1_fe_inv(r, a);
|
|
||||||
#elif defined(USE_FIELD_INV_NUM)
|
|
||||||
secp256k1_num n, m;
|
|
||||||
static const secp256k1_fe negone = SECP256K1_FE_CONST(
|
|
||||||
0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL,
|
|
||||||
0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL, 0xFFFFFC2EUL
|
|
||||||
);
|
|
||||||
/* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */
|
|
||||||
static const unsigned char prime[32] = {
|
|
||||||
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
|
|
||||||
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
|
|
||||||
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
|
|
||||||
0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F
|
|
||||||
};
|
|
||||||
unsigned char b[32];
|
|
||||||
int res;
|
|
||||||
secp256k1_fe c = *a;
|
|
||||||
secp256k1_fe_normalize_var(&c);
|
|
||||||
secp256k1_fe_get_b32(b, &c);
|
|
||||||
secp256k1_num_set_bin(&n, b, 32);
|
|
||||||
secp256k1_num_set_bin(&m, prime, 32);
|
|
||||||
secp256k1_num_mod_inverse(&n, &n, &m);
|
|
||||||
secp256k1_num_get_bin(b, 32, &n);
|
|
||||||
res = secp256k1_fe_set_b32(r, b);
|
|
||||||
(void)res;
|
|
||||||
VERIFY_CHECK(res);
|
|
||||||
/* Verify the result is the (unique) valid inverse using non-GMP code. */
|
|
||||||
secp256k1_fe_mul(&c, &c, r);
|
|
||||||
secp256k1_fe_add(&c, &negone);
|
|
||||||
CHECK(secp256k1_fe_normalizes_to_zero_var(&c));
|
|
||||||
#else
|
|
||||||
#error "Please select field inverse implementation"
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
static int secp256k1_fe_is_quad_var(const secp256k1_fe *a) {
|
|
||||||
#ifndef USE_NUM_NONE
|
|
||||||
unsigned char b[32];
|
|
||||||
secp256k1_num n;
|
|
||||||
secp256k1_num m;
|
|
||||||
/* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */
|
|
||||||
static const unsigned char prime[32] = {
|
|
||||||
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
|
|
||||||
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
|
|
||||||
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
|
|
||||||
0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F
|
|
||||||
};
|
|
||||||
|
|
||||||
secp256k1_fe c = *a;
|
|
||||||
secp256k1_fe_normalize_var(&c);
|
|
||||||
secp256k1_fe_get_b32(b, &c);
|
|
||||||
secp256k1_num_set_bin(&n, b, 32);
|
|
||||||
secp256k1_num_set_bin(&m, prime, 32);
|
|
||||||
return secp256k1_num_jacobi(&n, &m) >= 0;
|
|
||||||
#else
|
|
||||||
secp256k1_fe r;
|
|
||||||
return secp256k1_fe_sqrt(&r, a);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
static const secp256k1_fe secp256k1_fe_one = SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1);
|
static const secp256k1_fe secp256k1_fe_one = SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1);
|
||||||
|
|
||||||
#endif /* SECP256K1_FIELD_IMPL_H */
|
#endif /* SECP256K1_FIELD_IMPL_H */
|
||||||
|
|
|
@ -9,10 +9,16 @@
|
||||||
#if !defined(ECMULT_GEN_PREC_BITS)
|
#if !defined(ECMULT_GEN_PREC_BITS)
|
||||||
#include "libsecp256k1-config.h"
|
#include "libsecp256k1-config.h"
|
||||||
#endif
|
#endif
|
||||||
#define USE_BASIC_CONFIG 1
|
|
||||||
#include "basic-config.h"
|
|
||||||
|
|
||||||
#include "include/secp256k1.h"
|
/* We can't require the precomputed tables when creating them. */
|
||||||
|
#undef USE_ECMULT_STATIC_PRECOMPUTATION
|
||||||
|
|
||||||
|
/* In principle we could use external ASM, but this yields only a minor speedup in
|
||||||
|
build time and it's very complicated. In particular when cross-compiling, we'd
|
||||||
|
need to build the external ASM for the build and the host machine. */
|
||||||
|
#undef USE_EXTERNAL_ASM
|
||||||
|
|
||||||
|
#include "../include/secp256k1.h"
|
||||||
#include "assumptions.h"
|
#include "assumptions.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "field_impl.h"
|
#include "field_impl.h"
|
||||||
|
|
|
@ -7,7 +7,6 @@
|
||||||
#ifndef SECP256K1_GROUP_H
|
#ifndef SECP256K1_GROUP_H
|
||||||
#define SECP256K1_GROUP_H
|
#define SECP256K1_GROUP_H
|
||||||
|
|
||||||
#include "num.h"
|
|
||||||
#include "field.h"
|
#include "field.h"
|
||||||
|
|
||||||
/** A group element of the secp256k1 curve, in affine coordinates. */
|
/** A group element of the secp256k1 curve, in affine coordinates. */
|
||||||
|
@ -43,12 +42,6 @@ typedef struct {
|
||||||
/** Set a group element equal to the point with given X and Y coordinates */
|
/** Set a group element equal to the point with given X and Y coordinates */
|
||||||
static void secp256k1_ge_set_xy(secp256k1_ge *r, const secp256k1_fe *x, const secp256k1_fe *y);
|
static void secp256k1_ge_set_xy(secp256k1_ge *r, const secp256k1_fe *x, const secp256k1_fe *y);
|
||||||
|
|
||||||
/** Set a group element (affine) equal to the point with the given X coordinate
|
|
||||||
* and a Y coordinate that is a quadratic residue modulo p. The return value
|
|
||||||
* is true iff a coordinate with the given X coordinate exists.
|
|
||||||
*/
|
|
||||||
static int secp256k1_ge_set_xquad(secp256k1_ge *r, const secp256k1_fe *x);
|
|
||||||
|
|
||||||
/** Set a group element (affine) equal to the point with the given X coordinate, and given oddness
|
/** Set a group element (affine) equal to the point with the given X coordinate, and given oddness
|
||||||
* for Y. Return value indicates whether the result is valid. */
|
* for Y. Return value indicates whether the result is valid. */
|
||||||
static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd);
|
static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd);
|
||||||
|
@ -96,9 +89,6 @@ static void secp256k1_gej_neg(secp256k1_gej *r, const secp256k1_gej *a);
|
||||||
/** Check whether a group element is the point at infinity. */
|
/** Check whether a group element is the point at infinity. */
|
||||||
static int secp256k1_gej_is_infinity(const secp256k1_gej *a);
|
static int secp256k1_gej_is_infinity(const secp256k1_gej *a);
|
||||||
|
|
||||||
/** Check whether a group element's y coordinate is a quadratic residue. */
|
|
||||||
static int secp256k1_gej_has_quad_y_var(const secp256k1_gej *a);
|
|
||||||
|
|
||||||
/** Set r equal to the double of a. Constant time. */
|
/** Set r equal to the double of a. Constant time. */
|
||||||
static void secp256k1_gej_double(secp256k1_gej *r, const secp256k1_gej *a);
|
static void secp256k1_gej_double(secp256k1_gej *r, const secp256k1_gej *a);
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,6 @@
|
||||||
#ifndef SECP256K1_GROUP_IMPL_H
|
#ifndef SECP256K1_GROUP_IMPL_H
|
||||||
#define SECP256K1_GROUP_IMPL_H
|
#define SECP256K1_GROUP_IMPL_H
|
||||||
|
|
||||||
#include "num.h"
|
|
||||||
#include "field.h"
|
#include "field.h"
|
||||||
#include "group.h"
|
#include "group.h"
|
||||||
|
|
||||||
|
@ -101,8 +100,8 @@ static void secp256k1_ge_set_gej(secp256k1_ge *r, secp256k1_gej *a) {
|
||||||
|
|
||||||
static void secp256k1_ge_set_gej_var(secp256k1_ge *r, secp256k1_gej *a) {
|
static void secp256k1_ge_set_gej_var(secp256k1_ge *r, secp256k1_gej *a) {
|
||||||
secp256k1_fe z2, z3;
|
secp256k1_fe z2, z3;
|
||||||
r->infinity = a->infinity;
|
|
||||||
if (a->infinity) {
|
if (a->infinity) {
|
||||||
|
secp256k1_ge_set_infinity(r);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
secp256k1_fe_inv_var(&a->z, &a->z);
|
secp256k1_fe_inv_var(&a->z, &a->z);
|
||||||
|
@ -111,8 +110,7 @@ static void secp256k1_ge_set_gej_var(secp256k1_ge *r, secp256k1_gej *a) {
|
||||||
secp256k1_fe_mul(&a->x, &a->x, &z2);
|
secp256k1_fe_mul(&a->x, &a->x, &z2);
|
||||||
secp256k1_fe_mul(&a->y, &a->y, &z3);
|
secp256k1_fe_mul(&a->y, &a->y, &z3);
|
||||||
secp256k1_fe_set_int(&a->z, 1);
|
secp256k1_fe_set_int(&a->z, 1);
|
||||||
r->x = a->x;
|
secp256k1_ge_set_xy(r, &a->x, &a->y);
|
||||||
r->y = a->y;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a, size_t len) {
|
static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a, size_t len) {
|
||||||
|
@ -121,7 +119,9 @@ static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a
|
||||||
size_t last_i = SIZE_MAX;
|
size_t last_i = SIZE_MAX;
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
for (i = 0; i < len; i++) {
|
||||||
if (!a[i].infinity) {
|
if (a[i].infinity) {
|
||||||
|
secp256k1_ge_set_infinity(&r[i]);
|
||||||
|
} else {
|
||||||
/* Use destination's x coordinates as scratch space */
|
/* Use destination's x coordinates as scratch space */
|
||||||
if (last_i == SIZE_MAX) {
|
if (last_i == SIZE_MAX) {
|
||||||
r[i].x = a[i].z;
|
r[i].x = a[i].z;
|
||||||
|
@ -149,7 +149,6 @@ static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a
|
||||||
r[last_i].x = u;
|
r[last_i].x = u;
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
for (i = 0; i < len; i++) {
|
||||||
r[i].infinity = a[i].infinity;
|
|
||||||
if (!a[i].infinity) {
|
if (!a[i].infinity) {
|
||||||
secp256k1_ge_set_gej_zinv(&r[i], &a[i], &r[i].x);
|
secp256k1_ge_set_gej_zinv(&r[i], &a[i], &r[i].x);
|
||||||
}
|
}
|
||||||
|
@ -207,18 +206,14 @@ static void secp256k1_ge_clear(secp256k1_ge *r) {
|
||||||
secp256k1_fe_clear(&r->y);
|
secp256k1_fe_clear(&r->y);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int secp256k1_ge_set_xquad(secp256k1_ge *r, const secp256k1_fe *x) {
|
static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd) {
|
||||||
secp256k1_fe x2, x3;
|
secp256k1_fe x2, x3;
|
||||||
r->x = *x;
|
r->x = *x;
|
||||||
secp256k1_fe_sqr(&x2, x);
|
secp256k1_fe_sqr(&x2, x);
|
||||||
secp256k1_fe_mul(&x3, x, &x2);
|
secp256k1_fe_mul(&x3, x, &x2);
|
||||||
r->infinity = 0;
|
r->infinity = 0;
|
||||||
secp256k1_fe_add(&x3, &secp256k1_fe_const_b);
|
secp256k1_fe_add(&x3, &secp256k1_fe_const_b);
|
||||||
return secp256k1_fe_sqrt(&r->y, &x3);
|
if (!secp256k1_fe_sqrt(&r->y, &x3)) {
|
||||||
}
|
|
||||||
|
|
||||||
static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd) {
|
|
||||||
if (!secp256k1_ge_set_xquad(r, x)) {
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
secp256k1_fe_normalize_var(&r->y);
|
secp256k1_fe_normalize_var(&r->y);
|
||||||
|
@ -316,7 +311,7 @@ static void secp256k1_gej_double_var(secp256k1_gej *r, const secp256k1_gej *a, s
|
||||||
* point will be gibberish (z = 0 but infinity = 0).
|
* point will be gibberish (z = 0 but infinity = 0).
|
||||||
*/
|
*/
|
||||||
if (a->infinity) {
|
if (a->infinity) {
|
||||||
r->infinity = 1;
|
secp256k1_gej_set_infinity(r);
|
||||||
if (rzr != NULL) {
|
if (rzr != NULL) {
|
||||||
secp256k1_fe_set_int(rzr, 1);
|
secp256k1_fe_set_int(rzr, 1);
|
||||||
}
|
}
|
||||||
|
@ -591,7 +586,7 @@ static void secp256k1_gej_add_ge(secp256k1_gej *r, const secp256k1_gej *a, const
|
||||||
secp256k1_fe_cmov(&n, &m, degenerate); /* n = M^3 * Malt (2) */
|
secp256k1_fe_cmov(&n, &m, degenerate); /* n = M^3 * Malt (2) */
|
||||||
secp256k1_fe_sqr(&t, &rr_alt); /* t = Ralt^2 (1) */
|
secp256k1_fe_sqr(&t, &rr_alt); /* t = Ralt^2 (1) */
|
||||||
secp256k1_fe_mul(&r->z, &a->z, &m_alt); /* r->z = Malt*Z (1) */
|
secp256k1_fe_mul(&r->z, &a->z, &m_alt); /* r->z = Malt*Z (1) */
|
||||||
infinity = secp256k1_fe_normalizes_to_zero(&r->z) * (1 - a->infinity);
|
infinity = secp256k1_fe_normalizes_to_zero(&r->z) & ~a->infinity;
|
||||||
secp256k1_fe_mul_int(&r->z, 2); /* r->z = Z3 = 2*Malt*Z (2) */
|
secp256k1_fe_mul_int(&r->z, 2); /* r->z = Z3 = 2*Malt*Z (2) */
|
||||||
secp256k1_fe_negate(&q, &q, 1); /* q = -Q (2) */
|
secp256k1_fe_negate(&q, &q, 1); /* q = -Q (2) */
|
||||||
secp256k1_fe_add(&t, &q); /* t = Ralt^2-Q (3) */
|
secp256k1_fe_add(&t, &q); /* t = Ralt^2-Q (3) */
|
||||||
|
@ -655,20 +650,6 @@ static void secp256k1_ge_mul_lambda(secp256k1_ge *r, const secp256k1_ge *a) {
|
||||||
secp256k1_fe_mul(&r->x, &r->x, &beta);
|
secp256k1_fe_mul(&r->x, &r->x, &beta);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int secp256k1_gej_has_quad_y_var(const secp256k1_gej *a) {
|
|
||||||
secp256k1_fe yz;
|
|
||||||
|
|
||||||
if (a->infinity) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* We rely on the fact that the Jacobi symbol of 1 / a->z^3 is the same as
|
|
||||||
* that of a->z. Thus a->y / a->z^3 is a quadratic residue iff a->y * a->z
|
|
||||||
is */
|
|
||||||
secp256k1_fe_mul(&yz, &a->y, &a->z);
|
|
||||||
return secp256k1_fe_is_quad_var(&yz);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int secp256k1_ge_is_in_correct_subgroup(const secp256k1_ge* ge) {
|
static int secp256k1_ge_is_in_correct_subgroup(const secp256k1_ge* ge) {
|
||||||
#ifdef EXHAUSTIVE_TEST_ORDER
|
#ifdef EXHAUSTIVE_TEST_ORDER
|
||||||
secp256k1_gej out;
|
secp256k1_gej out;
|
||||||
|
|
|
@ -0,0 +1,42 @@
|
||||||
|
/***********************************************************************
|
||||||
|
* Copyright (c) 2020 Peter Dettman *
|
||||||
|
* Distributed under the MIT software license, see the accompanying *
|
||||||
|
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
||||||
|
**********************************************************************/
|
||||||
|
|
||||||
|
#ifndef SECP256K1_MODINV32_H
|
||||||
|
#define SECP256K1_MODINV32_H
|
||||||
|
|
||||||
|
#if defined HAVE_CONFIG_H
|
||||||
|
#include "libsecp256k1-config.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "util.h"
|
||||||
|
|
||||||
|
/* A signed 30-bit limb representation of integers.
|
||||||
|
*
|
||||||
|
* Its value is sum(v[i] * 2^(30*i), i=0..8). */
|
||||||
|
typedef struct {
|
||||||
|
int32_t v[9];
|
||||||
|
} secp256k1_modinv32_signed30;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
/* The modulus in signed30 notation, must be odd and in [3, 2^256]. */
|
||||||
|
secp256k1_modinv32_signed30 modulus;
|
||||||
|
|
||||||
|
/* modulus^{-1} mod 2^30 */
|
||||||
|
uint32_t modulus_inv30;
|
||||||
|
} secp256k1_modinv32_modinfo;
|
||||||
|
|
||||||
|
/* Replace x with its modular inverse mod modinfo->modulus. x must be in range [0, modulus).
|
||||||
|
* If x is zero, the result will be zero as well. If not, the inverse must exist (i.e., the gcd of
|
||||||
|
* x and modulus must be 1). These rules are automatically satisfied if the modulus is prime.
|
||||||
|
*
|
||||||
|
* On output, all of x's limbs will be in [0, 2^30).
|
||||||
|
*/
|
||||||
|
static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo);
|
||||||
|
|
||||||
|
/* Same as secp256k1_modinv32_var, but constant time in x (not in the modulus). */
|
||||||
|
static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo);
|
||||||
|
|
||||||
|
#endif /* SECP256K1_MODINV32_H */
|
|
@ -0,0 +1,587 @@
|
||||||
|
/***********************************************************************
|
||||||
|
* Copyright (c) 2020 Peter Dettman *
|
||||||
|
* Distributed under the MIT software license, see the accompanying *
|
||||||
|
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
||||||
|
**********************************************************************/
|
||||||
|
|
||||||
|
#ifndef SECP256K1_MODINV32_IMPL_H
|
||||||
|
#define SECP256K1_MODINV32_IMPL_H
|
||||||
|
|
||||||
|
#include "modinv32.h"
|
||||||
|
|
||||||
|
#include "util.h"
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
/* This file implements modular inversion based on the paper "Fast constant-time gcd computation and
|
||||||
|
* modular inversion" by Daniel J. Bernstein and Bo-Yin Yang.
|
||||||
|
*
|
||||||
|
* For an explanation of the algorithm, see doc/safegcd_implementation.md. This file contains an
|
||||||
|
* implementation for N=30, using 30-bit signed limbs represented as int32_t.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
static const secp256k1_modinv32_signed30 SECP256K1_SIGNED30_ONE = {{1}};
|
||||||
|
|
||||||
|
/* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^30). */
|
||||||
|
static void secp256k1_modinv32_mul_30(secp256k1_modinv32_signed30 *r, const secp256k1_modinv32_signed30 *a, int alen, int32_t factor) {
|
||||||
|
const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
|
||||||
|
int64_t c = 0;
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < 8; ++i) {
|
||||||
|
if (i < alen) c += (int64_t)a->v[i] * factor;
|
||||||
|
r->v[i] = (int32_t)c & M30; c >>= 30;
|
||||||
|
}
|
||||||
|
if (8 < alen) c += (int64_t)a->v[8] * factor;
|
||||||
|
VERIFY_CHECK(c == (int32_t)c);
|
||||||
|
r->v[8] = (int32_t)c;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A consists of alen limbs; b has 9. */
|
||||||
|
static int secp256k1_modinv32_mul_cmp_30(const secp256k1_modinv32_signed30 *a, int alen, const secp256k1_modinv32_signed30 *b, int32_t factor) {
|
||||||
|
int i;
|
||||||
|
secp256k1_modinv32_signed30 am, bm;
|
||||||
|
secp256k1_modinv32_mul_30(&am, a, alen, 1); /* Normalize all but the top limb of a. */
|
||||||
|
secp256k1_modinv32_mul_30(&bm, b, 9, factor);
|
||||||
|
for (i = 0; i < 8; ++i) {
|
||||||
|
/* Verify that all but the top limb of a and b are normalized. */
|
||||||
|
VERIFY_CHECK(am.v[i] >> 30 == 0);
|
||||||
|
VERIFY_CHECK(bm.v[i] >> 30 == 0);
|
||||||
|
}
|
||||||
|
for (i = 8; i >= 0; --i) {
|
||||||
|
if (am.v[i] < bm.v[i]) return -1;
|
||||||
|
if (am.v[i] > bm.v[i]) return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Take as input a signed30 number in range (-2*modulus,modulus), and add a multiple of the modulus
 * to it to bring it to range [0,modulus). If sign < 0, the input will also be negated in the
 * process. The input must have limbs in range (-2^30,2^30). The output will have limbs in range
 * [0,2^30). */
static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int32_t sign, const secp256k1_modinv32_modinfo *modinfo) {
    /* M30 = 2^30 - 1: mask covering one full 30-bit limb. */
    const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
    int32_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4],
        r5 = r->v[5], r6 = r->v[6], r7 = r->v[7], r8 = r->v[8];
    /* All-ones (-1) or all-zero (0) masks, used for branchless constant-time selection. */
    int32_t cond_add, cond_negate;

#ifdef VERIFY
    /* Verify that all limbs are in range (-2^30,2^30). */
    int i;
    for (i = 0; i < 9; ++i) {
        VERIFY_CHECK(r->v[i] >= -M30);
        VERIFY_CHECK(r->v[i] <= M30);
    }
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, -2) > 0); /* r > -2*modulus */
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 1) < 0); /* r < modulus */
#endif

    /* In a first step, add the modulus if the input is negative, and then negate if requested.
     * This brings r from range (-2*modulus,modulus) to range (-modulus,modulus). As all input
     * limbs are in range (-2^30,2^30), this cannot overflow an int32_t. Note that the right
     * shifts below are signed sign-extending shifts (see assumptions.h for tests that that is
     * indeed the behavior of the right shift operator). */
    cond_add = r8 >> 31; /* -1 if r is negative (sign bit of top limb), 0 otherwise */
    r0 += modinfo->modulus.v[0] & cond_add;
    r1 += modinfo->modulus.v[1] & cond_add;
    r2 += modinfo->modulus.v[2] & cond_add;
    r3 += modinfo->modulus.v[3] & cond_add;
    r4 += modinfo->modulus.v[4] & cond_add;
    r5 += modinfo->modulus.v[5] & cond_add;
    r6 += modinfo->modulus.v[6] & cond_add;
    r7 += modinfo->modulus.v[7] & cond_add;
    r8 += modinfo->modulus.v[8] & cond_add;
    cond_negate = sign >> 31; /* -1 if negation requested, 0 otherwise */
    /* (x ^ -1) - (-1) == ~x + 1 == -x, while (x ^ 0) - 0 == x: branchless conditional negation. */
    r0 = (r0 ^ cond_negate) - cond_negate;
    r1 = (r1 ^ cond_negate) - cond_negate;
    r2 = (r2 ^ cond_negate) - cond_negate;
    r3 = (r3 ^ cond_negate) - cond_negate;
    r4 = (r4 ^ cond_negate) - cond_negate;
    r5 = (r5 ^ cond_negate) - cond_negate;
    r6 = (r6 ^ cond_negate) - cond_negate;
    r7 = (r7 ^ cond_negate) - cond_negate;
    r8 = (r8 ^ cond_negate) - cond_negate;
    /* Propagate the top bits, to bring limbs back to range (-2^30,2^30). */
    r1 += r0 >> 30; r0 &= M30;
    r2 += r1 >> 30; r1 &= M30;
    r3 += r2 >> 30; r2 &= M30;
    r4 += r3 >> 30; r3 &= M30;
    r5 += r4 >> 30; r4 &= M30;
    r6 += r5 >> 30; r5 &= M30;
    r7 += r6 >> 30; r6 &= M30;
    r8 += r7 >> 30; r7 &= M30;

    /* In a second step add the modulus again if the result is still negative, bringing r to range
     * [0,modulus). */
    cond_add = r8 >> 31;
    r0 += modinfo->modulus.v[0] & cond_add;
    r1 += modinfo->modulus.v[1] & cond_add;
    r2 += modinfo->modulus.v[2] & cond_add;
    r3 += modinfo->modulus.v[3] & cond_add;
    r4 += modinfo->modulus.v[4] & cond_add;
    r5 += modinfo->modulus.v[5] & cond_add;
    r6 += modinfo->modulus.v[6] & cond_add;
    r7 += modinfo->modulus.v[7] & cond_add;
    r8 += modinfo->modulus.v[8] & cond_add;
    /* And propagate again. */
    r1 += r0 >> 30; r0 &= M30;
    r2 += r1 >> 30; r1 &= M30;
    r3 += r2 >> 30; r2 &= M30;
    r4 += r3 >> 30; r3 &= M30;
    r5 += r4 >> 30; r4 &= M30;
    r6 += r5 >> 30; r5 &= M30;
    r7 += r6 >> 30; r6 &= M30;
    r8 += r7 >> 30; r7 &= M30;

    /* Write the fully normalized limbs back to r. */
    r->v[0] = r0;
    r->v[1] = r1;
    r->v[2] = r2;
    r->v[3] = r3;
    r->v[4] = r4;
    r->v[5] = r5;
    r->v[6] = r6;
    r->v[7] = r7;
    r->v[8] = r8;

#ifdef VERIFY
    /* All limbs are now in canonical range [0,2^30). */
    VERIFY_CHECK(r0 >> 30 == 0);
    VERIFY_CHECK(r1 >> 30 == 0);
    VERIFY_CHECK(r2 >> 30 == 0);
    VERIFY_CHECK(r3 >> 30 == 0);
    VERIFY_CHECK(r4 >> 30 == 0);
    VERIFY_CHECK(r5 >> 30 == 0);
    VERIFY_CHECK(r6 >> 30 == 0);
    VERIFY_CHECK(r7 >> 30 == 0);
    VERIFY_CHECK(r8 >> 30 == 0);
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 0) >= 0); /* r >= 0 */
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 1) < 0); /* r < modulus */
#endif
}
|
||||||
|
|
||||||
|
/* Data type for transition matrices (see section 3 of explanation).
 *
 * t = [ u v ]
 *     [ q r ]
 */
typedef struct {
    /* Matrix entries; semantically signed values satisfying |u|+|v| <= 2^30 and
     * |q|+|r| <= 2^30 (checked in secp256k1_modinv32_update_de_30). */
    int32_t u, v, q, r;
} secp256k1_modinv32_trans2x2;
|
||||||
|
|
||||||
|
/* Compute the transition matrix and zeta for 30 divsteps.
 *
 * Input:  zeta: initial zeta
 *         f0:   bottom limb of initial f
 *         g0:   bottom limb of initial g
 * Output: t: transition matrix
 * Return: final zeta
 *
 * Implements the divsteps_n_matrix function from the explanation.
 * Branchless: runs in time independent of the input values (constant time).
 */
static int32_t secp256k1_modinv32_divsteps_30(int32_t zeta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) {
    /* u,v,q,r are the elements of the transformation matrix being built up,
     * starting with the identity matrix. Semantically they are signed integers
     * in range [-2^30,2^30], but here represented as unsigned mod 2^32. This
     * permits left shifting (which is UB for negative numbers). The range
     * being inside [-2^31,2^31) means that casting to signed works correctly.
     */
    uint32_t u = 1, v = 0, q = 0, r = 1;
    uint32_t c1, c2, f = f0, g = g0, x, y, z;
    int i;

    for (i = 0; i < 30; ++i) {
        VERIFY_CHECK((f & 1) == 1); /* f must always be odd */
        VERIFY_CHECK((u * f0 + v * g0) == f << i);
        VERIFY_CHECK((q * f0 + r * g0) == g << i);
        /* Compute conditional masks for (zeta < 0) and for (g & 1). */
        c1 = zeta >> 31;  /* all-ones if zeta < 0, else all-zero */
        c2 = -(g & 1);    /* all-ones if g is odd, else all-zero */
        /* Compute x,y,z, conditionally negated versions of f,u,v. */
        x = (f ^ c1) - c1;
        y = (u ^ c1) - c1;
        z = (v ^ c1) - c1;
        /* Conditionally add x,y,z to g,q,r. */
        g += x & c2;
        q += y & c2;
        r += z & c2;
        /* In what follows, c1 is a condition mask for (zeta < 0) and (g & 1). */
        c1 &= c2;
        /* Conditionally change zeta into -zeta-2 or zeta-1. */
        zeta = (zeta ^ c1) - 1;
        /* Conditionally add g,q,r to f,u,v. */
        f += g & c1;
        u += q & c1;
        v += r & c1;
        /* Shifts: halve g; double u,v to keep the u*f0+v*g0 == f<<i invariant. */
        g >>= 1;
        u <<= 1;
        v <<= 1;
        /* Bounds on zeta that follow from the bounds on iteration count (max 20*30 divsteps). */
        VERIFY_CHECK(zeta >= -601 && zeta <= 601);
    }
    /* Return data in t and return value. */
    t->u = (int32_t)u;
    t->v = (int32_t)v;
    t->q = (int32_t)q;
    t->r = (int32_t)r;
    /* The determinant of t must be a power of two. This guarantees that multiplication with t
     * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
     * will be divided out again). As each divstep's individual matrix has determinant 2, the
     * aggregate of 30 of them will have determinant 2^30. */
    VERIFY_CHECK((int64_t)t->u * t->r - (int64_t)t->v * t->q == ((int64_t)1) << 30);
    return zeta;
}
|
||||||
|
|
||||||
|
/* Compute the transition matrix and eta for 30 divsteps (variable time).
 *
 * Input:  eta: initial eta
 *         f0:  bottom limb of initial f
 *         g0:  bottom limb of initial g
 * Output: t: transition matrix
 * Return: final eta
 *
 * Implements the divsteps_n_matrix_var function from the explanation.
 * Unlike secp256k1_modinv32_divsteps_30, this runs in time dependent on the
 * inputs, batching multiple divsteps per iteration for speed.
 */
static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) {
    /* inv256[i] = -(2*i+1)^-1 (mod 256) */
    static const uint8_t inv256[128] = {
        0xFF, 0x55, 0x33, 0x49, 0xC7, 0x5D, 0x3B, 0x11, 0x0F, 0xE5, 0xC3, 0x59,
        0xD7, 0xED, 0xCB, 0x21, 0x1F, 0x75, 0x53, 0x69, 0xE7, 0x7D, 0x5B, 0x31,
        0x2F, 0x05, 0xE3, 0x79, 0xF7, 0x0D, 0xEB, 0x41, 0x3F, 0x95, 0x73, 0x89,
        0x07, 0x9D, 0x7B, 0x51, 0x4F, 0x25, 0x03, 0x99, 0x17, 0x2D, 0x0B, 0x61,
        0x5F, 0xB5, 0x93, 0xA9, 0x27, 0xBD, 0x9B, 0x71, 0x6F, 0x45, 0x23, 0xB9,
        0x37, 0x4D, 0x2B, 0x81, 0x7F, 0xD5, 0xB3, 0xC9, 0x47, 0xDD, 0xBB, 0x91,
        0x8F, 0x65, 0x43, 0xD9, 0x57, 0x6D, 0x4B, 0xA1, 0x9F, 0xF5, 0xD3, 0xE9,
        0x67, 0xFD, 0xDB, 0xB1, 0xAF, 0x85, 0x63, 0xF9, 0x77, 0x8D, 0x6B, 0xC1,
        0xBF, 0x15, 0xF3, 0x09, 0x87, 0x1D, 0xFB, 0xD1, 0xCF, 0xA5, 0x83, 0x19,
        0x97, 0xAD, 0x8B, 0xE1, 0xDF, 0x35, 0x13, 0x29, 0xA7, 0x3D, 0x1B, 0xF1,
        0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01
    };

    /* Transformation matrix; see comments in secp256k1_modinv32_divsteps_30. */
    uint32_t u = 1, v = 0, q = 0, r = 1;
    uint32_t f = f0, g = g0, m;
    uint16_t w;
    int i = 30, limit, zeros;

    for (;;) {
        /* Use a sentinel bit to count zeros only up to i. */
        zeros = secp256k1_ctz32_var(g | (UINT32_MAX << i));
        /* Perform zeros divsteps at once; they all just divide g by two. */
        g >>= zeros;
        u <<= zeros;
        v <<= zeros;
        eta -= zeros;
        i -= zeros;
        /* We're done once we've done 30 divsteps. */
        if (i == 0) break;
        VERIFY_CHECK((f & 1) == 1);
        VERIFY_CHECK((g & 1) == 1);
        VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i));
        VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i));
        /* Bounds on eta that follow from the bounds on iteration count (max 25*30 divsteps). */
        VERIFY_CHECK(eta >= -751 && eta <= 751);
        /* If eta is negative, negate it and replace f,g with g,-f. */
        if (eta < 0) {
            uint32_t tmp;
            eta = -eta;
            tmp = f; f = g; g = -tmp;
            tmp = u; u = q; q = -tmp;
            tmp = v; v = r; r = -tmp;
        }
        /* eta is now >= 0. In what follows we're going to cancel out the bottom bits of g. No more
         * than i can be cancelled out (as we'd be done before that point), and no more than eta+1
         * can be done as its sign will flip once that happens. */
        limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
        /* m is a mask for the bottom min(limit, 8) bits (our table only supports 8 bits). */
        VERIFY_CHECK(limit > 0 && limit <= 30);
        m = (UINT32_MAX >> (32 - limit)) & 255U;
        /* Find what multiple of f must be added to g to cancel its bottom min(limit, 8) bits. */
        w = (g * inv256[(f >> 1) & 127]) & m;
        /* Do so. */
        g += f * w;
        q += u * w;
        r += v * w;
        VERIFY_CHECK((g & m) == 0);
    }
    /* Return data in t and return value. */
    t->u = (int32_t)u;
    t->v = (int32_t)v;
    t->q = (int32_t)q;
    t->r = (int32_t)r;
    /* The determinant of t must be a power of two. This guarantees that multiplication with t
     * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
     * will be divided out again). As each divstep's individual matrix has determinant 2, the
     * aggregate of 30 of them will have determinant 2^30. */
    VERIFY_CHECK((int64_t)t->u * t->r - (int64_t)t->v * t->q == ((int64_t)1) << 30);
    return eta;
}
|
||||||
|
|
||||||
|
/* Compute (t/2^30) * [d, e] mod modulus, where t is a transition matrix for 30 divsteps.
 *
 * On input and output, d and e are in range (-2*modulus,modulus). All output limbs will be in range
 * (-2^30,2^30).
 *
 * This implements the update_de function from the explanation.
 */
static void secp256k1_modinv32_update_de_30(secp256k1_modinv32_signed30 *d, secp256k1_modinv32_signed30 *e, const secp256k1_modinv32_trans2x2 *t, const secp256k1_modinv32_modinfo* modinfo) {
    /* M30 = 2^30 - 1: mask for one 30-bit limb. */
    const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
    const int32_t u = t->u, v = t->v, q = t->q, r = t->r;
    int32_t di, ei, md, me, sd, se;
    int64_t cd, ce; /* 64-bit carries/accumulators for the limb products */
    int i;
#ifdef VERIFY
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, -2) > 0); /* d > -2*modulus */
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, 1) < 0); /* d < modulus */
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, -2) > 0); /* e > -2*modulus */
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, 1) < 0); /* e < modulus */
    VERIFY_CHECK((labs(u) + labs(v)) >= 0); /* |u|+|v| doesn't overflow */
    VERIFY_CHECK((labs(q) + labs(r)) >= 0); /* |q|+|r| doesn't overflow */
    VERIFY_CHECK((labs(u) + labs(v)) <= M30 + 1); /* |u|+|v| <= 2^30 */
    VERIFY_CHECK((labs(q) + labs(r)) <= M30 + 1); /* |q|+|r| <= 2^30 */
#endif
    /* [md,me] start as zero; plus [u,q] if d is negative; plus [v,r] if e is negative. */
    sd = d->v[8] >> 31; /* all-ones if d < 0, else all-zero */
    se = e->v[8] >> 31; /* all-ones if e < 0, else all-zero */
    md = (u & sd) + (v & se);
    me = (q & sd) + (r & se);
    /* Begin computing t*[d,e]. */
    di = d->v[0];
    ei = e->v[0];
    cd = (int64_t)u * di + (int64_t)v * ei;
    ce = (int64_t)q * di + (int64_t)r * ei;
    /* Correct md,me so that t*[d,e]+modulus*[md,me] has 30 zero bottom bits. */
    md -= (modinfo->modulus_inv30 * (uint32_t)cd + md) & M30;
    me -= (modinfo->modulus_inv30 * (uint32_t)ce + me) & M30;
    /* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */
    cd += (int64_t)modinfo->modulus.v[0] * md;
    ce += (int64_t)modinfo->modulus.v[0] * me;
    /* Verify that the low 30 bits of the computation are indeed zero, and then throw them away. */
    VERIFY_CHECK(((int32_t)cd & M30) == 0); cd >>= 30;
    VERIFY_CHECK(((int32_t)ce & M30) == 0); ce >>= 30;
    /* Now iteratively compute limb i=1..8 of t*[d,e]+modulus*[md,me], and store them in output
     * limb i-1 (shifting down by 30 bits). */
    for (i = 1; i < 9; ++i) {
        di = d->v[i];
        ei = e->v[i];
        cd += (int64_t)u * di + (int64_t)v * ei;
        ce += (int64_t)q * di + (int64_t)r * ei;
        cd += (int64_t)modinfo->modulus.v[i] * md;
        ce += (int64_t)modinfo->modulus.v[i] * me;
        d->v[i - 1] = (int32_t)cd & M30; cd >>= 30;
        e->v[i - 1] = (int32_t)ce & M30; ce >>= 30;
    }
    /* What remains is limb 9 of t*[d,e]+modulus*[md,me]; store it as output limb 8. */
    d->v[8] = (int32_t)cd;
    e->v[8] = (int32_t)ce;
#ifdef VERIFY
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, -2) > 0); /* d > -2*modulus */
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, 1) < 0); /* d < modulus */
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, -2) > 0); /* e > -2*modulus */
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, 1) < 0); /* e < modulus */
#endif
}
|
||||||
|
|
||||||
|
/* Compute (t/2^30) * [f, g], where t is a transition matrix for 30 divsteps.
|
||||||
|
*
|
||||||
|
* This implements the update_fg function from the explanation.
|
||||||
|
*/
|
||||||
|
static void secp256k1_modinv32_update_fg_30(secp256k1_modinv32_signed30 *f, secp256k1_modinv32_signed30 *g, const secp256k1_modinv32_trans2x2 *t) {
|
||||||
|
const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
|
||||||
|
const int32_t u = t->u, v = t->v, q = t->q, r = t->r;
|
||||||
|
int32_t fi, gi;
|
||||||
|
int64_t cf, cg;
|
||||||
|
int i;
|
||||||
|
/* Start computing t*[f,g]. */
|
||||||
|
fi = f->v[0];
|
||||||
|
gi = g->v[0];
|
||||||
|
cf = (int64_t)u * fi + (int64_t)v * gi;
|
||||||
|
cg = (int64_t)q * fi + (int64_t)r * gi;
|
||||||
|
/* Verify that the bottom 30 bits of the result are zero, and then throw them away. */
|
||||||
|
VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30;
|
||||||
|
VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30;
|
||||||
|
/* Now iteratively compute limb i=1..8 of t*[f,g], and store them in output limb i-1 (shifting
|
||||||
|
* down by 30 bits). */
|
||||||
|
for (i = 1; i < 9; ++i) {
|
||||||
|
fi = f->v[i];
|
||||||
|
gi = g->v[i];
|
||||||
|
cf += (int64_t)u * fi + (int64_t)v * gi;
|
||||||
|
cg += (int64_t)q * fi + (int64_t)r * gi;
|
||||||
|
f->v[i - 1] = (int32_t)cf & M30; cf >>= 30;
|
||||||
|
g->v[i - 1] = (int32_t)cg & M30; cg >>= 30;
|
||||||
|
}
|
||||||
|
/* What remains is limb 9 of t*[f,g]; store it as output limb 8. */
|
||||||
|
f->v[8] = (int32_t)cf;
|
||||||
|
g->v[8] = (int32_t)cg;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Compute (t/2^30) * [f, g], where t is a transition matrix for 30 divsteps.
 *
 * Version that operates on a variable number of limbs in f and g.
 *
 * This implements the update_fg function from the explanation in modinv64_impl.h.
 */
static void secp256k1_modinv32_update_fg_30_var(int len, secp256k1_modinv32_signed30 *f, secp256k1_modinv32_signed30 *g, const secp256k1_modinv32_trans2x2 *t) {
    /* M30 = 2^30 - 1: mask for one 30-bit limb. */
    const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
    const int32_t u = t->u, v = t->v, q = t->q, r = t->r;
    int32_t fi, gi;
    int64_t cf, cg; /* 64-bit carries/accumulators for the limb products */
    int i;
    VERIFY_CHECK(len > 0);
    /* Start computing t*[f,g]. */
    fi = f->v[0];
    gi = g->v[0];
    cf = (int64_t)u * fi + (int64_t)v * gi;
    cg = (int64_t)q * fi + (int64_t)r * gi;
    /* Verify that the bottom 30 bits of the result are zero, and then throw them away. */
    VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30;
    VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30;
    /* Now iteratively compute limb i=1..len of t*[f,g], and store them in output limb i-1 (shifting
     * down by 30 bits). */
    for (i = 1; i < len; ++i) {
        fi = f->v[i];
        gi = g->v[i];
        cf += (int64_t)u * fi + (int64_t)v * gi;
        cg += (int64_t)q * fi + (int64_t)r * gi;
        f->v[i - 1] = (int32_t)cf & M30; cf >>= 30;
        g->v[i - 1] = (int32_t)cg & M30; cg >>= 30;
    }
    /* What remains is limb (len) of t*[f,g]; store it as output limb (len-1). */
    f->v[len - 1] = (int32_t)cf;
    g->v[len - 1] = (int32_t)cg;
}
|
||||||
|
|
||||||
|
/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */
static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) {
    /* Start with d=0, e=1, f=modulus, g=x, zeta=-1. */
    secp256k1_modinv32_signed30 d = {{0}};
    secp256k1_modinv32_signed30 e = {{1}};
    secp256k1_modinv32_signed30 f = modinfo->modulus;
    secp256k1_modinv32_signed30 g = *x;
    int i;
    int32_t zeta = -1; /* zeta = -(delta+1/2); delta is initially 1/2. */

    /* Do 20 iterations of 30 divsteps each = 600 divsteps. 590 suffices for 256-bit inputs. */
    for (i = 0; i < 20; ++i) {
        /* Compute transition matrix and new zeta after 30 divsteps. */
        secp256k1_modinv32_trans2x2 t;
        zeta = secp256k1_modinv32_divsteps_30(zeta, f.v[0], g.v[0], &t);
        /* Update d,e using that transition matrix. */
        secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo);
        /* Update f,g using that transition matrix. */
#ifdef VERIFY
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, -1) > 0); /* f > -modulus */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, 1) <= 0); /* f <= modulus */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, -1) > 0); /* g > -modulus */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, 1) < 0); /* g < modulus */
#endif
        secp256k1_modinv32_update_fg_30(&f, &g, &t);
#ifdef VERIFY
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, -1) > 0); /* f > -modulus */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, 1) <= 0); /* f <= modulus */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, -1) > 0); /* g > -modulus */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, 1) < 0); /* g < modulus */
#endif
    }

    /* At this point sufficient iterations have been performed that g must have reached 0
     * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g
     * values i.e. +/- 1, and d now contains +/- the modular inverse. */
#ifdef VERIFY
    /* g == 0 */
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &SECP256K1_SIGNED30_ONE, 0) == 0);
    /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &SECP256K1_SIGNED30_ONE, -1) == 0 ||
                 secp256k1_modinv32_mul_cmp_30(&f, 9, &SECP256K1_SIGNED30_ONE, 1) == 0 ||
                 (secp256k1_modinv32_mul_cmp_30(x, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 &&
                  secp256k1_modinv32_mul_cmp_30(&d, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 &&
                  (secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, 1) == 0 ||
                   secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, -1) == 0)));
#endif

    /* Optionally negate d, normalize to [0,modulus), and return it.
     * f.v[8] carries the sign of f, which decides whether d must be negated. */
    secp256k1_modinv32_normalize_30(&d, f.v[8], modinfo);
    *x = d;
}
|
||||||
|
|
||||||
|
/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (variable time). */
static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) {
    /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */
    secp256k1_modinv32_signed30 d = {{0, 0, 0, 0, 0, 0, 0, 0, 0}};
    secp256k1_modinv32_signed30 e = {{1, 0, 0, 0, 0, 0, 0, 0, 0}};
    secp256k1_modinv32_signed30 f = modinfo->modulus;
    secp256k1_modinv32_signed30 g = *x;
#ifdef VERIFY
    int i = 0;
#endif
    int j, len = 9;
    int32_t eta = -1; /* eta = -delta; delta is initially 1 (faster for the variable-time code) */
    int32_t cond, fn, gn;

    /* Do iterations of 30 divsteps each until g=0. */
    while (1) {
        /* Compute transition matrix and new eta after 30 divsteps. */
        secp256k1_modinv32_trans2x2 t;
        eta = secp256k1_modinv32_divsteps_30_var(eta, f.v[0], g.v[0], &t);
        /* Update d,e using that transition matrix. */
        secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo);
        /* Update f,g using that transition matrix. */
#ifdef VERIFY
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */
#endif
        secp256k1_modinv32_update_fg_30_var(len, &f, &g, &t);
        /* If the bottom limb of g is 0, there is a chance g=0. */
        if (g.v[0] == 0) {
            cond = 0;
            /* Check if all other limbs are also 0. */
            for (j = 1; j < len; ++j) {
                cond |= g.v[j];
            }
            /* If so, we're done. */
            if (cond == 0) break;
        }

        /* Determine if len>1 and limb (len-1) of both f and g is 0 or -1. */
        fn = f.v[len - 1];
        gn = g.v[len - 1];
        cond = ((int32_t)len - 2) >> 31;   /* negative iff len < 2 */
        cond |= fn ^ (fn >> 31);           /* nonzero iff fn is not 0 or -1 */
        cond |= gn ^ (gn >> 31);           /* nonzero iff gn is not 0 or -1 */
        /* If so, reduce length, propagating the sign of f and g's top limb into the one below. */
        if (cond == 0) {
            f.v[len - 2] |= (uint32_t)fn << 30;
            g.v[len - 2] |= (uint32_t)gn << 30;
            --len;
        }
#ifdef VERIFY
        VERIFY_CHECK(++i < 25); /* We should never need more than 25*30 = 750 divsteps */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */
        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */
#endif
    }

    /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of
     * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */
#ifdef VERIFY
    /* g == 0 */
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &SECP256K1_SIGNED30_ONE, 0) == 0);
    /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */
    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &SECP256K1_SIGNED30_ONE, -1) == 0 ||
                 secp256k1_modinv32_mul_cmp_30(&f, len, &SECP256K1_SIGNED30_ONE, 1) == 0 ||
                 (secp256k1_modinv32_mul_cmp_30(x, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 &&
                  secp256k1_modinv32_mul_cmp_30(&d, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 &&
                  (secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) == 0 ||
                   secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) == 0)));
#endif

    /* Optionally negate d, normalize to [0,modulus), and return it.
     * f.v[len - 1] carries the sign of f, which decides whether d must be negated. */
    secp256k1_modinv32_normalize_30(&d, f.v[len - 1], modinfo);
    *x = d;
}
|
||||||
|
|
||||||
|
#endif /* SECP256K1_MODINV32_IMPL_H */
|
|
@ -0,0 +1,46 @@
|
||||||
|
/***********************************************************************
|
||||||
|
* Copyright (c) 2020 Peter Dettman *
|
||||||
|
* Distributed under the MIT software license, see the accompanying *
|
||||||
|
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
||||||
|
**********************************************************************/
|
||||||
|
|
||||||
|
#ifndef SECP256K1_MODINV64_H
|
||||||
|
#define SECP256K1_MODINV64_H
|
||||||
|
|
||||||
|
#if defined HAVE_CONFIG_H
|
||||||
|
#include "libsecp256k1-config.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "util.h"
|
||||||
|
|
||||||
|
#ifndef SECP256K1_WIDEMUL_INT128
|
||||||
|
#error "modinv64 requires 128-bit wide multiplication support"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* A signed 62-bit limb representation of integers.
 *
 * Its value is sum(v[i] * 2^(62*i), i=0..4). */
typedef struct {
    /* Five limbs; each is semantically 62 bits wide, stored sign-extended in an int64_t. */
    int64_t v[5];
} secp256k1_modinv64_signed62;
|
||||||
|
|
||||||
|
/* Precomputed data describing the modulus for the signed62 modular-inverse routines. */
typedef struct {
    /* The modulus in signed62 notation, must be odd and in [3, 2^256]. */
    secp256k1_modinv64_signed62 modulus;

    /* modulus^{-1} mod 2^62 */
    uint64_t modulus_inv62;
} secp256k1_modinv64_modinfo;
|
||||||
|
|
||||||
|
/* Replace x with its modular inverse mod modinfo->modulus. x must be in range [0, modulus).
|
||||||
|
* If x is zero, the result will be zero as well. If not, the inverse must exist (i.e., the gcd of
|
||||||
|
* x and modulus must be 1). These rules are automatically satisfied if the modulus is prime.
|
||||||
|
*
|
||||||
|
* On output, all of x's limbs will be in [0, 2^62).
|
||||||
|
*/
|
||||||
|
static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo);
|
||||||
|
|
||||||
|
/* Same as secp256k1_modinv64_var, but constant time in x (not in the modulus). */
|
||||||
|
static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo);
|
||||||
|
|
||||||
|
#endif /* SECP256K1_MODINV64_H */
|
|
@ -0,0 +1,593 @@
|
||||||
|
/***********************************************************************
|
||||||
|
* Copyright (c) 2020 Peter Dettman *
|
||||||
|
* Distributed under the MIT software license, see the accompanying *
|
||||||
|
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
||||||
|
**********************************************************************/
|
||||||
|
|
||||||
|
#ifndef SECP256K1_MODINV64_IMPL_H
|
||||||
|
#define SECP256K1_MODINV64_IMPL_H
|
||||||
|
|
||||||
|
#include "modinv64.h"
|
||||||
|
|
||||||
|
#include "util.h"
|
||||||
|
|
||||||
|
/* This file implements modular inversion based on the paper "Fast constant-time gcd computation and
|
||||||
|
* modular inversion" by Daniel J. Bernstein and Bo-Yin Yang.
|
||||||
|
*
|
||||||
|
* For an explanation of the algorithm, see doc/safegcd_implementation.md. This file contains an
|
||||||
|
* implementation for N=62, using 62-bit signed limbs represented as int64_t.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
/* Helper function to compute the absolute value of an int64_t.
|
||||||
|
* (we don't use abs/labs/llabs as it depends on the int sizes). */
|
||||||
|
static int64_t secp256k1_modinv64_abs(int64_t v) {
|
||||||
|
VERIFY_CHECK(v > INT64_MIN);
|
||||||
|
if (v < 0) return -v;
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The constant 1 in signed62 representation (limbs 1..4 are zero-initialized). */
static const secp256k1_modinv64_signed62 SECP256K1_SIGNED62_ONE = {{1}};
|
||||||
|
|
||||||
|
/* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^62). */
static void secp256k1_modinv64_mul_62(secp256k1_modinv64_signed62 *r, const secp256k1_modinv64_signed62 *a, int alen, int64_t factor) {
    /* M62 = 2^62 - 1: mask for one 62-bit limb. */
    const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
    int128_t c = 0; /* 128-bit carry/accumulator for the limb products */
    int i;
    for (i = 0; i < 4; ++i) {
        /* a may have fewer than 5 limbs; limbs past alen are treated as zero. */
        if (i < alen) c += (int128_t)a->v[i] * factor;
        r->v[i] = (int64_t)c & M62; c >>= 62;
    }
    if (4 < alen) c += (int128_t)a->v[4] * factor;
    /* The remaining carry must fit in a single (possibly negative) top limb. */
    VERIFY_CHECK(c == (int64_t)c);
    r->v[4] = (int64_t)c;
}
|
||||||
|
|
||||||
|
/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A has alen limbs; b has 5. */
static int secp256k1_modinv64_mul_cmp_62(const secp256k1_modinv64_signed62 *a, int alen, const secp256k1_modinv64_signed62 *b, int64_t factor) {
    int i;
    secp256k1_modinv64_signed62 am, bm;
    secp256k1_modinv64_mul_62(&am, a, alen, 1); /* Normalize all but the top limb of a. */
    secp256k1_modinv64_mul_62(&bm, b, 5, factor);
    for (i = 0; i < 4; ++i) {
        /* Verify that all but the top limb of a and b are normalized. */
        VERIFY_CHECK(am.v[i] >> 62 == 0);
        VERIFY_CHECK(bm.v[i] >> 62 == 0);
    }
    /* Lexicographic comparison from the most significant limb down. */
    for (i = 4; i >= 0; --i) {
        if (am.v[i] < bm.v[i]) return -1;
        if (am.v[i] > bm.v[i]) return 1;
    }
    return 0;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Take as input a signed62 number in range (-2*modulus,modulus), and add a multiple of the modulus
 * to it to bring it to range [0,modulus). If sign < 0, the input will also be negated in the
 * process. The input must have limbs in range (-2^62,2^62). The output will have limbs in range
 * [0,2^62).
 *
 * Entirely branch-free (all conditionals are implemented via sign-extension masks), so it runs
 * in constant time with respect to the value being normalized. */
static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int64_t sign, const secp256k1_modinv64_modinfo *modinfo) {
    const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
    int64_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4];
    int64_t cond_add, cond_negate;

#ifdef VERIFY
    /* Verify that all limbs are in range (-2^62,2^62). */
    int i;
    for (i = 0; i < 5; ++i) {
        VERIFY_CHECK(r->v[i] >= -M62);
        VERIFY_CHECK(r->v[i] <= M62);
    }
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, -2) > 0); /* r > -2*modulus */
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, 1) < 0); /* r < modulus */
#endif

    /* In a first step, add the modulus if the input is negative, and then negate if requested.
     * This brings r from range (-2*modulus,modulus) to range (-modulus,modulus). As all input
     * limbs are in range (-2^62,2^62), this cannot overflow an int64_t. Note that the right
     * shifts below are signed sign-extending shifts (see assumptions.h for tests that that is
     * indeed the behavior of the right shift operator). */
    cond_add = r4 >> 63; /* All-ones mask iff r is negative (top limb's sign bit), else zero. */
    r0 += modinfo->modulus.v[0] & cond_add;
    r1 += modinfo->modulus.v[1] & cond_add;
    r2 += modinfo->modulus.v[2] & cond_add;
    r3 += modinfo->modulus.v[3] & cond_add;
    r4 += modinfo->modulus.v[4] & cond_add;
    cond_negate = sign >> 63; /* All-ones mask iff negation requested (sign < 0), else zero. */
    /* Branch-free conditional negation: (x ^ -1) - (-1) == -x; (x ^ 0) - 0 == x. */
    r0 = (r0 ^ cond_negate) - cond_negate;
    r1 = (r1 ^ cond_negate) - cond_negate;
    r2 = (r2 ^ cond_negate) - cond_negate;
    r3 = (r3 ^ cond_negate) - cond_negate;
    r4 = (r4 ^ cond_negate) - cond_negate;
    /* Propagate the top bits, to bring limbs back to range (-2^62,2^62). */
    r1 += r0 >> 62; r0 &= M62;
    r2 += r1 >> 62; r1 &= M62;
    r3 += r2 >> 62; r2 &= M62;
    r4 += r3 >> 62; r3 &= M62;

    /* In a second step add the modulus again if the result is still negative, bringing
     * r to range [0,modulus). */
    cond_add = r4 >> 63;
    r0 += modinfo->modulus.v[0] & cond_add;
    r1 += modinfo->modulus.v[1] & cond_add;
    r2 += modinfo->modulus.v[2] & cond_add;
    r3 += modinfo->modulus.v[3] & cond_add;
    r4 += modinfo->modulus.v[4] & cond_add;
    /* And propagate again. */
    r1 += r0 >> 62; r0 &= M62;
    r2 += r1 >> 62; r1 &= M62;
    r3 += r2 >> 62; r2 &= M62;
    r4 += r3 >> 62; r3 &= M62;

    /* Write back the fully normalized limbs. */
    r->v[0] = r0;
    r->v[1] = r1;
    r->v[2] = r2;
    r->v[3] = r3;
    r->v[4] = r4;

#ifdef VERIFY
    VERIFY_CHECK(r0 >> 62 == 0);
    VERIFY_CHECK(r1 >> 62 == 0);
    VERIFY_CHECK(r2 >> 62 == 0);
    VERIFY_CHECK(r3 >> 62 == 0);
    VERIFY_CHECK(r4 >> 62 == 0);
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, 0) >= 0); /* r >= 0 */
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, 1) < 0); /* r < modulus */
#endif
}
|
||||||
|
|
||||||
|
/* Data type for transition matrices (see section 3 of explanation).
 *
 * t = [ u v ]
 *     [ q r ]
 *
 * Produced by the divsteps functions below; entries are signed 64-bit values
 * (scaled by a power of two as documented on each producer). */
typedef struct {
    int64_t u, v, q, r;
} secp256k1_modinv64_trans2x2;
|
||||||
|
|
||||||
|
/* Compute the transition matrix and eta for 59 divsteps (where zeta=-(delta+1/2)).
 * Note that the transformation matrix is scaled by 2^62 and not 2^59.
 *
 * Input:  zeta: initial zeta
 *         f0:   bottom limb of initial f
 *         g0:   bottom limb of initial g
 * Output: t: transition matrix
 * Return: final zeta
 *
 * Implements the divsteps_n_matrix function from the explanation.
 * This routine is constant time: every conditional is realized with masks.
 */
static int64_t secp256k1_modinv64_divsteps_59(int64_t zeta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) {
    /* u,v,q,r are the elements of the transformation matrix being built up,
     * starting with the identity matrix times 8 (because the caller expects
     * a result scaled by 2^62). Semantically they are signed integers
     * in range [-2^62,2^62], but here represented as unsigned mod 2^64. This
     * permits left shifting (which is UB for negative numbers). The range
     * being inside [-2^63,2^63) means that casting to signed works correctly.
     */
    uint64_t u = 8, v = 0, q = 0, r = 8;
    uint64_t c1, c2, f = f0, g = g0, x, y, z;
    int i;

    /* i starts at 3 because u,r are pre-scaled by 2^3; 59 iterations bring the scale to 2^62. */
    for (i = 3; i < 62; ++i) {
        VERIFY_CHECK((f & 1) == 1); /* f must always be odd */
        VERIFY_CHECK((u * f0 + v * g0) == f << i);
        VERIFY_CHECK((q * f0 + r * g0) == g << i);
        /* Compute conditional masks for (zeta < 0) and for (g & 1). */
        c1 = zeta >> 63;
        c2 = -(g & 1);
        /* Compute x,y,z, conditionally negated versions of f,u,v. */
        x = (f ^ c1) - c1;
        y = (u ^ c1) - c1;
        z = (v ^ c1) - c1;
        /* Conditionally add x,y,z to g,q,r. */
        g += x & c2;
        q += y & c2;
        r += z & c2;
        /* In what follows, c1 is a condition mask for (zeta < 0) and (g & 1). */
        c1 &= c2;
        /* Conditionally change zeta into -zeta-2 or zeta-1. */
        zeta = (zeta ^ c1) - 1;
        /* Conditionally add g,q,r to f,u,v. */
        f += g & c1;
        u += q & c1;
        v += r & c1;
        /* Shifts */
        g >>= 1;
        u <<= 1;
        v <<= 1;
        /* Bounds on zeta that follow from the bounds on iteration count (max 10*59 divsteps). */
        VERIFY_CHECK(zeta >= -591 && zeta <= 591);
    }
    /* Return data in t and return value. */
    t->u = (int64_t)u;
    t->v = (int64_t)v;
    t->q = (int64_t)q;
    t->r = (int64_t)r;
    /* The determinant of t must be a power of two. This guarantees that multiplication with t
     * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
     * will be divided out again). As each divstep's individual matrix has determinant 2, the
     * aggregate of 59 of them will have determinant 2^59. Multiplying with the initial
     * 8*identity (which has determinant 2^6) means the overall outputs has determinant
     * 2^65. */
    VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 65);
    return zeta;
}
|
||||||
|
|
||||||
|
/* Compute the transition matrix and eta for 62 divsteps (variable time, eta=-delta).
 *
 * Input:  eta: initial eta
 *         f0:  bottom limb of initial f
 *         g0:  bottom limb of initial g
 * Output: t: transition matrix
 * Return: final eta
 *
 * Implements the divsteps_n_matrix_var function from the explanation.
 * NOT constant time: branches and iteration counts depend on the input values.
 */
static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) {
    /* Transformation matrix; see comments in secp256k1_modinv64_divsteps_62. */
    uint64_t u = 1, v = 0, q = 0, r = 1;
    uint64_t f = f0, g = g0, m;
    uint32_t w;
    int i = 62, limit, zeros; /* i counts the divsteps still to perform. */

    for (;;) {
        /* Use a sentinel bit to count zeros only up to i. */
        zeros = secp256k1_ctz64_var(g | (UINT64_MAX << i));
        /* Perform zeros divsteps at once; they all just divide g by two. */
        g >>= zeros;
        u <<= zeros;
        v <<= zeros;
        eta -= zeros;
        i -= zeros;
        /* We're done once we've done 62 divsteps. */
        if (i == 0) break;
        VERIFY_CHECK((f & 1) == 1);
        VERIFY_CHECK((g & 1) == 1);
        VERIFY_CHECK((u * f0 + v * g0) == f << (62 - i));
        VERIFY_CHECK((q * f0 + r * g0) == g << (62 - i));
        /* Bounds on eta that follow from the bounds on iteration count (max 12*62 divsteps). */
        VERIFY_CHECK(eta >= -745 && eta <= 745);
        /* If eta is negative, negate it and replace f,g with g,-f. */
        if (eta < 0) {
            uint64_t tmp;
            eta = -eta;
            tmp = f; f = g; g = -tmp;
            tmp = u; u = q; q = -tmp;
            tmp = v; v = r; r = -tmp;
            /* Use a formula to cancel out up to 6 bits of g. Also, no more than i can be cancelled
             * out (as we'd be done before that point), and no more than eta+1 can be done as its
             * will flip again once that happens. */
            limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
            VERIFY_CHECK(limit > 0 && limit <= 62);
            /* m is a mask for the bottom min(limit, 6) bits. */
            m = (UINT64_MAX >> (64 - limit)) & 63U;
            /* Find what multiple of f must be added to g to cancel its bottom min(limit, 6)
             * bits. */
            w = (f * g * (f * f - 2)) & m;
        } else {
            /* In this branch, use a simpler formula that only lets us cancel up to 4 bits of g, as
             * eta tends to be smaller here. */
            limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
            VERIFY_CHECK(limit > 0 && limit <= 62);
            /* m is a mask for the bottom min(limit, 4) bits. */
            m = (UINT64_MAX >> (64 - limit)) & 15U;
            /* Find what multiple of f must be added to g to cancel its bottom min(limit, 4)
             * bits. */
            w = f + (((f + 1) & 4) << 1);
            w = (-w * g) & m;
        }
        /* Apply the multiple w of f (and of the matrix's top row) to g (and bottom row). */
        g += f * w;
        q += u * w;
        r += v * w;
        VERIFY_CHECK((g & m) == 0);
    }
    /* Return data in t and return value. */
    t->u = (int64_t)u;
    t->v = (int64_t)v;
    t->q = (int64_t)q;
    t->r = (int64_t)r;
    /* The determinant of t must be a power of two. This guarantees that multiplication with t
     * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
     * will be divided out again). As each divstep's individual matrix has determinant 2, the
     * aggregate of 62 of them will have determinant 2^62. */
    VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 62);
    return eta;
}
|
||||||
|
|
||||||
|
/* Compute (t/2^62) * [d, e] mod modulus, where t is a transition matrix scaled by 2^62.
 *
 * On input and output, d and e are in range (-2*modulus,modulus). All output limbs will be in range
 * (-2^62,2^62).
 *
 * This implements the update_de function from the explanation.
 */
static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp256k1_modinv64_signed62 *e, const secp256k1_modinv64_trans2x2 *t, const secp256k1_modinv64_modinfo* modinfo) {
    const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
    const int64_t d0 = d->v[0], d1 = d->v[1], d2 = d->v[2], d3 = d->v[3], d4 = d->v[4];
    const int64_t e0 = e->v[0], e1 = e->v[1], e2 = e->v[2], e3 = e->v[3], e4 = e->v[4];
    const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
    int64_t md, me, sd, se; /* md,me: multiples of the modulus to add; sd,se: sign masks of d,e. */
    int128_t cd, ce; /* 128-bit carry accumulators for the d and e limb chains. */
#ifdef VERIFY
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0); /* d < modulus */
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, 5, &modinfo->modulus, -2) > 0); /* e > -2*modulus */
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, 5, &modinfo->modulus, 1) < 0); /* e < modulus */
    VERIFY_CHECK((secp256k1_modinv64_abs(u) + secp256k1_modinv64_abs(v)) >= 0); /* |u|+|v| doesn't overflow */
    VERIFY_CHECK((secp256k1_modinv64_abs(q) + secp256k1_modinv64_abs(r)) >= 0); /* |q|+|r| doesn't overflow */
    VERIFY_CHECK((secp256k1_modinv64_abs(u) + secp256k1_modinv64_abs(v)) <= M62 + 1); /* |u|+|v| <= 2^62 */
    VERIFY_CHECK((secp256k1_modinv64_abs(q) + secp256k1_modinv64_abs(r)) <= M62 + 1); /* |q|+|r| <= 2^62 */
#endif
    /* [md,me] start as zero; plus [u,q] if d is negative; plus [v,r] if e is negative. */
    sd = d4 >> 63;
    se = e4 >> 63;
    md = (u & sd) + (v & se);
    me = (q & sd) + (r & se);
    /* Begin computing t*[d,e]. */
    cd = (int128_t)u * d0 + (int128_t)v * e0;
    ce = (int128_t)q * d0 + (int128_t)r * e0;
    /* Correct md,me so that t*[d,e]+modulus*[md,me] has 62 zero bottom bits. */
    md -= (modinfo->modulus_inv62 * (uint64_t)cd + md) & M62;
    me -= (modinfo->modulus_inv62 * (uint64_t)ce + me) & M62;
    /* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */
    cd += (int128_t)modinfo->modulus.v[0] * md;
    ce += (int128_t)modinfo->modulus.v[0] * me;
    /* Verify that the low 62 bits of the computation are indeed zero, and then throw them away. */
    VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62;
    VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62;
    /* Compute limb 1 of t*[d,e]+modulus*[md,me], and store it as output limb 0 (= down shift). */
    cd += (int128_t)u * d1 + (int128_t)v * e1;
    ce += (int128_t)q * d1 + (int128_t)r * e1;
    if (modinfo->modulus.v[1]) { /* Optimize for the case where limb of modulus is zero. */
        cd += (int128_t)modinfo->modulus.v[1] * md;
        ce += (int128_t)modinfo->modulus.v[1] * me;
    }
    d->v[0] = (int64_t)cd & M62; cd >>= 62;
    e->v[0] = (int64_t)ce & M62; ce >>= 62;
    /* Compute limb 2 of t*[d,e]+modulus*[md,me], and store it as output limb 1. */
    cd += (int128_t)u * d2 + (int128_t)v * e2;
    ce += (int128_t)q * d2 + (int128_t)r * e2;
    if (modinfo->modulus.v[2]) { /* Optimize for the case where limb of modulus is zero. */
        cd += (int128_t)modinfo->modulus.v[2] * md;
        ce += (int128_t)modinfo->modulus.v[2] * me;
    }
    d->v[1] = (int64_t)cd & M62; cd >>= 62;
    e->v[1] = (int64_t)ce & M62; ce >>= 62;
    /* Compute limb 3 of t*[d,e]+modulus*[md,me], and store it as output limb 2. */
    cd += (int128_t)u * d3 + (int128_t)v * e3;
    ce += (int128_t)q * d3 + (int128_t)r * e3;
    if (modinfo->modulus.v[3]) { /* Optimize for the case where limb of modulus is zero. */
        cd += (int128_t)modinfo->modulus.v[3] * md;
        ce += (int128_t)modinfo->modulus.v[3] * me;
    }
    d->v[2] = (int64_t)cd & M62; cd >>= 62;
    e->v[2] = (int64_t)ce & M62; ce >>= 62;
    /* Compute limb 4 of t*[d,e]+modulus*[md,me], and store it as output limb 3. */
    cd += (int128_t)u * d4 + (int128_t)v * e4;
    ce += (int128_t)q * d4 + (int128_t)r * e4;
    cd += (int128_t)modinfo->modulus.v[4] * md;
    ce += (int128_t)modinfo->modulus.v[4] * me;
    d->v[3] = (int64_t)cd & M62; cd >>= 62;
    e->v[3] = (int64_t)ce & M62; ce >>= 62;
    /* What remains is limb 5 of t*[d,e]+modulus*[md,me]; store it as output limb 4. */
    d->v[4] = (int64_t)cd;
    e->v[4] = (int64_t)ce;
#ifdef VERIFY
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0); /* d < modulus */
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, 5, &modinfo->modulus, -2) > 0); /* e > -2*modulus */
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, 5, &modinfo->modulus, 1) < 0); /* e < modulus */
#endif
}
|
||||||
|
|
||||||
|
/* Compute (t/2^62) * [f, g], where t is a transition matrix scaled by 2^62.
 *
 * This implements the update_fg function from the explanation.
 * Fixed 5-limb variant; see secp256k1_modinv64_update_fg_62_var for the variable-length one.
 */
static void secp256k1_modinv64_update_fg_62(secp256k1_modinv64_signed62 *f, secp256k1_modinv64_signed62 *g, const secp256k1_modinv64_trans2x2 *t) {
    const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
    const int64_t f0 = f->v[0], f1 = f->v[1], f2 = f->v[2], f3 = f->v[3], f4 = f->v[4];
    const int64_t g0 = g->v[0], g1 = g->v[1], g2 = g->v[2], g3 = g->v[3], g4 = g->v[4];
    const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
    int128_t cf, cg; /* 128-bit carry accumulators for the f and g limb chains. */
    /* Start computing t*[f,g]. */
    cf = (int128_t)u * f0 + (int128_t)v * g0;
    cg = (int128_t)q * f0 + (int128_t)r * g0;
    /* Verify that the bottom 62 bits of the result are zero, and then throw them away. */
    VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62;
    VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62;
    /* Compute limb 1 of t*[f,g], and store it as output limb 0 (= down shift). */
    cf += (int128_t)u * f1 + (int128_t)v * g1;
    cg += (int128_t)q * f1 + (int128_t)r * g1;
    f->v[0] = (int64_t)cf & M62; cf >>= 62;
    g->v[0] = (int64_t)cg & M62; cg >>= 62;
    /* Compute limb 2 of t*[f,g], and store it as output limb 1. */
    cf += (int128_t)u * f2 + (int128_t)v * g2;
    cg += (int128_t)q * f2 + (int128_t)r * g2;
    f->v[1] = (int64_t)cf & M62; cf >>= 62;
    g->v[1] = (int64_t)cg & M62; cg >>= 62;
    /* Compute limb 3 of t*[f,g], and store it as output limb 2. */
    cf += (int128_t)u * f3 + (int128_t)v * g3;
    cg += (int128_t)q * f3 + (int128_t)r * g3;
    f->v[2] = (int64_t)cf & M62; cf >>= 62;
    g->v[2] = (int64_t)cg & M62; cg >>= 62;
    /* Compute limb 4 of t*[f,g], and store it as output limb 3. */
    cf += (int128_t)u * f4 + (int128_t)v * g4;
    cg += (int128_t)q * f4 + (int128_t)r * g4;
    f->v[3] = (int64_t)cf & M62; cf >>= 62;
    g->v[3] = (int64_t)cg & M62; cg >>= 62;
    /* What remains is limb 5 of t*[f,g]; store it as output limb 4. */
    f->v[4] = (int64_t)cf;
    g->v[4] = (int64_t)cg;
}
|
||||||
|
|
||||||
|
/* Compute (t/2^62) * [f, g], where t is a transition matrix for 62 divsteps.
 *
 * Version that operates on a variable number of limbs in f and g.
 *
 * This implements the update_fg function from the explanation.
 */
static void secp256k1_modinv64_update_fg_62_var(int len, secp256k1_modinv64_signed62 *f, secp256k1_modinv64_signed62 *g, const secp256k1_modinv64_trans2x2 *t) {
    const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
    const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
    int64_t fi, gi;
    int128_t cf, cg; /* 128-bit carry accumulators for the f and g limb chains. */
    int i;
    VERIFY_CHECK(len > 0);
    /* Start computing t*[f,g]. */
    fi = f->v[0];
    gi = g->v[0];
    cf = (int128_t)u * fi + (int128_t)v * gi;
    cg = (int128_t)q * fi + (int128_t)r * gi;
    /* Verify that the bottom 62 bits of the result are zero, and then throw them away. */
    VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62;
    VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62;
    /* Now iteratively compute limb i=1..len of t*[f,g], and store them in output limb i-1 (shifting
     * down by 62 bits). */
    for (i = 1; i < len; ++i) {
        fi = f->v[i];
        gi = g->v[i];
        cf += (int128_t)u * fi + (int128_t)v * gi;
        cg += (int128_t)q * fi + (int128_t)r * gi;
        f->v[i - 1] = (int64_t)cf & M62; cf >>= 62;
        g->v[i - 1] = (int64_t)cg & M62; cg >>= 62;
    }
    /* What remains is limb (len) of t*[f,g]; store it as output limb (len-1). */
    f->v[len - 1] = (int64_t)cf;
    g->v[len - 1] = (int64_t)cg;
}
|
||||||
|
|
||||||
|
/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x).
 *
 * x must be reduced modulo the modulus on input; on output it holds the inverse in [0,modulus)
 * (or 0 when x was 0). Runs a fixed number of iterations regardless of the value of x. */
static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo) {
    /* Start with d=0, e=1, f=modulus, g=x, zeta=-1. */
    secp256k1_modinv64_signed62 d = {{0, 0, 0, 0, 0}};
    secp256k1_modinv64_signed62 e = {{1, 0, 0, 0, 0}};
    secp256k1_modinv64_signed62 f = modinfo->modulus;
    secp256k1_modinv64_signed62 g = *x;
    int i;
    int64_t zeta = -1; /* zeta = -(delta+1/2); delta starts at 1/2. */

    /* Do 10 iterations of 59 divsteps each = 590 divsteps. This suffices for 256-bit inputs. */
    for (i = 0; i < 10; ++i) {
        /* Compute transition matrix and new zeta after 59 divsteps. */
        secp256k1_modinv64_trans2x2 t;
        zeta = secp256k1_modinv64_divsteps_59(zeta, f.v[0], g.v[0], &t);
        /* Update d,e using that transition matrix. */
        secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo);
        /* Update f,g using that transition matrix. */
#ifdef VERIFY
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, -1) > 0); /* f > -modulus */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, 1) <= 0); /* f <= modulus */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &modinfo->modulus, -1) > 0); /* g > -modulus */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &modinfo->modulus, 1) < 0); /* g < modulus */
#endif
        secp256k1_modinv64_update_fg_62(&f, &g, &t);
#ifdef VERIFY
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, -1) > 0); /* f > -modulus */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, 1) <= 0); /* f <= modulus */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &modinfo->modulus, -1) > 0); /* g > -modulus */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &modinfo->modulus, 1) < 0); /* g < modulus */
#endif
    }

    /* At this point sufficient iterations have been performed that g must have reached 0
     * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g
     * values i.e. +/- 1, and d now contains +/- the modular inverse. */
#ifdef VERIFY
    /* g == 0 */
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &SECP256K1_SIGNED62_ONE, 0) == 0);
    /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &SECP256K1_SIGNED62_ONE, -1) == 0 ||
                 secp256k1_modinv64_mul_cmp_62(&f, 5, &SECP256K1_SIGNED62_ONE, 1) == 0 ||
                 (secp256k1_modinv64_mul_cmp_62(x, 5, &SECP256K1_SIGNED62_ONE, 0) == 0 &&
                  secp256k1_modinv64_mul_cmp_62(&d, 5, &SECP256K1_SIGNED62_ONE, 0) == 0 &&
                  (secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, 1) == 0 ||
                   secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, -1) == 0)));
#endif

    /* Optionally negate d, normalize to [0,modulus), and return it. */
    secp256k1_modinv64_normalize_62(&d, f.v[4], modinfo);
    *x = d;
}
|
||||||
|
|
||||||
|
/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (variable time).
 *
 * Faster than secp256k1_modinv64 but its running time depends on the value of x,
 * so it must only be used on non-secret data. */
static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo) {
    /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */
    secp256k1_modinv64_signed62 d = {{0, 0, 0, 0, 0}};
    secp256k1_modinv64_signed62 e = {{1, 0, 0, 0, 0}};
    secp256k1_modinv64_signed62 f = modinfo->modulus;
    secp256k1_modinv64_signed62 g = *x;
#ifdef VERIFY
    int i = 0;
#endif
    int j, len = 5; /* len tracks how many limbs of f and g are still significant. */
    int64_t eta = -1; /* eta = -delta; delta is initially 1 */
    int64_t cond, fn, gn;

    /* Do iterations of 62 divsteps each until g=0. */
    while (1) {
        /* Compute transition matrix and new eta after 62 divsteps. */
        secp256k1_modinv64_trans2x2 t;
        eta = secp256k1_modinv64_divsteps_62_var(eta, f.v[0], g.v[0], &t);
        /* Update d,e using that transition matrix. */
        secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo);
        /* Update f,g using that transition matrix. */
#ifdef VERIFY
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */
#endif
        secp256k1_modinv64_update_fg_62_var(len, &f, &g, &t);
        /* If the bottom limb of g is zero, there is a chance that g=0. */
        if (g.v[0] == 0) {
            cond = 0;
            /* Check if the other limbs are also 0. */
            for (j = 1; j < len; ++j) {
                cond |= g.v[j];
            }
            /* If so, we're done. */
            if (cond == 0) break;
        }

        /* Determine if len>1 and limb (len-1) of both f and g is 0 or -1. */
        fn = f.v[len - 1];
        gn = g.v[len - 1];
        cond = ((int64_t)len - 2) >> 63; /* Nonzero iff len < 2. */
        cond |= fn ^ (fn >> 63); /* Nonzero unless fn is 0 or -1. */
        cond |= gn ^ (gn >> 63); /* Nonzero unless gn is 0 or -1. */
        /* If so, reduce length, propagating the sign of f and g's top limb into the one below. */
        if (cond == 0) {
            f.v[len - 2] |= (uint64_t)fn << 62;
            g.v[len - 2] |= (uint64_t)gn << 62;
            --len;
        }
#ifdef VERIFY
        VERIFY_CHECK(++i < 12); /* We should never need more than 12*62 = 744 divsteps */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */
        VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */
#endif
    }

    /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of
     * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */
#ifdef VERIFY
    /* g == 0 */
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &SECP256K1_SIGNED62_ONE, 0) == 0);
    /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */
    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &SECP256K1_SIGNED62_ONE, -1) == 0 ||
                 secp256k1_modinv64_mul_cmp_62(&f, len, &SECP256K1_SIGNED62_ONE, 1) == 0 ||
                 (secp256k1_modinv64_mul_cmp_62(x, 5, &SECP256K1_SIGNED62_ONE, 0) == 0 &&
                  secp256k1_modinv64_mul_cmp_62(&d, 5, &SECP256K1_SIGNED62_ONE, 0) == 0 &&
                  (secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, 1) == 0 ||
                   secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, -1) == 0)));
#endif

    /* Optionally negate d, normalize to [0,modulus), and return it. */
    secp256k1_modinv64_normalize_62(&d, f.v[len - 1], modinfo);
    *x = d;
}
|
||||||
|
|
||||||
|
#endif /* SECP256K1_MODINV64_IMPL_H */
|
|
@ -7,8 +7,8 @@
|
||||||
#ifndef SECP256K1_MODULE_ECDH_MAIN_H
|
#ifndef SECP256K1_MODULE_ECDH_MAIN_H
|
||||||
#define SECP256K1_MODULE_ECDH_MAIN_H
|
#define SECP256K1_MODULE_ECDH_MAIN_H
|
||||||
|
|
||||||
#include "include/secp256k1_ecdh.h"
|
#include "../../../include/secp256k1_ecdh.h"
|
||||||
#include "ecmult_const_impl.h"
|
#include "../../ecmult_const_impl.h"
|
||||||
|
|
||||||
static int ecdh_hash_function_sha256(unsigned char *output, const unsigned char *x32, const unsigned char *y32, void *data) {
|
static int ecdh_hash_function_sha256(unsigned char *output, const unsigned char *x32, const unsigned char *y32, void *data) {
|
||||||
unsigned char version = (y32[31] & 0x01) | 0x02;
|
unsigned char version = (y32[31] & 0x01) | 0x02;
|
||||||
|
|
|
@ -7,8 +7,8 @@
|
||||||
#ifndef SECP256K1_MODULE_EXTRAKEYS_MAIN_H
|
#ifndef SECP256K1_MODULE_EXTRAKEYS_MAIN_H
|
||||||
#define SECP256K1_MODULE_EXTRAKEYS_MAIN_H
|
#define SECP256K1_MODULE_EXTRAKEYS_MAIN_H
|
||||||
|
|
||||||
#include "include/secp256k1.h"
|
#include "../../../include/secp256k1.h"
|
||||||
#include "include/secp256k1_extrakeys.h"
|
#include "../../../include/secp256k1_extrakeys.h"
|
||||||
|
|
||||||
static SECP256K1_INLINE int secp256k1_xonly_pubkey_load(const secp256k1_context* ctx, secp256k1_ge *ge, const secp256k1_xonly_pubkey *pubkey) {
|
static SECP256K1_INLINE int secp256k1_xonly_pubkey_load(const secp256k1_context* ctx, secp256k1_ge *ge, const secp256k1_xonly_pubkey *pubkey) {
|
||||||
return secp256k1_pubkey_load(ctx, ge, (const secp256k1_pubkey *) pubkey);
|
return secp256k1_pubkey_load(ctx, ge, (const secp256k1_pubkey *) pubkey);
|
||||||
|
@ -55,6 +55,32 @@ int secp256k1_xonly_pubkey_serialize(const secp256k1_context* ctx, unsigned char
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Compare two x-only public keys by the lexicographic order of their 32-byte
 * serializations. Returns <0, 0, or >0 like memcmp. NULL or invalid keys
 * compare as all-zero serializations, i.e. below every valid key. */
int secp256k1_xonly_pubkey_cmp(const secp256k1_context* ctx, const secp256k1_xonly_pubkey* pk0, const secp256k1_xonly_pubkey* pk1) {
    unsigned char ser0[32];
    unsigned char ser1[32];

    VERIFY_CHECK(ctx != NULL);
    /* When a key is NULL or invalid, xonly_pubkey_serialize invokes the
     * illegal_callback and reports failure. We then treat that key as the
     * all-zero serialization, which sorts below any valid key. This keeps the
     * comparison total and consistent, so callers (e.g. sorting routines)
     * always terminate even when handed bad input. The explicit memset is
     * kept because the API does not guarantee the output buffer's contents on
     * failure, and that guarantee cannot be asserted here. */
    if (!secp256k1_xonly_pubkey_serialize(ctx, ser0, pk0)) {
        memset(ser0, 0, sizeof(ser0));
    }
    if (!secp256k1_xonly_pubkey_serialize(ctx, ser1, pk1)) {
        memset(ser1, 0, sizeof(ser1));
    }
    return secp256k1_memcmp_var(ser0, ser1, sizeof(ser1));
}
|
||||||
|
|
||||||
/** Keeps a group element as is if it has an even Y and otherwise negates it.
|
/** Keeps a group element as is if it has an even Y and otherwise negates it.
|
||||||
* y_parity is set to 0 in the former case and to 1 in the latter case.
|
* y_parity is set to 0 in the former case and to 1 in the latter case.
|
||||||
* Requires that the coordinates of r are normalized. */
|
* Requires that the coordinates of r are normalized. */
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
#define SECP256K1_MODULE_EXTRAKEYS_TESTS_EXHAUSTIVE_H
|
#define SECP256K1_MODULE_EXTRAKEYS_TESTS_EXHAUSTIVE_H
|
||||||
|
|
||||||
#include "src/modules/extrakeys/main_impl.h"
|
#include "src/modules/extrakeys/main_impl.h"
|
||||||
#include "include/secp256k1_extrakeys.h"
|
#include "../../../include/secp256k1_extrakeys.h"
|
||||||
|
|
||||||
static void test_exhaustive_extrakeys(const secp256k1_context *ctx, const secp256k1_ge* group) {
|
static void test_exhaustive_extrakeys(const secp256k1_context *ctx, const secp256k1_ge* group) {
|
||||||
secp256k1_keypair keypair[EXHAUSTIVE_TEST_ORDER - 1];
|
secp256k1_keypair keypair[EXHAUSTIVE_TEST_ORDER - 1];
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
#ifndef SECP256K1_MODULE_EXTRAKEYS_TESTS_H
|
#ifndef SECP256K1_MODULE_EXTRAKEYS_TESTS_H
|
||||||
#define SECP256K1_MODULE_EXTRAKEYS_TESTS_H
|
#define SECP256K1_MODULE_EXTRAKEYS_TESTS_H
|
||||||
|
|
||||||
#include "secp256k1_extrakeys.h"
|
#include "../../../include/secp256k1_extrakeys.h"
|
||||||
|
|
||||||
static secp256k1_context* api_test_context(int flags, int *ecount) {
|
static secp256k1_context* api_test_context(int flags, int *ecount) {
|
||||||
secp256k1_context *ctx0 = secp256k1_context_create(flags);
|
secp256k1_context *ctx0 = secp256k1_context_create(flags);
|
||||||
|
@ -137,6 +137,43 @@ void test_xonly_pubkey(void) {
|
||||||
secp256k1_context_destroy(verify);
|
secp256k1_context_destroy(verify);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void test_xonly_pubkey_comparison(void) {
|
||||||
|
unsigned char pk1_ser[32] = {
|
||||||
|
0x58, 0x84, 0xb3, 0xa2, 0x4b, 0x97, 0x37, 0x88, 0x92, 0x38, 0xa6, 0x26, 0x62, 0x52, 0x35, 0x11,
|
||||||
|
0xd0, 0x9a, 0xa1, 0x1b, 0x80, 0x0b, 0x5e, 0x93, 0x80, 0x26, 0x11, 0xef, 0x67, 0x4b, 0xd9, 0x23
|
||||||
|
};
|
||||||
|
const unsigned char pk2_ser[32] = {
|
||||||
|
0xde, 0x36, 0x0e, 0x87, 0x59, 0x8f, 0x3c, 0x01, 0x36, 0x2a, 0x2a, 0xb8, 0xc6, 0xf4, 0x5e, 0x4d,
|
||||||
|
0xb2, 0xc2, 0xd5, 0x03, 0xa7, 0xf9, 0xf1, 0x4f, 0xa8, 0xfa, 0x95, 0xa8, 0xe9, 0x69, 0x76, 0x1c
|
||||||
|
};
|
||||||
|
secp256k1_xonly_pubkey pk1;
|
||||||
|
secp256k1_xonly_pubkey pk2;
|
||||||
|
int ecount = 0;
|
||||||
|
secp256k1_context *none = api_test_context(SECP256K1_CONTEXT_NONE, &ecount);
|
||||||
|
|
||||||
|
CHECK(secp256k1_xonly_pubkey_parse(none, &pk1, pk1_ser) == 1);
|
||||||
|
CHECK(secp256k1_xonly_pubkey_parse(none, &pk2, pk2_ser) == 1);
|
||||||
|
|
||||||
|
CHECK(secp256k1_xonly_pubkey_cmp(none, NULL, &pk2) < 0);
|
||||||
|
CHECK(ecount == 1);
|
||||||
|
CHECK(secp256k1_xonly_pubkey_cmp(none, &pk1, NULL) > 0);
|
||||||
|
CHECK(ecount == 2);
|
||||||
|
CHECK(secp256k1_xonly_pubkey_cmp(none, &pk1, &pk2) < 0);
|
||||||
|
CHECK(secp256k1_xonly_pubkey_cmp(none, &pk2, &pk1) > 0);
|
||||||
|
CHECK(secp256k1_xonly_pubkey_cmp(none, &pk1, &pk1) == 0);
|
||||||
|
CHECK(secp256k1_xonly_pubkey_cmp(none, &pk2, &pk2) == 0);
|
||||||
|
CHECK(ecount == 2);
|
||||||
|
memset(&pk1, 0, sizeof(pk1)); /* illegal pubkey */
|
||||||
|
CHECK(secp256k1_xonly_pubkey_cmp(none, &pk1, &pk2) < 0);
|
||||||
|
CHECK(ecount == 3);
|
||||||
|
CHECK(secp256k1_xonly_pubkey_cmp(none, &pk1, &pk1) == 0);
|
||||||
|
CHECK(ecount == 5);
|
||||||
|
CHECK(secp256k1_xonly_pubkey_cmp(none, &pk2, &pk1) > 0);
|
||||||
|
CHECK(ecount == 6);
|
||||||
|
|
||||||
|
secp256k1_context_destroy(none);
|
||||||
|
}
|
||||||
|
|
||||||
void test_xonly_pubkey_tweak(void) {
|
void test_xonly_pubkey_tweak(void) {
|
||||||
unsigned char zeros64[64] = { 0 };
|
unsigned char zeros64[64] = { 0 };
|
||||||
unsigned char overflows[32];
|
unsigned char overflows[32];
|
||||||
|
@ -540,6 +577,7 @@ void run_extrakeys_tests(void) {
|
||||||
test_xonly_pubkey_tweak();
|
test_xonly_pubkey_tweak();
|
||||||
test_xonly_pubkey_tweak_check();
|
test_xonly_pubkey_tweak_check();
|
||||||
test_xonly_pubkey_tweak_recursive();
|
test_xonly_pubkey_tweak_recursive();
|
||||||
|
test_xonly_pubkey_comparison();
|
||||||
|
|
||||||
/* keypair tests */
|
/* keypair tests */
|
||||||
test_keypair();
|
test_keypair();
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
#ifndef SECP256K1_MODULE_RECOVERY_MAIN_H
|
#ifndef SECP256K1_MODULE_RECOVERY_MAIN_H
|
||||||
#define SECP256K1_MODULE_RECOVERY_MAIN_H
|
#define SECP256K1_MODULE_RECOVERY_MAIN_H
|
||||||
|
|
||||||
#include "include/secp256k1_recovery.h"
|
#include "../../../include/secp256k1_recovery.h"
|
||||||
|
|
||||||
static void secp256k1_ecdsa_recoverable_signature_load(const secp256k1_context* ctx, secp256k1_scalar* r, secp256k1_scalar* s, int* recid, const secp256k1_ecdsa_recoverable_signature* sig) {
|
static void secp256k1_ecdsa_recoverable_signature_load(const secp256k1_context* ctx, secp256k1_scalar* r, secp256k1_scalar* s, int* recid, const secp256k1_ecdsa_recoverable_signature* sig) {
|
||||||
(void)ctx;
|
(void)ctx;
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
#define SECP256K1_MODULE_RECOVERY_EXHAUSTIVE_TESTS_H
|
#define SECP256K1_MODULE_RECOVERY_EXHAUSTIVE_TESTS_H
|
||||||
|
|
||||||
#include "src/modules/recovery/main_impl.h"
|
#include "src/modules/recovery/main_impl.h"
|
||||||
#include "include/secp256k1_recovery.h"
|
#include "../../../include/secp256k1_recovery.h"
|
||||||
|
|
||||||
void test_exhaustive_recovery_sign(const secp256k1_context *ctx, const secp256k1_ge *group) {
|
void test_exhaustive_recovery_sign(const secp256k1_context *ctx, const secp256k1_ge *group) {
|
||||||
int i, j, k;
|
int i, j, k;
|
||||||
|
|
|
@ -7,9 +7,9 @@
|
||||||
#ifndef SECP256K1_MODULE_SCHNORRSIG_MAIN_H
|
#ifndef SECP256K1_MODULE_SCHNORRSIG_MAIN_H
|
||||||
#define SECP256K1_MODULE_SCHNORRSIG_MAIN_H
|
#define SECP256K1_MODULE_SCHNORRSIG_MAIN_H
|
||||||
|
|
||||||
#include "include/secp256k1.h"
|
#include "../../../include/secp256k1.h"
|
||||||
#include "include/secp256k1_schnorrsig.h"
|
#include "../../../include/secp256k1_schnorrsig.h"
|
||||||
#include "hash.h"
|
#include "../../hash.h"
|
||||||
|
|
||||||
/* Initializes SHA256 with fixed midstate. This midstate was computed by applying
|
/* Initializes SHA256 with fixed midstate. This midstate was computed by applying
|
||||||
* SHA256 to SHA256("BIP0340/nonce")||SHA256("BIP0340/nonce"). */
|
* SHA256 to SHA256("BIP0340/nonce")||SHA256("BIP0340/nonce"). */
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
#ifndef SECP256K1_MODULE_SCHNORRSIG_TESTS_EXHAUSTIVE_H
|
#ifndef SECP256K1_MODULE_SCHNORRSIG_TESTS_EXHAUSTIVE_H
|
||||||
#define SECP256K1_MODULE_SCHNORRSIG_TESTS_EXHAUSTIVE_H
|
#define SECP256K1_MODULE_SCHNORRSIG_TESTS_EXHAUSTIVE_H
|
||||||
|
|
||||||
#include "include/secp256k1_schnorrsig.h"
|
#include "../../../include/secp256k1_schnorrsig.h"
|
||||||
#include "src/modules/schnorrsig/main_impl.h"
|
#include "src/modules/schnorrsig/main_impl.h"
|
||||||
|
|
||||||
static const unsigned char invalid_pubkey_bytes[][32] = {
|
static const unsigned char invalid_pubkey_bytes[][32] = {
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
#ifndef SECP256K1_MODULE_SCHNORRSIG_TESTS_H
|
#ifndef SECP256K1_MODULE_SCHNORRSIG_TESTS_H
|
||||||
#define SECP256K1_MODULE_SCHNORRSIG_TESTS_H
|
#define SECP256K1_MODULE_SCHNORRSIG_TESTS_H
|
||||||
|
|
||||||
#include "secp256k1_schnorrsig.h"
|
#include "../../../include/secp256k1_schnorrsig.h"
|
||||||
|
|
||||||
/* Checks that a bit flip in the n_flip-th argument (that has n_bytes many
|
/* Checks that a bit flip in the n_flip-th argument (that has n_bytes many
|
||||||
* bytes) changes the hash function
|
* bytes) changes the hash function
|
||||||
|
@ -103,7 +103,7 @@ void test_schnorrsig_api(void) {
|
||||||
unsigned char sk3[32];
|
unsigned char sk3[32];
|
||||||
unsigned char msg[32];
|
unsigned char msg[32];
|
||||||
secp256k1_keypair keypairs[3];
|
secp256k1_keypair keypairs[3];
|
||||||
secp256k1_keypair invalid_keypair = { 0 };
|
secp256k1_keypair invalid_keypair = {{ 0 }};
|
||||||
secp256k1_xonly_pubkey pk[3];
|
secp256k1_xonly_pubkey pk[3];
|
||||||
secp256k1_xonly_pubkey zero_pk;
|
secp256k1_xonly_pubkey zero_pk;
|
||||||
unsigned char sig[64];
|
unsigned char sig[64];
|
||||||
|
|
|
@ -1,74 +0,0 @@
|
||||||
/***********************************************************************
|
|
||||||
* Copyright (c) 2013, 2014 Pieter Wuille *
|
|
||||||
* Distributed under the MIT software license, see the accompanying *
|
|
||||||
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
|
||||||
***********************************************************************/
|
|
||||||
|
|
||||||
#ifndef SECP256K1_NUM_H
|
|
||||||
#define SECP256K1_NUM_H
|
|
||||||
|
|
||||||
#ifndef USE_NUM_NONE
|
|
||||||
|
|
||||||
#if defined HAVE_CONFIG_H
|
|
||||||
#include "libsecp256k1-config.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(USE_NUM_GMP)
|
|
||||||
#include "num_gmp.h"
|
|
||||||
#else
|
|
||||||
#error "Please select num implementation"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/** Copy a number. */
|
|
||||||
static void secp256k1_num_copy(secp256k1_num *r, const secp256k1_num *a);
|
|
||||||
|
|
||||||
/** Convert a number's absolute value to a binary big-endian string.
|
|
||||||
* There must be enough place. */
|
|
||||||
static void secp256k1_num_get_bin(unsigned char *r, unsigned int rlen, const secp256k1_num *a);
|
|
||||||
|
|
||||||
/** Set a number to the value of a binary big-endian string. */
|
|
||||||
static void secp256k1_num_set_bin(secp256k1_num *r, const unsigned char *a, unsigned int alen);
|
|
||||||
|
|
||||||
/** Compute a modular inverse. The input must be less than the modulus. */
|
|
||||||
static void secp256k1_num_mod_inverse(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *m);
|
|
||||||
|
|
||||||
/** Compute the jacobi symbol (a|b). b must be positive and odd. */
|
|
||||||
static int secp256k1_num_jacobi(const secp256k1_num *a, const secp256k1_num *b);
|
|
||||||
|
|
||||||
/** Compare the absolute value of two numbers. */
|
|
||||||
static int secp256k1_num_cmp(const secp256k1_num *a, const secp256k1_num *b);
|
|
||||||
|
|
||||||
/** Test whether two number are equal (including sign). */
|
|
||||||
static int secp256k1_num_eq(const secp256k1_num *a, const secp256k1_num *b);
|
|
||||||
|
|
||||||
/** Add two (signed) numbers. */
|
|
||||||
static void secp256k1_num_add(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b);
|
|
||||||
|
|
||||||
/** Subtract two (signed) numbers. */
|
|
||||||
static void secp256k1_num_sub(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b);
|
|
||||||
|
|
||||||
/** Multiply two (signed) numbers. */
|
|
||||||
static void secp256k1_num_mul(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b);
|
|
||||||
|
|
||||||
/** Replace a number by its remainder modulo m. M's sign is ignored. The result is a number between 0 and m-1,
|
|
||||||
even if r was negative. */
|
|
||||||
static void secp256k1_num_mod(secp256k1_num *r, const secp256k1_num *m);
|
|
||||||
|
|
||||||
/** Right-shift the passed number by bits bits. */
|
|
||||||
static void secp256k1_num_shift(secp256k1_num *r, int bits);
|
|
||||||
|
|
||||||
/** Check whether a number is zero. */
|
|
||||||
static int secp256k1_num_is_zero(const secp256k1_num *a);
|
|
||||||
|
|
||||||
/** Check whether a number is one. */
|
|
||||||
static int secp256k1_num_is_one(const secp256k1_num *a);
|
|
||||||
|
|
||||||
/** Check whether a number is strictly negative. */
|
|
||||||
static int secp256k1_num_is_neg(const secp256k1_num *a);
|
|
||||||
|
|
||||||
/** Change a number's sign. */
|
|
||||||
static void secp256k1_num_negate(secp256k1_num *r);
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif /* SECP256K1_NUM_H */
|
|
|
@ -1,20 +0,0 @@
|
||||||
/***********************************************************************
|
|
||||||
* Copyright (c) 2013, 2014 Pieter Wuille *
|
|
||||||
* Distributed under the MIT software license, see the accompanying *
|
|
||||||
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
|
||||||
***********************************************************************/
|
|
||||||
|
|
||||||
#ifndef SECP256K1_NUM_REPR_H
|
|
||||||
#define SECP256K1_NUM_REPR_H
|
|
||||||
|
|
||||||
#include <gmp.h>
|
|
||||||
|
|
||||||
#define NUM_LIMBS ((256+GMP_NUMB_BITS-1)/GMP_NUMB_BITS)
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
mp_limb_t data[2*NUM_LIMBS];
|
|
||||||
int neg;
|
|
||||||
int limbs;
|
|
||||||
} secp256k1_num;
|
|
||||||
|
|
||||||
#endif /* SECP256K1_NUM_REPR_H */
|
|
|
@ -1,288 +0,0 @@
|
||||||
/***********************************************************************
|
|
||||||
* Copyright (c) 2013, 2014 Pieter Wuille *
|
|
||||||
* Distributed under the MIT software license, see the accompanying *
|
|
||||||
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
|
||||||
***********************************************************************/
|
|
||||||
|
|
||||||
#ifndef SECP256K1_NUM_REPR_IMPL_H
|
|
||||||
#define SECP256K1_NUM_REPR_IMPL_H
|
|
||||||
|
|
||||||
#include <string.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <gmp.h>
|
|
||||||
|
|
||||||
#include "util.h"
|
|
||||||
#include "num.h"
|
|
||||||
|
|
||||||
#ifdef VERIFY
|
|
||||||
static void secp256k1_num_sanity(const secp256k1_num *a) {
|
|
||||||
VERIFY_CHECK(a->limbs == 1 || (a->limbs > 1 && a->data[a->limbs-1] != 0));
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
#define secp256k1_num_sanity(a) do { } while(0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static void secp256k1_num_copy(secp256k1_num *r, const secp256k1_num *a) {
|
|
||||||
*r = *a;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_num_get_bin(unsigned char *r, unsigned int rlen, const secp256k1_num *a) {
|
|
||||||
unsigned char tmp[65];
|
|
||||||
int len = 0;
|
|
||||||
int shift = 0;
|
|
||||||
if (a->limbs>1 || a->data[0] != 0) {
|
|
||||||
len = mpn_get_str(tmp, 256, (mp_limb_t*)a->data, a->limbs);
|
|
||||||
}
|
|
||||||
while (shift < len && tmp[shift] == 0) shift++;
|
|
||||||
VERIFY_CHECK(len-shift <= (int)rlen);
|
|
||||||
memset(r, 0, rlen - len + shift);
|
|
||||||
if (len > shift) {
|
|
||||||
memcpy(r + rlen - len + shift, tmp + shift, len - shift);
|
|
||||||
}
|
|
||||||
memset(tmp, 0, sizeof(tmp));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_num_set_bin(secp256k1_num *r, const unsigned char *a, unsigned int alen) {
|
|
||||||
int len;
|
|
||||||
VERIFY_CHECK(alen > 0);
|
|
||||||
VERIFY_CHECK(alen <= 64);
|
|
||||||
len = mpn_set_str(r->data, a, alen, 256);
|
|
||||||
if (len == 0) {
|
|
||||||
r->data[0] = 0;
|
|
||||||
len = 1;
|
|
||||||
}
|
|
||||||
VERIFY_CHECK(len <= NUM_LIMBS*2);
|
|
||||||
r->limbs = len;
|
|
||||||
r->neg = 0;
|
|
||||||
while (r->limbs > 1 && r->data[r->limbs-1]==0) {
|
|
||||||
r->limbs--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_num_add_abs(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) {
|
|
||||||
mp_limb_t c = mpn_add(r->data, a->data, a->limbs, b->data, b->limbs);
|
|
||||||
r->limbs = a->limbs;
|
|
||||||
if (c != 0) {
|
|
||||||
VERIFY_CHECK(r->limbs < 2*NUM_LIMBS);
|
|
||||||
r->data[r->limbs++] = c;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_num_sub_abs(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) {
|
|
||||||
mp_limb_t c = mpn_sub(r->data, a->data, a->limbs, b->data, b->limbs);
|
|
||||||
(void)c;
|
|
||||||
VERIFY_CHECK(c == 0);
|
|
||||||
r->limbs = a->limbs;
|
|
||||||
while (r->limbs > 1 && r->data[r->limbs-1]==0) {
|
|
||||||
r->limbs--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_num_mod(secp256k1_num *r, const secp256k1_num *m) {
|
|
||||||
secp256k1_num_sanity(r);
|
|
||||||
secp256k1_num_sanity(m);
|
|
||||||
|
|
||||||
if (r->limbs >= m->limbs) {
|
|
||||||
mp_limb_t t[2*NUM_LIMBS];
|
|
||||||
mpn_tdiv_qr(t, r->data, 0, r->data, r->limbs, m->data, m->limbs);
|
|
||||||
memset(t, 0, sizeof(t));
|
|
||||||
r->limbs = m->limbs;
|
|
||||||
while (r->limbs > 1 && r->data[r->limbs-1]==0) {
|
|
||||||
r->limbs--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (r->neg && (r->limbs > 1 || r->data[0] != 0)) {
|
|
||||||
secp256k1_num_sub_abs(r, m, r);
|
|
||||||
r->neg = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_num_mod_inverse(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *m) {
|
|
||||||
int i;
|
|
||||||
mp_limb_t g[NUM_LIMBS+1];
|
|
||||||
mp_limb_t u[NUM_LIMBS+1];
|
|
||||||
mp_limb_t v[NUM_LIMBS+1];
|
|
||||||
mp_size_t sn;
|
|
||||||
mp_size_t gn;
|
|
||||||
secp256k1_num_sanity(a);
|
|
||||||
secp256k1_num_sanity(m);
|
|
||||||
|
|
||||||
/** mpn_gcdext computes: (G,S) = gcdext(U,V), where
|
|
||||||
* * G = gcd(U,V)
|
|
||||||
* * G = U*S + V*T
|
|
||||||
* * U has equal or more limbs than V, and V has no padding
|
|
||||||
* If we set U to be (a padded version of) a, and V = m:
|
|
||||||
* G = a*S + m*T
|
|
||||||
* G = a*S mod m
|
|
||||||
* Assuming G=1:
|
|
||||||
* S = 1/a mod m
|
|
||||||
*/
|
|
||||||
VERIFY_CHECK(m->limbs <= NUM_LIMBS);
|
|
||||||
VERIFY_CHECK(m->data[m->limbs-1] != 0);
|
|
||||||
for (i = 0; i < m->limbs; i++) {
|
|
||||||
u[i] = (i < a->limbs) ? a->data[i] : 0;
|
|
||||||
v[i] = m->data[i];
|
|
||||||
}
|
|
||||||
sn = NUM_LIMBS+1;
|
|
||||||
gn = mpn_gcdext(g, r->data, &sn, u, m->limbs, v, m->limbs);
|
|
||||||
(void)gn;
|
|
||||||
VERIFY_CHECK(gn == 1);
|
|
||||||
VERIFY_CHECK(g[0] == 1);
|
|
||||||
r->neg = a->neg ^ m->neg;
|
|
||||||
if (sn < 0) {
|
|
||||||
mpn_sub(r->data, m->data, m->limbs, r->data, -sn);
|
|
||||||
r->limbs = m->limbs;
|
|
||||||
while (r->limbs > 1 && r->data[r->limbs-1]==0) {
|
|
||||||
r->limbs--;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
r->limbs = sn;
|
|
||||||
}
|
|
||||||
memset(g, 0, sizeof(g));
|
|
||||||
memset(u, 0, sizeof(u));
|
|
||||||
memset(v, 0, sizeof(v));
|
|
||||||
}
|
|
||||||
|
|
||||||
static int secp256k1_num_jacobi(const secp256k1_num *a, const secp256k1_num *b) {
|
|
||||||
int ret;
|
|
||||||
mpz_t ga, gb;
|
|
||||||
secp256k1_num_sanity(a);
|
|
||||||
secp256k1_num_sanity(b);
|
|
||||||
VERIFY_CHECK(!b->neg && (b->limbs > 0) && (b->data[0] & 1));
|
|
||||||
|
|
||||||
mpz_inits(ga, gb, NULL);
|
|
||||||
|
|
||||||
mpz_import(gb, b->limbs, -1, sizeof(mp_limb_t), 0, 0, b->data);
|
|
||||||
mpz_import(ga, a->limbs, -1, sizeof(mp_limb_t), 0, 0, a->data);
|
|
||||||
if (a->neg) {
|
|
||||||
mpz_neg(ga, ga);
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = mpz_jacobi(ga, gb);
|
|
||||||
|
|
||||||
mpz_clears(ga, gb, NULL);
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int secp256k1_num_is_one(const secp256k1_num *a) {
|
|
||||||
return (a->limbs == 1 && a->data[0] == 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int secp256k1_num_is_zero(const secp256k1_num *a) {
|
|
||||||
return (a->limbs == 1 && a->data[0] == 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int secp256k1_num_is_neg(const secp256k1_num *a) {
|
|
||||||
return (a->limbs > 1 || a->data[0] != 0) && a->neg;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int secp256k1_num_cmp(const secp256k1_num *a, const secp256k1_num *b) {
|
|
||||||
if (a->limbs > b->limbs) {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (a->limbs < b->limbs) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
return mpn_cmp(a->data, b->data, a->limbs);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int secp256k1_num_eq(const secp256k1_num *a, const secp256k1_num *b) {
|
|
||||||
if (a->limbs > b->limbs) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (a->limbs < b->limbs) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if ((a->neg && !secp256k1_num_is_zero(a)) != (b->neg && !secp256k1_num_is_zero(b))) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
return mpn_cmp(a->data, b->data, a->limbs) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_num_subadd(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b, int bneg) {
|
|
||||||
if (!(b->neg ^ bneg ^ a->neg)) { /* a and b have the same sign */
|
|
||||||
r->neg = a->neg;
|
|
||||||
if (a->limbs >= b->limbs) {
|
|
||||||
secp256k1_num_add_abs(r, a, b);
|
|
||||||
} else {
|
|
||||||
secp256k1_num_add_abs(r, b, a);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (secp256k1_num_cmp(a, b) > 0) {
|
|
||||||
r->neg = a->neg;
|
|
||||||
secp256k1_num_sub_abs(r, a, b);
|
|
||||||
} else {
|
|
||||||
r->neg = b->neg ^ bneg;
|
|
||||||
secp256k1_num_sub_abs(r, b, a);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_num_add(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) {
|
|
||||||
secp256k1_num_sanity(a);
|
|
||||||
secp256k1_num_sanity(b);
|
|
||||||
secp256k1_num_subadd(r, a, b, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_num_sub(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) {
|
|
||||||
secp256k1_num_sanity(a);
|
|
||||||
secp256k1_num_sanity(b);
|
|
||||||
secp256k1_num_subadd(r, a, b, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_num_mul(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) {
|
|
||||||
mp_limb_t tmp[2*NUM_LIMBS+1];
|
|
||||||
secp256k1_num_sanity(a);
|
|
||||||
secp256k1_num_sanity(b);
|
|
||||||
|
|
||||||
VERIFY_CHECK(a->limbs + b->limbs <= 2*NUM_LIMBS+1);
|
|
||||||
if ((a->limbs==1 && a->data[0]==0) || (b->limbs==1 && b->data[0]==0)) {
|
|
||||||
r->limbs = 1;
|
|
||||||
r->neg = 0;
|
|
||||||
r->data[0] = 0;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (a->limbs >= b->limbs) {
|
|
||||||
mpn_mul(tmp, a->data, a->limbs, b->data, b->limbs);
|
|
||||||
} else {
|
|
||||||
mpn_mul(tmp, b->data, b->limbs, a->data, a->limbs);
|
|
||||||
}
|
|
||||||
r->limbs = a->limbs + b->limbs;
|
|
||||||
if (r->limbs > 1 && tmp[r->limbs - 1]==0) {
|
|
||||||
r->limbs--;
|
|
||||||
}
|
|
||||||
VERIFY_CHECK(r->limbs <= 2*NUM_LIMBS);
|
|
||||||
mpn_copyi(r->data, tmp, r->limbs);
|
|
||||||
r->neg = a->neg ^ b->neg;
|
|
||||||
memset(tmp, 0, sizeof(tmp));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_num_shift(secp256k1_num *r, int bits) {
|
|
||||||
if (bits % GMP_NUMB_BITS) {
|
|
||||||
/* Shift within limbs. */
|
|
||||||
mpn_rshift(r->data, r->data, r->limbs, bits % GMP_NUMB_BITS);
|
|
||||||
}
|
|
||||||
if (bits >= GMP_NUMB_BITS) {
|
|
||||||
int i;
|
|
||||||
/* Shift full limbs. */
|
|
||||||
for (i = 0; i < r->limbs; i++) {
|
|
||||||
int index = i + (bits / GMP_NUMB_BITS);
|
|
||||||
if (index < r->limbs && index < 2*NUM_LIMBS) {
|
|
||||||
r->data[i] = r->data[index];
|
|
||||||
} else {
|
|
||||||
r->data[i] = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
while (r->limbs>1 && r->data[r->limbs-1]==0) {
|
|
||||||
r->limbs--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_num_negate(secp256k1_num *r) {
|
|
||||||
r->neg ^= 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* SECP256K1_NUM_REPR_IMPL_H */
|
|
|
@ -1,24 +0,0 @@
|
||||||
/***********************************************************************
|
|
||||||
* Copyright (c) 2013, 2014 Pieter Wuille *
|
|
||||||
* Distributed under the MIT software license, see the accompanying *
|
|
||||||
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
|
||||||
***********************************************************************/
|
|
||||||
|
|
||||||
#ifndef SECP256K1_NUM_IMPL_H
|
|
||||||
#define SECP256K1_NUM_IMPL_H
|
|
||||||
|
|
||||||
#if defined HAVE_CONFIG_H
|
|
||||||
#include "libsecp256k1-config.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "num.h"
|
|
||||||
|
|
||||||
#if defined(USE_NUM_GMP)
|
|
||||||
#include "num_gmp_impl.h"
|
|
||||||
#elif defined(USE_NUM_NONE)
|
|
||||||
/* Nothing. */
|
|
||||||
#else
|
|
||||||
#error "Please select num implementation"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif /* SECP256K1_NUM_IMPL_H */
|
|
|
@ -7,7 +7,6 @@
|
||||||
#ifndef SECP256K1_SCALAR_H
|
#ifndef SECP256K1_SCALAR_H
|
||||||
#define SECP256K1_SCALAR_H
|
#define SECP256K1_SCALAR_H
|
||||||
|
|
||||||
#include "num.h"
|
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
|
|
||||||
#if defined HAVE_CONFIG_H
|
#if defined HAVE_CONFIG_H
|
||||||
|
@ -63,9 +62,6 @@ static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a,
|
||||||
* the low bits that were shifted off */
|
* the low bits that were shifted off */
|
||||||
static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n);
|
static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n);
|
||||||
|
|
||||||
/** Compute the square of a scalar (modulo the group order). */
|
|
||||||
static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a);
|
|
||||||
|
|
||||||
/** Compute the inverse of a scalar (modulo the group order). */
|
/** Compute the inverse of a scalar (modulo the group order). */
|
||||||
static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *a);
|
static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *a);
|
||||||
|
|
||||||
|
@ -91,14 +87,6 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar *a);
|
||||||
* Returns -1 if the number was negated, 1 otherwise */
|
* Returns -1 if the number was negated, 1 otherwise */
|
||||||
static int secp256k1_scalar_cond_negate(secp256k1_scalar *a, int flag);
|
static int secp256k1_scalar_cond_negate(secp256k1_scalar *a, int flag);
|
||||||
|
|
||||||
#ifndef USE_NUM_NONE
|
|
||||||
/** Convert a scalar to a number. */
|
|
||||||
static void secp256k1_scalar_get_num(secp256k1_num *r, const secp256k1_scalar *a);
|
|
||||||
|
|
||||||
/** Get the order of the group as a number. */
|
|
||||||
static void secp256k1_scalar_order_get_num(secp256k1_num *r);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/** Compare two scalars. */
|
/** Compare two scalars. */
|
||||||
static int secp256k1_scalar_eq(const secp256k1_scalar *a, const secp256k1_scalar *b);
|
static int secp256k1_scalar_eq(const secp256k1_scalar *a, const secp256k1_scalar *b);
|
||||||
|
|
||||||
|
|
|
@ -7,6 +7,8 @@
|
||||||
#ifndef SECP256K1_SCALAR_REPR_IMPL_H
|
#ifndef SECP256K1_SCALAR_REPR_IMPL_H
|
||||||
#define SECP256K1_SCALAR_REPR_IMPL_H
|
#define SECP256K1_SCALAR_REPR_IMPL_H
|
||||||
|
|
||||||
|
#include "modinv64_impl.h"
|
||||||
|
|
||||||
/* Limbs of the secp256k1 order. */
|
/* Limbs of the secp256k1 order. */
|
||||||
#define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL)
|
#define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL)
|
||||||
#define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL)
|
#define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL)
|
||||||
|
@ -240,28 +242,6 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) {
|
||||||
VERIFY_CHECK(c1 >= th); \
|
VERIFY_CHECK(c1 >= th); \
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */
|
|
||||||
#define muladd2(a,b) { \
|
|
||||||
uint64_t tl, th, th2, tl2; \
|
|
||||||
{ \
|
|
||||||
uint128_t t = (uint128_t)a * b; \
|
|
||||||
th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \
|
|
||||||
tl = t; \
|
|
||||||
} \
|
|
||||||
th2 = th + th; /* at most 0xFFFFFFFFFFFFFFFE (in case th was 0x7FFFFFFFFFFFFFFF) */ \
|
|
||||||
c2 += (th2 < th); /* never overflows by contract (verified the next line) */ \
|
|
||||||
VERIFY_CHECK((th2 >= th) || (c2 != 0)); \
|
|
||||||
tl2 = tl + tl; /* at most 0xFFFFFFFFFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFFFFFFFFFF) */ \
|
|
||||||
th2 += (tl2 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \
|
|
||||||
c0 += tl2; /* overflow is handled on the next line */ \
|
|
||||||
th2 += (c0 < tl2); /* second overflow is handled on the next line */ \
|
|
||||||
c2 += (c0 < tl2) & (th2 == 0); /* never overflows by contract (verified the next line) */ \
|
|
||||||
VERIFY_CHECK((c0 >= tl2) || (th2 != 0) || (c2 != 0)); \
|
|
||||||
c1 += th2; /* overflow is handled on the next line */ \
|
|
||||||
c2 += (c1 < th2); /* never overflows by contract (verified the next line) */ \
|
|
||||||
VERIFY_CHECK((c1 >= th2) || (c2 != 0)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */
|
/** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */
|
||||||
#define sumadd(a) { \
|
#define sumadd(a) { \
|
||||||
unsigned int over; \
|
unsigned int over; \
|
||||||
|
@ -771,148 +751,10 @@ static void secp256k1_scalar_mul_512(uint64_t l[8], const secp256k1_scalar *a, c
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static void secp256k1_scalar_sqr_512(uint64_t l[8], const secp256k1_scalar *a) {
|
|
||||||
#ifdef USE_ASM_X86_64
|
|
||||||
__asm__ __volatile__(
|
|
||||||
/* Preload */
|
|
||||||
"movq 0(%%rdi), %%r11\n"
|
|
||||||
"movq 8(%%rdi), %%r12\n"
|
|
||||||
"movq 16(%%rdi), %%r13\n"
|
|
||||||
"movq 24(%%rdi), %%r14\n"
|
|
||||||
/* (rax,rdx) = a0 * a0 */
|
|
||||||
"movq %%r11, %%rax\n"
|
|
||||||
"mulq %%r11\n"
|
|
||||||
/* Extract l0 */
|
|
||||||
"movq %%rax, 0(%%rsi)\n"
|
|
||||||
/* (r8,r9,r10) = (rdx,0) */
|
|
||||||
"movq %%rdx, %%r8\n"
|
|
||||||
"xorq %%r9, %%r9\n"
|
|
||||||
"xorq %%r10, %%r10\n"
|
|
||||||
/* (r8,r9,r10) += 2 * a0 * a1 */
|
|
||||||
"movq %%r11, %%rax\n"
|
|
||||||
"mulq %%r12\n"
|
|
||||||
"addq %%rax, %%r8\n"
|
|
||||||
"adcq %%rdx, %%r9\n"
|
|
||||||
"adcq $0, %%r10\n"
|
|
||||||
"addq %%rax, %%r8\n"
|
|
||||||
"adcq %%rdx, %%r9\n"
|
|
||||||
"adcq $0, %%r10\n"
|
|
||||||
/* Extract l1 */
|
|
||||||
"movq %%r8, 8(%%rsi)\n"
|
|
||||||
"xorq %%r8, %%r8\n"
|
|
||||||
/* (r9,r10,r8) += 2 * a0 * a2 */
|
|
||||||
"movq %%r11, %%rax\n"
|
|
||||||
"mulq %%r13\n"
|
|
||||||
"addq %%rax, %%r9\n"
|
|
||||||
"adcq %%rdx, %%r10\n"
|
|
||||||
"adcq $0, %%r8\n"
|
|
||||||
"addq %%rax, %%r9\n"
|
|
||||||
"adcq %%rdx, %%r10\n"
|
|
||||||
"adcq $0, %%r8\n"
|
|
||||||
/* (r9,r10,r8) += a1 * a1 */
|
|
||||||
"movq %%r12, %%rax\n"
|
|
||||||
"mulq %%r12\n"
|
|
||||||
"addq %%rax, %%r9\n"
|
|
||||||
"adcq %%rdx, %%r10\n"
|
|
||||||
"adcq $0, %%r8\n"
|
|
||||||
/* Extract l2 */
|
|
||||||
"movq %%r9, 16(%%rsi)\n"
|
|
||||||
"xorq %%r9, %%r9\n"
|
|
||||||
/* (r10,r8,r9) += 2 * a0 * a3 */
|
|
||||||
"movq %%r11, %%rax\n"
|
|
||||||
"mulq %%r14\n"
|
|
||||||
"addq %%rax, %%r10\n"
|
|
||||||
"adcq %%rdx, %%r8\n"
|
|
||||||
"adcq $0, %%r9\n"
|
|
||||||
"addq %%rax, %%r10\n"
|
|
||||||
"adcq %%rdx, %%r8\n"
|
|
||||||
"adcq $0, %%r9\n"
|
|
||||||
/* (r10,r8,r9) += 2 * a1 * a2 */
|
|
||||||
"movq %%r12, %%rax\n"
|
|
||||||
"mulq %%r13\n"
|
|
||||||
"addq %%rax, %%r10\n"
|
|
||||||
"adcq %%rdx, %%r8\n"
|
|
||||||
"adcq $0, %%r9\n"
|
|
||||||
"addq %%rax, %%r10\n"
|
|
||||||
"adcq %%rdx, %%r8\n"
|
|
||||||
"adcq $0, %%r9\n"
|
|
||||||
/* Extract l3 */
|
|
||||||
"movq %%r10, 24(%%rsi)\n"
|
|
||||||
"xorq %%r10, %%r10\n"
|
|
||||||
/* (r8,r9,r10) += 2 * a1 * a3 */
|
|
||||||
"movq %%r12, %%rax\n"
|
|
||||||
"mulq %%r14\n"
|
|
||||||
"addq %%rax, %%r8\n"
|
|
||||||
"adcq %%rdx, %%r9\n"
|
|
||||||
"adcq $0, %%r10\n"
|
|
||||||
"addq %%rax, %%r8\n"
|
|
||||||
"adcq %%rdx, %%r9\n"
|
|
||||||
"adcq $0, %%r10\n"
|
|
||||||
/* (r8,r9,r10) += a2 * a2 */
|
|
||||||
"movq %%r13, %%rax\n"
|
|
||||||
"mulq %%r13\n"
|
|
||||||
"addq %%rax, %%r8\n"
|
|
||||||
"adcq %%rdx, %%r9\n"
|
|
||||||
"adcq $0, %%r10\n"
|
|
||||||
/* Extract l4 */
|
|
||||||
"movq %%r8, 32(%%rsi)\n"
|
|
||||||
"xorq %%r8, %%r8\n"
|
|
||||||
/* (r9,r10,r8) += 2 * a2 * a3 */
|
|
||||||
"movq %%r13, %%rax\n"
|
|
||||||
"mulq %%r14\n"
|
|
||||||
"addq %%rax, %%r9\n"
|
|
||||||
"adcq %%rdx, %%r10\n"
|
|
||||||
"adcq $0, %%r8\n"
|
|
||||||
"addq %%rax, %%r9\n"
|
|
||||||
"adcq %%rdx, %%r10\n"
|
|
||||||
"adcq $0, %%r8\n"
|
|
||||||
/* Extract l5 */
|
|
||||||
"movq %%r9, 40(%%rsi)\n"
|
|
||||||
/* (r10,r8) += a3 * a3 */
|
|
||||||
"movq %%r14, %%rax\n"
|
|
||||||
"mulq %%r14\n"
|
|
||||||
"addq %%rax, %%r10\n"
|
|
||||||
"adcq %%rdx, %%r8\n"
|
|
||||||
/* Extract l6 */
|
|
||||||
"movq %%r10, 48(%%rsi)\n"
|
|
||||||
/* Extract l7 */
|
|
||||||
"movq %%r8, 56(%%rsi)\n"
|
|
||||||
:
|
|
||||||
: "S"(l), "D"(a->d)
|
|
||||||
: "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc", "memory");
|
|
||||||
#else
|
|
||||||
/* 160 bit accumulator. */
|
|
||||||
uint64_t c0 = 0, c1 = 0;
|
|
||||||
uint32_t c2 = 0;
|
|
||||||
|
|
||||||
/* l[0..7] = a[0..3] * b[0..3]. */
|
|
||||||
muladd_fast(a->d[0], a->d[0]);
|
|
||||||
extract_fast(l[0]);
|
|
||||||
muladd2(a->d[0], a->d[1]);
|
|
||||||
extract(l[1]);
|
|
||||||
muladd2(a->d[0], a->d[2]);
|
|
||||||
muladd(a->d[1], a->d[1]);
|
|
||||||
extract(l[2]);
|
|
||||||
muladd2(a->d[0], a->d[3]);
|
|
||||||
muladd2(a->d[1], a->d[2]);
|
|
||||||
extract(l[3]);
|
|
||||||
muladd2(a->d[1], a->d[3]);
|
|
||||||
muladd(a->d[2], a->d[2]);
|
|
||||||
extract(l[4]);
|
|
||||||
muladd2(a->d[2], a->d[3]);
|
|
||||||
extract(l[5]);
|
|
||||||
muladd_fast(a->d[3], a->d[3]);
|
|
||||||
extract_fast(l[6]);
|
|
||||||
VERIFY_CHECK(c1 == 0);
|
|
||||||
l[7] = c0;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
#undef sumadd
|
#undef sumadd
|
||||||
#undef sumadd_fast
|
#undef sumadd_fast
|
||||||
#undef muladd
|
#undef muladd
|
||||||
#undef muladd_fast
|
#undef muladd_fast
|
||||||
#undef muladd2
|
|
||||||
#undef extract
|
#undef extract
|
||||||
#undef extract_fast
|
#undef extract_fast
|
||||||
|
|
||||||
|
@ -934,12 +776,6 @@ static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) {
|
|
||||||
uint64_t l[8];
|
|
||||||
secp256k1_scalar_sqr_512(l, a);
|
|
||||||
secp256k1_scalar_reduce_512(r, l);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) {
|
static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) {
|
||||||
r1->d[0] = k->d[0];
|
r1->d[0] = k->d[0];
|
||||||
r1->d[1] = k->d[1];
|
r1->d[1] = k->d[1];
|
||||||
|
@ -983,4 +819,78 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se
|
||||||
r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1);
|
r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void secp256k1_scalar_from_signed62(secp256k1_scalar *r, const secp256k1_modinv64_signed62 *a) {
|
||||||
|
const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4];
|
||||||
|
|
||||||
|
/* The output from secp256k1_modinv64{_var} should be normalized to range [0,modulus), and
|
||||||
|
* have limbs in [0,2^62). The modulus is < 2^256, so the top limb must be below 2^(256-62*4).
|
||||||
|
*/
|
||||||
|
VERIFY_CHECK(a0 >> 62 == 0);
|
||||||
|
VERIFY_CHECK(a1 >> 62 == 0);
|
||||||
|
VERIFY_CHECK(a2 >> 62 == 0);
|
||||||
|
VERIFY_CHECK(a3 >> 62 == 0);
|
||||||
|
VERIFY_CHECK(a4 >> 8 == 0);
|
||||||
|
|
||||||
|
r->d[0] = a0 | a1 << 62;
|
||||||
|
r->d[1] = a1 >> 2 | a2 << 60;
|
||||||
|
r->d[2] = a2 >> 4 | a3 << 58;
|
||||||
|
r->d[3] = a3 >> 6 | a4 << 56;
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void secp256k1_scalar_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_scalar *a) {
|
||||||
|
const uint64_t M62 = UINT64_MAX >> 2;
|
||||||
|
const uint64_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3];
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
r->v[0] = a0 & M62;
|
||||||
|
r->v[1] = (a0 >> 62 | a1 << 2) & M62;
|
||||||
|
r->v[2] = (a1 >> 60 | a2 << 4) & M62;
|
||||||
|
r->v[3] = (a2 >> 58 | a3 << 6) & M62;
|
||||||
|
r->v[4] = a3 >> 56;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const secp256k1_modinv64_modinfo secp256k1_const_modinfo_scalar = {
|
||||||
|
{{0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256}},
|
||||||
|
0x34F20099AA774EC1LL
|
||||||
|
};
|
||||||
|
|
||||||
|
static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) {
|
||||||
|
secp256k1_modinv64_signed62 s;
|
||||||
|
#ifdef VERIFY
|
||||||
|
int zero_in = secp256k1_scalar_is_zero(x);
|
||||||
|
#endif
|
||||||
|
secp256k1_scalar_to_signed62(&s, x);
|
||||||
|
secp256k1_modinv64(&s, &secp256k1_const_modinfo_scalar);
|
||||||
|
secp256k1_scalar_from_signed62(r, &s);
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
VERIFY_CHECK(secp256k1_scalar_is_zero(r) == zero_in);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) {
|
||||||
|
secp256k1_modinv64_signed62 s;
|
||||||
|
#ifdef VERIFY
|
||||||
|
int zero_in = secp256k1_scalar_is_zero(x);
|
||||||
|
#endif
|
||||||
|
secp256k1_scalar_to_signed62(&s, x);
|
||||||
|
secp256k1_modinv64_var(&s, &secp256k1_const_modinfo_scalar);
|
||||||
|
secp256k1_scalar_from_signed62(r, &s);
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
VERIFY_CHECK(secp256k1_scalar_is_zero(r) == zero_in);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) {
|
||||||
|
return !(a->d[0] & 1);
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* SECP256K1_SCALAR_REPR_IMPL_H */
|
#endif /* SECP256K1_SCALAR_REPR_IMPL_H */
|
||||||
|
|
|
@ -7,6 +7,8 @@
|
||||||
#ifndef SECP256K1_SCALAR_REPR_IMPL_H
|
#ifndef SECP256K1_SCALAR_REPR_IMPL_H
|
||||||
#define SECP256K1_SCALAR_REPR_IMPL_H
|
#define SECP256K1_SCALAR_REPR_IMPL_H
|
||||||
|
|
||||||
|
#include "modinv32_impl.h"
|
||||||
|
|
||||||
/* Limbs of the secp256k1 order. */
|
/* Limbs of the secp256k1 order. */
|
||||||
#define SECP256K1_N_0 ((uint32_t)0xD0364141UL)
|
#define SECP256K1_N_0 ((uint32_t)0xD0364141UL)
|
||||||
#define SECP256K1_N_1 ((uint32_t)0xBFD25E8CUL)
|
#define SECP256K1_N_1 ((uint32_t)0xBFD25E8CUL)
|
||||||
|
@ -291,28 +293,6 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) {
|
||||||
VERIFY_CHECK(c1 >= th); \
|
VERIFY_CHECK(c1 >= th); \
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */
|
|
||||||
#define muladd2(a,b) { \
|
|
||||||
uint32_t tl, th, th2, tl2; \
|
|
||||||
{ \
|
|
||||||
uint64_t t = (uint64_t)a * b; \
|
|
||||||
th = t >> 32; /* at most 0xFFFFFFFE */ \
|
|
||||||
tl = t; \
|
|
||||||
} \
|
|
||||||
th2 = th + th; /* at most 0xFFFFFFFE (in case th was 0x7FFFFFFF) */ \
|
|
||||||
c2 += (th2 < th); /* never overflows by contract (verified the next line) */ \
|
|
||||||
VERIFY_CHECK((th2 >= th) || (c2 != 0)); \
|
|
||||||
tl2 = tl + tl; /* at most 0xFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFF) */ \
|
|
||||||
th2 += (tl2 < tl); /* at most 0xFFFFFFFF */ \
|
|
||||||
c0 += tl2; /* overflow is handled on the next line */ \
|
|
||||||
th2 += (c0 < tl2); /* second overflow is handled on the next line */ \
|
|
||||||
c2 += (c0 < tl2) & (th2 == 0); /* never overflows by contract (verified the next line) */ \
|
|
||||||
VERIFY_CHECK((c0 >= tl2) || (th2 != 0) || (c2 != 0)); \
|
|
||||||
c1 += th2; /* overflow is handled on the next line */ \
|
|
||||||
c2 += (c1 < th2); /* never overflows by contract (verified the next line) */ \
|
|
||||||
VERIFY_CHECK((c1 >= th2) || (c2 != 0)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */
|
/** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */
|
||||||
#define sumadd(a) { \
|
#define sumadd(a) { \
|
||||||
unsigned int over; \
|
unsigned int over; \
|
||||||
|
@ -576,71 +556,10 @@ static void secp256k1_scalar_mul_512(uint32_t *l, const secp256k1_scalar *a, con
|
||||||
l[15] = c0;
|
l[15] = c0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void secp256k1_scalar_sqr_512(uint32_t *l, const secp256k1_scalar *a) {
|
|
||||||
/* 96 bit accumulator. */
|
|
||||||
uint32_t c0 = 0, c1 = 0, c2 = 0;
|
|
||||||
|
|
||||||
/* l[0..15] = a[0..7]^2. */
|
|
||||||
muladd_fast(a->d[0], a->d[0]);
|
|
||||||
extract_fast(l[0]);
|
|
||||||
muladd2(a->d[0], a->d[1]);
|
|
||||||
extract(l[1]);
|
|
||||||
muladd2(a->d[0], a->d[2]);
|
|
||||||
muladd(a->d[1], a->d[1]);
|
|
||||||
extract(l[2]);
|
|
||||||
muladd2(a->d[0], a->d[3]);
|
|
||||||
muladd2(a->d[1], a->d[2]);
|
|
||||||
extract(l[3]);
|
|
||||||
muladd2(a->d[0], a->d[4]);
|
|
||||||
muladd2(a->d[1], a->d[3]);
|
|
||||||
muladd(a->d[2], a->d[2]);
|
|
||||||
extract(l[4]);
|
|
||||||
muladd2(a->d[0], a->d[5]);
|
|
||||||
muladd2(a->d[1], a->d[4]);
|
|
||||||
muladd2(a->d[2], a->d[3]);
|
|
||||||
extract(l[5]);
|
|
||||||
muladd2(a->d[0], a->d[6]);
|
|
||||||
muladd2(a->d[1], a->d[5]);
|
|
||||||
muladd2(a->d[2], a->d[4]);
|
|
||||||
muladd(a->d[3], a->d[3]);
|
|
||||||
extract(l[6]);
|
|
||||||
muladd2(a->d[0], a->d[7]);
|
|
||||||
muladd2(a->d[1], a->d[6]);
|
|
||||||
muladd2(a->d[2], a->d[5]);
|
|
||||||
muladd2(a->d[3], a->d[4]);
|
|
||||||
extract(l[7]);
|
|
||||||
muladd2(a->d[1], a->d[7]);
|
|
||||||
muladd2(a->d[2], a->d[6]);
|
|
||||||
muladd2(a->d[3], a->d[5]);
|
|
||||||
muladd(a->d[4], a->d[4]);
|
|
||||||
extract(l[8]);
|
|
||||||
muladd2(a->d[2], a->d[7]);
|
|
||||||
muladd2(a->d[3], a->d[6]);
|
|
||||||
muladd2(a->d[4], a->d[5]);
|
|
||||||
extract(l[9]);
|
|
||||||
muladd2(a->d[3], a->d[7]);
|
|
||||||
muladd2(a->d[4], a->d[6]);
|
|
||||||
muladd(a->d[5], a->d[5]);
|
|
||||||
extract(l[10]);
|
|
||||||
muladd2(a->d[4], a->d[7]);
|
|
||||||
muladd2(a->d[5], a->d[6]);
|
|
||||||
extract(l[11]);
|
|
||||||
muladd2(a->d[5], a->d[7]);
|
|
||||||
muladd(a->d[6], a->d[6]);
|
|
||||||
extract(l[12]);
|
|
||||||
muladd2(a->d[6], a->d[7]);
|
|
||||||
extract(l[13]);
|
|
||||||
muladd_fast(a->d[7], a->d[7]);
|
|
||||||
extract_fast(l[14]);
|
|
||||||
VERIFY_CHECK(c1 == 0);
|
|
||||||
l[15] = c0;
|
|
||||||
}
|
|
||||||
|
|
||||||
#undef sumadd
|
#undef sumadd
|
||||||
#undef sumadd_fast
|
#undef sumadd_fast
|
||||||
#undef muladd
|
#undef muladd
|
||||||
#undef muladd_fast
|
#undef muladd_fast
|
||||||
#undef muladd2
|
|
||||||
#undef extract
|
#undef extract
|
||||||
#undef extract_fast
|
#undef extract_fast
|
||||||
|
|
||||||
|
@ -666,12 +585,6 @@ static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) {
|
|
||||||
uint32_t l[16];
|
|
||||||
secp256k1_scalar_sqr_512(l, a);
|
|
||||||
secp256k1_scalar_reduce_512(r, l);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) {
|
static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) {
|
||||||
r1->d[0] = k->d[0];
|
r1->d[0] = k->d[0];
|
||||||
r1->d[1] = k->d[1];
|
r1->d[1] = k->d[1];
|
||||||
|
@ -731,4 +644,92 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se
|
||||||
r->d[7] = (r->d[7] & mask0) | (a->d[7] & mask1);
|
r->d[7] = (r->d[7] & mask0) | (a->d[7] & mask1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void secp256k1_scalar_from_signed30(secp256k1_scalar *r, const secp256k1_modinv32_signed30 *a) {
|
||||||
|
const uint32_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4],
|
||||||
|
a5 = a->v[5], a6 = a->v[6], a7 = a->v[7], a8 = a->v[8];
|
||||||
|
|
||||||
|
/* The output from secp256k1_modinv32{_var} should be normalized to range [0,modulus), and
|
||||||
|
* have limbs in [0,2^30). The modulus is < 2^256, so the top limb must be below 2^(256-30*8).
|
||||||
|
*/
|
||||||
|
VERIFY_CHECK(a0 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a1 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a2 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a3 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a4 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a5 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a6 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a7 >> 30 == 0);
|
||||||
|
VERIFY_CHECK(a8 >> 16 == 0);
|
||||||
|
|
||||||
|
r->d[0] = a0 | a1 << 30;
|
||||||
|
r->d[1] = a1 >> 2 | a2 << 28;
|
||||||
|
r->d[2] = a2 >> 4 | a3 << 26;
|
||||||
|
r->d[3] = a3 >> 6 | a4 << 24;
|
||||||
|
r->d[4] = a4 >> 8 | a5 << 22;
|
||||||
|
r->d[5] = a5 >> 10 | a6 << 20;
|
||||||
|
r->d[6] = a6 >> 12 | a7 << 18;
|
||||||
|
r->d[7] = a7 >> 14 | a8 << 16;
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void secp256k1_scalar_to_signed30(secp256k1_modinv32_signed30 *r, const secp256k1_scalar *a) {
|
||||||
|
const uint32_t M30 = UINT32_MAX >> 2;
|
||||||
|
const uint32_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3],
|
||||||
|
a4 = a->d[4], a5 = a->d[5], a6 = a->d[6], a7 = a->d[7];
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
r->v[0] = a0 & M30;
|
||||||
|
r->v[1] = (a0 >> 30 | a1 << 2) & M30;
|
||||||
|
r->v[2] = (a1 >> 28 | a2 << 4) & M30;
|
||||||
|
r->v[3] = (a2 >> 26 | a3 << 6) & M30;
|
||||||
|
r->v[4] = (a3 >> 24 | a4 << 8) & M30;
|
||||||
|
r->v[5] = (a4 >> 22 | a5 << 10) & M30;
|
||||||
|
r->v[6] = (a5 >> 20 | a6 << 12) & M30;
|
||||||
|
r->v[7] = (a6 >> 18 | a7 << 14) & M30;
|
||||||
|
r->v[8] = a7 >> 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const secp256k1_modinv32_modinfo secp256k1_const_modinfo_scalar = {
|
||||||
|
{{0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, -0x146L, 0, 0, 0, 65536}},
|
||||||
|
0x2A774EC1L
|
||||||
|
};
|
||||||
|
|
||||||
|
static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) {
|
||||||
|
secp256k1_modinv32_signed30 s;
|
||||||
|
#ifdef VERIFY
|
||||||
|
int zero_in = secp256k1_scalar_is_zero(x);
|
||||||
|
#endif
|
||||||
|
secp256k1_scalar_to_signed30(&s, x);
|
||||||
|
secp256k1_modinv32(&s, &secp256k1_const_modinfo_scalar);
|
||||||
|
secp256k1_scalar_from_signed30(r, &s);
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
VERIFY_CHECK(secp256k1_scalar_is_zero(r) == zero_in);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) {
|
||||||
|
secp256k1_modinv32_signed30 s;
|
||||||
|
#ifdef VERIFY
|
||||||
|
int zero_in = secp256k1_scalar_is_zero(x);
|
||||||
|
#endif
|
||||||
|
secp256k1_scalar_to_signed30(&s, x);
|
||||||
|
secp256k1_modinv32_var(&s, &secp256k1_const_modinfo_scalar);
|
||||||
|
secp256k1_scalar_from_signed30(r, &s);
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
VERIFY_CHECK(secp256k1_scalar_is_zero(r) == zero_in);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) {
|
||||||
|
return !(a->d[0] & 1);
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* SECP256K1_SCALAR_REPR_IMPL_H */
|
#endif /* SECP256K1_SCALAR_REPR_IMPL_H */
|
||||||
|
|
|
@ -31,231 +31,12 @@
|
||||||
static const secp256k1_scalar secp256k1_scalar_one = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 1);
|
static const secp256k1_scalar secp256k1_scalar_one = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 1);
|
||||||
static const secp256k1_scalar secp256k1_scalar_zero = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0);
|
static const secp256k1_scalar secp256k1_scalar_zero = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0);
|
||||||
|
|
||||||
#ifndef USE_NUM_NONE
|
|
||||||
static void secp256k1_scalar_get_num(secp256k1_num *r, const secp256k1_scalar *a) {
|
|
||||||
unsigned char c[32];
|
|
||||||
secp256k1_scalar_get_b32(c, a);
|
|
||||||
secp256k1_num_set_bin(r, c, 32);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** secp256k1 curve order, see secp256k1_ecdsa_const_order_as_fe in ecdsa_impl.h */
|
|
||||||
static void secp256k1_scalar_order_get_num(secp256k1_num *r) {
|
|
||||||
#if defined(EXHAUSTIVE_TEST_ORDER)
|
|
||||||
static const unsigned char order[32] = {
|
|
||||||
0,0,0,0,0,0,0,0,
|
|
||||||
0,0,0,0,0,0,0,0,
|
|
||||||
0,0,0,0,0,0,0,0,
|
|
||||||
0,0,0,0,0,0,0,EXHAUSTIVE_TEST_ORDER
|
|
||||||
};
|
|
||||||
#else
|
|
||||||
static const unsigned char order[32] = {
|
|
||||||
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
|
|
||||||
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,
|
|
||||||
0xBA,0xAE,0xDC,0xE6,0xAF,0x48,0xA0,0x3B,
|
|
||||||
0xBF,0xD2,0x5E,0x8C,0xD0,0x36,0x41,0x41
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
secp256k1_num_set_bin(r, order, 32);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static int secp256k1_scalar_set_b32_seckey(secp256k1_scalar *r, const unsigned char *bin) {
|
static int secp256k1_scalar_set_b32_seckey(secp256k1_scalar *r, const unsigned char *bin) {
|
||||||
int overflow;
|
int overflow;
|
||||||
secp256k1_scalar_set_b32(r, bin, &overflow);
|
secp256k1_scalar_set_b32(r, bin, &overflow);
|
||||||
return (!overflow) & (!secp256k1_scalar_is_zero(r));
|
return (!overflow) & (!secp256k1_scalar_is_zero(r));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) {
|
|
||||||
#if defined(EXHAUSTIVE_TEST_ORDER)
|
|
||||||
int i;
|
|
||||||
*r = 0;
|
|
||||||
for (i = 0; i < EXHAUSTIVE_TEST_ORDER; i++)
|
|
||||||
if ((i * *x) % EXHAUSTIVE_TEST_ORDER == 1)
|
|
||||||
*r = i;
|
|
||||||
/* If this VERIFY_CHECK triggers we were given a noninvertible scalar (and thus
|
|
||||||
* have a composite group order; fix it in exhaustive_tests.c). */
|
|
||||||
VERIFY_CHECK(*r != 0);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
secp256k1_scalar *t;
|
|
||||||
int i;
|
|
||||||
/* First compute xN as x ^ (2^N - 1) for some values of N,
|
|
||||||
* and uM as x ^ M for some values of M. */
|
|
||||||
secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126;
|
|
||||||
secp256k1_scalar u2, u5, u9, u11, u13;
|
|
||||||
|
|
||||||
secp256k1_scalar_sqr(&u2, x);
|
|
||||||
secp256k1_scalar_mul(&x2, &u2, x);
|
|
||||||
secp256k1_scalar_mul(&u5, &u2, &x2);
|
|
||||||
secp256k1_scalar_mul(&x3, &u5, &u2);
|
|
||||||
secp256k1_scalar_mul(&u9, &x3, &u2);
|
|
||||||
secp256k1_scalar_mul(&u11, &u9, &u2);
|
|
||||||
secp256k1_scalar_mul(&u13, &u11, &u2);
|
|
||||||
|
|
||||||
secp256k1_scalar_sqr(&x6, &u13);
|
|
||||||
secp256k1_scalar_sqr(&x6, &x6);
|
|
||||||
secp256k1_scalar_mul(&x6, &x6, &u11);
|
|
||||||
|
|
||||||
secp256k1_scalar_sqr(&x8, &x6);
|
|
||||||
secp256k1_scalar_sqr(&x8, &x8);
|
|
||||||
secp256k1_scalar_mul(&x8, &x8, &x2);
|
|
||||||
|
|
||||||
secp256k1_scalar_sqr(&x14, &x8);
|
|
||||||
for (i = 0; i < 5; i++) {
|
|
||||||
secp256k1_scalar_sqr(&x14, &x14);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(&x14, &x14, &x6);
|
|
||||||
|
|
||||||
secp256k1_scalar_sqr(&x28, &x14);
|
|
||||||
for (i = 0; i < 13; i++) {
|
|
||||||
secp256k1_scalar_sqr(&x28, &x28);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(&x28, &x28, &x14);
|
|
||||||
|
|
||||||
secp256k1_scalar_sqr(&x56, &x28);
|
|
||||||
for (i = 0; i < 27; i++) {
|
|
||||||
secp256k1_scalar_sqr(&x56, &x56);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(&x56, &x56, &x28);
|
|
||||||
|
|
||||||
secp256k1_scalar_sqr(&x112, &x56);
|
|
||||||
for (i = 0; i < 55; i++) {
|
|
||||||
secp256k1_scalar_sqr(&x112, &x112);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(&x112, &x112, &x56);
|
|
||||||
|
|
||||||
secp256k1_scalar_sqr(&x126, &x112);
|
|
||||||
for (i = 0; i < 13; i++) {
|
|
||||||
secp256k1_scalar_sqr(&x126, &x126);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(&x126, &x126, &x14);
|
|
||||||
|
|
||||||
/* Then accumulate the final result (t starts at x126). */
|
|
||||||
t = &x126;
|
|
||||||
for (i = 0; i < 3; i++) {
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u5); /* 101 */
|
|
||||||
for (i = 0; i < 4; i++) { /* 0 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &x3); /* 111 */
|
|
||||||
for (i = 0; i < 4; i++) { /* 0 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u5); /* 101 */
|
|
||||||
for (i = 0; i < 5; i++) { /* 0 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u11); /* 1011 */
|
|
||||||
for (i = 0; i < 4; i++) {
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u11); /* 1011 */
|
|
||||||
for (i = 0; i < 4; i++) { /* 0 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &x3); /* 111 */
|
|
||||||
for (i = 0; i < 5; i++) { /* 00 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &x3); /* 111 */
|
|
||||||
for (i = 0; i < 6; i++) { /* 00 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u13); /* 1101 */
|
|
||||||
for (i = 0; i < 4; i++) { /* 0 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u5); /* 101 */
|
|
||||||
for (i = 0; i < 3; i++) {
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &x3); /* 111 */
|
|
||||||
for (i = 0; i < 5; i++) { /* 0 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u9); /* 1001 */
|
|
||||||
for (i = 0; i < 6; i++) { /* 000 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u5); /* 101 */
|
|
||||||
for (i = 0; i < 10; i++) { /* 0000000 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &x3); /* 111 */
|
|
||||||
for (i = 0; i < 4; i++) { /* 0 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &x3); /* 111 */
|
|
||||||
for (i = 0; i < 9; i++) { /* 0 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &x8); /* 11111111 */
|
|
||||||
for (i = 0; i < 5; i++) { /* 0 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u9); /* 1001 */
|
|
||||||
for (i = 0; i < 6; i++) { /* 00 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u11); /* 1011 */
|
|
||||||
for (i = 0; i < 4; i++) {
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u13); /* 1101 */
|
|
||||||
for (i = 0; i < 5; i++) {
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &x2); /* 11 */
|
|
||||||
for (i = 0; i < 6; i++) { /* 00 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u13); /* 1101 */
|
|
||||||
for (i = 0; i < 10; i++) { /* 000000 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u13); /* 1101 */
|
|
||||||
for (i = 0; i < 4; i++) {
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, &u9); /* 1001 */
|
|
||||||
for (i = 0; i < 6; i++) { /* 00000 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(t, t, x); /* 1 */
|
|
||||||
for (i = 0; i < 8; i++) { /* 00 */
|
|
||||||
secp256k1_scalar_sqr(t, t);
|
|
||||||
}
|
|
||||||
secp256k1_scalar_mul(r, t, &x6); /* 111111 */
|
|
||||||
}
|
|
||||||
|
|
||||||
SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) {
|
|
||||||
return !(a->d[0] & 1);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) {
|
|
||||||
#if defined(USE_SCALAR_INV_BUILTIN)
|
|
||||||
secp256k1_scalar_inverse(r, x);
|
|
||||||
#elif defined(USE_SCALAR_INV_NUM)
|
|
||||||
unsigned char b[32];
|
|
||||||
secp256k1_num n, m;
|
|
||||||
secp256k1_scalar t = *x;
|
|
||||||
secp256k1_scalar_get_b32(b, &t);
|
|
||||||
secp256k1_num_set_bin(&n, b, 32);
|
|
||||||
secp256k1_scalar_order_get_num(&m);
|
|
||||||
secp256k1_num_mod_inverse(&n, &n, &m);
|
|
||||||
secp256k1_num_get_bin(b, 32, &n);
|
|
||||||
secp256k1_scalar_set_b32(r, b, NULL);
|
|
||||||
/* Verify that the inverse was computed correctly, without GMP code. */
|
|
||||||
secp256k1_scalar_mul(&t, &t, r);
|
|
||||||
CHECK(secp256k1_scalar_is_one(&t));
|
|
||||||
#else
|
|
||||||
#error "Please select scalar inverse implementation"
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/* These parameters are generated using sage/gen_exhaustive_groups.sage. */
|
/* These parameters are generated using sage/gen_exhaustive_groups.sage. */
|
||||||
#if defined(EXHAUSTIVE_TEST_ORDER)
|
#if defined(EXHAUSTIVE_TEST_ORDER)
|
||||||
# if EXHAUSTIVE_TEST_ORDER == 13
|
# if EXHAUSTIVE_TEST_ORDER == 13
|
||||||
|
|
|
@ -104,10 +104,6 @@ static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) {
|
|
||||||
*r = (*a * *a) % EXHAUSTIVE_TEST_ORDER;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *a) {
|
static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *a) {
|
||||||
*r1 = *a;
|
*r1 = *a;
|
||||||
*r2 = 0;
|
*r2 = 0;
|
||||||
|
@ -125,4 +121,19 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se
|
||||||
*r = (*r & mask0) | (*a & mask1);
|
*r = (*r & mask0) | (*a & mask1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) {
|
||||||
|
int i;
|
||||||
|
*r = 0;
|
||||||
|
for (i = 0; i < EXHAUSTIVE_TEST_ORDER; i++)
|
||||||
|
if ((i * *x) % EXHAUSTIVE_TEST_ORDER == 1)
|
||||||
|
*r = i;
|
||||||
|
/* If this VERIFY_CHECK triggers we were given a noninvertible scalar (and thus
|
||||||
|
* have a composite group order; fix it in exhaustive_tests.c). */
|
||||||
|
VERIFY_CHECK(*r != 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) {
|
||||||
|
secp256k1_scalar_inverse(r, x);
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* SECP256K1_SCALAR_REPR_IMPL_H */
|
#endif /* SECP256K1_SCALAR_REPR_IMPL_H */
|
||||||
|
|
|
@ -4,12 +4,13 @@
|
||||||
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
||||||
***********************************************************************/
|
***********************************************************************/
|
||||||
|
|
||||||
#include "include/secp256k1.h"
|
#define SECP256K1_BUILD
|
||||||
#include "include/secp256k1_preallocated.h"
|
|
||||||
|
#include "../include/secp256k1.h"
|
||||||
|
#include "../include/secp256k1_preallocated.h"
|
||||||
|
|
||||||
#include "assumptions.h"
|
#include "assumptions.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
#include "num_impl.h"
|
|
||||||
#include "field_impl.h"
|
#include "field_impl.h"
|
||||||
#include "scalar_impl.h"
|
#include "scalar_impl.h"
|
||||||
#include "group_impl.h"
|
#include "group_impl.h"
|
||||||
|
@ -22,6 +23,10 @@
|
||||||
#include "scratch_impl.h"
|
#include "scratch_impl.h"
|
||||||
#include "selftest.h"
|
#include "selftest.h"
|
||||||
|
|
||||||
|
#ifdef SECP256K1_NO_BUILD
|
||||||
|
# error "secp256k1.h processed without SECP256K1_BUILD defined while building secp256k1.c"
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(VALGRIND)
|
#if defined(VALGRIND)
|
||||||
# include <valgrind/memcheck.h>
|
# include <valgrind/memcheck.h>
|
||||||
#endif
|
#endif
|
||||||
|
@ -317,6 +322,32 @@ int secp256k1_ec_pubkey_serialize(const secp256k1_context* ctx, unsigned char *o
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int secp256k1_ec_pubkey_cmp(const secp256k1_context* ctx, const secp256k1_pubkey* pubkey0, const secp256k1_pubkey* pubkey1) {
|
||||||
|
unsigned char out[2][33];
|
||||||
|
const secp256k1_pubkey* pk[2];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
VERIFY_CHECK(ctx != NULL);
|
||||||
|
pk[0] = pubkey0; pk[1] = pubkey1;
|
||||||
|
for (i = 0; i < 2; i++) {
|
||||||
|
size_t out_size = sizeof(out[i]);
|
||||||
|
/* If the public key is NULL or invalid, ec_pubkey_serialize will call
|
||||||
|
* the illegal_callback and return 0. In that case we will serialize the
|
||||||
|
* key as all zeros which is less than any valid public key. This
|
||||||
|
* results in consistent comparisons even if NULL or invalid pubkeys are
|
||||||
|
* involved and prevents edge cases such as sorting algorithms that use
|
||||||
|
* this function and do not terminate as a result. */
|
||||||
|
if (!secp256k1_ec_pubkey_serialize(ctx, out[i], &out_size, pk[i], SECP256K1_EC_COMPRESSED)) {
|
||||||
|
/* Note that ec_pubkey_serialize should already set the output to
|
||||||
|
* zero in that case, but it's not guaranteed by the API, we can't
|
||||||
|
* test it and writing a VERIFY_CHECK is more complex than
|
||||||
|
* explicitly memsetting (again). */
|
||||||
|
memset(out[i], 0, sizeof(out[i]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return secp256k1_memcmp_var(out[0], out[1], sizeof(out[0]));
|
||||||
|
}
|
||||||
|
|
||||||
static void secp256k1_ecdsa_signature_load(const secp256k1_context* ctx, secp256k1_scalar* r, secp256k1_scalar* s, const secp256k1_ecdsa_signature* sig) {
|
static void secp256k1_ecdsa_signature_load(const secp256k1_context* ctx, secp256k1_scalar* r, secp256k1_scalar* s, const secp256k1_ecdsa_signature* sig) {
|
||||||
(void)ctx;
|
(void)ctx;
|
||||||
if (sizeof(secp256k1_scalar) == 32) {
|
if (sizeof(secp256k1_scalar) == 32) {
|
||||||
|
|
|
@ -127,7 +127,7 @@ static void secp256k1_testrand_init(const char* hexseed) {
|
||||||
pos++;
|
pos++;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
FILE *frand = fopen("/dev/urandom", "r");
|
FILE *frand = fopen("/dev/urandom", "rb");
|
||||||
if ((frand == NULL) || fread(&seed16, 1, sizeof(seed16), frand) != sizeof(seed16)) {
|
if ((frand == NULL) || fread(&seed16, 1, sizeof(seed16), frand) != sizeof(seed16)) {
|
||||||
uint64_t t = time(NULL) * (uint64_t)1337;
|
uint64_t t = time(NULL) * (uint64_t)1337;
|
||||||
fprintf(stderr, "WARNING: could not read 16 bytes from /dev/urandom; falling back to insecure PRNG\n");
|
fprintf(stderr, "WARNING: could not read 16 bytes from /dev/urandom; falling back to insecure PRNG\n");
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -10,7 +10,6 @@
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
|
||||||
#undef USE_ECMULT_STATIC_PRECOMPUTATION
|
#undef USE_ECMULT_STATIC_PRECOMPUTATION
|
||||||
|
@ -20,10 +19,10 @@
|
||||||
#define EXHAUSTIVE_TEST_ORDER 13
|
#define EXHAUSTIVE_TEST_ORDER 13
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "include/secp256k1.h"
|
#include "secp256k1.c"
|
||||||
|
#include "../include/secp256k1.h"
|
||||||
#include "assumptions.h"
|
#include "assumptions.h"
|
||||||
#include "group.h"
|
#include "group.h"
|
||||||
#include "secp256k1.c"
|
|
||||||
#include "testrand_impl.h"
|
#include "testrand_impl.h"
|
||||||
|
|
||||||
static int count = 2;
|
static int count = 2;
|
||||||
|
|
|
@ -113,7 +113,7 @@ static SECP256K1_INLINE void *checked_realloc(const secp256k1_callback* cb, void
|
||||||
#define ALIGNMENT 16
|
#define ALIGNMENT 16
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define ROUND_TO_ALIGN(size) (((size + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT)
|
#define ROUND_TO_ALIGN(size) ((((size) + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT)
|
||||||
|
|
||||||
/* Assume there is a contiguous memory object with bounds [base, base + max_size)
|
/* Assume there is a contiguous memory object with bounds [base, base + max_size)
|
||||||
* of which the memory range [base, *prealloc_ptr) is already allocated for usage,
|
* of which the memory range [base, *prealloc_ptr) is already allocated for usage,
|
||||||
|
@ -276,4 +276,69 @@ SECP256K1_GNUC_EXT typedef __int128 int128_t;
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef __has_builtin
|
||||||
|
#define __has_builtin(x) 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Determine the number of trailing zero bits in a (non-zero) 32-bit x.
|
||||||
|
* This function is only intended to be used as fallback for
|
||||||
|
* secp256k1_ctz32_var, but permits it to be tested separately. */
|
||||||
|
static SECP256K1_INLINE int secp256k1_ctz32_var_debruijn(uint32_t x) {
|
||||||
|
static const uint8_t debruijn[32] = {
|
||||||
|
0x00, 0x01, 0x02, 0x18, 0x03, 0x13, 0x06, 0x19, 0x16, 0x04, 0x14, 0x0A,
|
||||||
|
0x10, 0x07, 0x0C, 0x1A, 0x1F, 0x17, 0x12, 0x05, 0x15, 0x09, 0x0F, 0x0B,
|
||||||
|
0x1E, 0x11, 0x08, 0x0E, 0x1D, 0x0D, 0x1C, 0x1B
|
||||||
|
};
|
||||||
|
return debruijn[((x & -x) * 0x04D7651F) >> 27];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Determine the number of trailing zero bits in a (non-zero) 64-bit x.
|
||||||
|
* This function is only intended to be used as fallback for
|
||||||
|
* secp256k1_ctz64_var, but permits it to be tested separately. */
|
||||||
|
static SECP256K1_INLINE int secp256k1_ctz64_var_debruijn(uint64_t x) {
|
||||||
|
static const uint8_t debruijn[64] = {
|
||||||
|
0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28,
|
||||||
|
62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11,
|
||||||
|
63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
|
||||||
|
51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12
|
||||||
|
};
|
||||||
|
return debruijn[((x & -x) * 0x022FDD63CC95386D) >> 58];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Determine the number of trailing zero bits in a (non-zero) 32-bit x. */
|
||||||
|
static SECP256K1_INLINE int secp256k1_ctz32_var(uint32_t x) {
|
||||||
|
VERIFY_CHECK(x != 0);
|
||||||
|
#if (__has_builtin(__builtin_ctz) || SECP256K1_GNUC_PREREQ(3,4))
|
||||||
|
/* If the unsigned type is sufficient to represent the largest uint32_t, consider __builtin_ctz. */
|
||||||
|
if (((unsigned)UINT32_MAX) == UINT32_MAX) {
|
||||||
|
return __builtin_ctz(x);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if (__has_builtin(__builtin_ctzl) || SECP256K1_GNUC_PREREQ(3,4))
|
||||||
|
/* Otherwise consider __builtin_ctzl (the unsigned long type is always at least 32 bits). */
|
||||||
|
return __builtin_ctzl(x);
|
||||||
|
#else
|
||||||
|
/* If no suitable CTZ builtin is available, use a (variable time) software emulation. */
|
||||||
|
return secp256k1_ctz32_var_debruijn(x);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Determine the number of trailing zero bits in a (non-zero) 64-bit x. */
|
||||||
|
static SECP256K1_INLINE int secp256k1_ctz64_var(uint64_t x) {
|
||||||
|
VERIFY_CHECK(x != 0);
|
||||||
|
#if (__has_builtin(__builtin_ctzl) || SECP256K1_GNUC_PREREQ(3,4))
|
||||||
|
/* If the unsigned long type is sufficient to represent the largest uint64_t, consider __builtin_ctzl. */
|
||||||
|
if (((unsigned long)UINT64_MAX) == UINT64_MAX) {
|
||||||
|
return __builtin_ctzl(x);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if (__has_builtin(__builtin_ctzll) || SECP256K1_GNUC_PREREQ(3,4))
|
||||||
|
/* Otherwise consider __builtin_ctzll (the unsigned long long type is always at least 64 bits). */
|
||||||
|
return __builtin_ctzll(x);
|
||||||
|
#else
|
||||||
|
/* If no suitable CTZ builtin is available, use a (variable time) software emulation. */
|
||||||
|
return secp256k1_ctz64_var_debruijn(x);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* SECP256K1_UTIL_H */
|
#endif /* SECP256K1_UTIL_H */
|
||||||
|
|
|
@ -5,28 +5,64 @@
|
||||||
***********************************************************************/
|
***********************************************************************/
|
||||||
|
|
||||||
#include <valgrind/memcheck.h>
|
#include <valgrind/memcheck.h>
|
||||||
#include "include/secp256k1.h"
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "../include/secp256k1.h"
|
||||||
#include "assumptions.h"
|
#include "assumptions.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
|
|
||||||
#ifdef ENABLE_MODULE_ECDH
|
#ifdef ENABLE_MODULE_ECDH
|
||||||
# include "include/secp256k1_ecdh.h"
|
# include "../include/secp256k1_ecdh.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef ENABLE_MODULE_RECOVERY
|
#ifdef ENABLE_MODULE_RECOVERY
|
||||||
# include "include/secp256k1_recovery.h"
|
# include "../include/secp256k1_recovery.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef ENABLE_MODULE_EXTRAKEYS
|
#ifdef ENABLE_MODULE_EXTRAKEYS
|
||||||
# include "include/secp256k1_extrakeys.h"
|
# include "../include/secp256k1_extrakeys.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef ENABLE_MODULE_SCHNORRSIG
|
#ifdef ENABLE_MODULE_SCHNORRSIG
|
||||||
#include "include/secp256k1_schnorrsig.h"
|
#include "../include/secp256k1_schnorrsig.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
void run_tests(secp256k1_context *ctx, unsigned char *key);
|
||||||
|
|
||||||
int main(void) {
|
int main(void) {
|
||||||
secp256k1_context* ctx;
|
secp256k1_context* ctx;
|
||||||
|
unsigned char key[32];
|
||||||
|
int ret, i;
|
||||||
|
|
||||||
|
if (!RUNNING_ON_VALGRIND) {
|
||||||
|
fprintf(stderr, "This test can only usefully be run inside valgrind.\n");
|
||||||
|
fprintf(stderr, "Usage: libtool --mode=execute valgrind ./valgrind_ctime_test\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
ctx = secp256k1_context_create(SECP256K1_CONTEXT_SIGN
|
||||||
|
| SECP256K1_CONTEXT_VERIFY
|
||||||
|
| SECP256K1_CONTEXT_DECLASSIFY);
|
||||||
|
/** In theory, testing with a single secret input should be sufficient:
|
||||||
|
* If control flow depended on secrets the tool would generate an error.
|
||||||
|
*/
|
||||||
|
for (i = 0; i < 32; i++) {
|
||||||
|
key[i] = i + 65;
|
||||||
|
}
|
||||||
|
|
||||||
|
run_tests(ctx, key);
|
||||||
|
|
||||||
|
/* Test context randomisation. Do this last because it leaves the context
|
||||||
|
* tainted. */
|
||||||
|
VALGRIND_MAKE_MEM_UNDEFINED(key, 32);
|
||||||
|
ret = secp256k1_context_randomize(ctx, key);
|
||||||
|
VALGRIND_MAKE_MEM_DEFINED(&ret, sizeof(ret));
|
||||||
|
CHECK(ret);
|
||||||
|
|
||||||
|
secp256k1_context_destroy(ctx);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void run_tests(secp256k1_context *ctx, unsigned char *key) {
|
||||||
secp256k1_ecdsa_signature signature;
|
secp256k1_ecdsa_signature signature;
|
||||||
secp256k1_pubkey pubkey;
|
secp256k1_pubkey pubkey;
|
||||||
size_t siglen = 74;
|
size_t siglen = 74;
|
||||||
|
@ -34,7 +70,6 @@ int main(void) {
|
||||||
int i;
|
int i;
|
||||||
int ret;
|
int ret;
|
||||||
unsigned char msg[32];
|
unsigned char msg[32];
|
||||||
unsigned char key[32];
|
|
||||||
unsigned char sig[74];
|
unsigned char sig[74];
|
||||||
unsigned char spubkey[33];
|
unsigned char spubkey[33];
|
||||||
#ifdef ENABLE_MODULE_RECOVERY
|
#ifdef ENABLE_MODULE_RECOVERY
|
||||||
|
@ -45,26 +80,10 @@ int main(void) {
|
||||||
secp256k1_keypair keypair;
|
secp256k1_keypair keypair;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!RUNNING_ON_VALGRIND) {
|
|
||||||
fprintf(stderr, "This test can only usefully be run inside valgrind.\n");
|
|
||||||
fprintf(stderr, "Usage: libtool --mode=execute valgrind ./valgrind_ctime_test\n");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** In theory, testing with a single secret input should be sufficient:
|
|
||||||
* If control flow depended on secrets the tool would generate an error.
|
|
||||||
*/
|
|
||||||
for (i = 0; i < 32; i++) {
|
|
||||||
key[i] = i + 65;
|
|
||||||
}
|
|
||||||
for (i = 0; i < 32; i++) {
|
for (i = 0; i < 32; i++) {
|
||||||
msg[i] = i + 1;
|
msg[i] = i + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx = secp256k1_context_create(SECP256K1_CONTEXT_SIGN
|
|
||||||
| SECP256K1_CONTEXT_VERIFY
|
|
||||||
| SECP256K1_CONTEXT_DECLASSIFY);
|
|
||||||
|
|
||||||
/* Test keygen. */
|
/* Test keygen. */
|
||||||
VALGRIND_MAKE_MEM_UNDEFINED(key, 32);
|
VALGRIND_MAKE_MEM_UNDEFINED(key, 32);
|
||||||
ret = secp256k1_ec_pubkey_create(ctx, &pubkey, key);
|
ret = secp256k1_ec_pubkey_create(ctx, &pubkey, key);
|
||||||
|
@ -122,12 +141,6 @@ int main(void) {
|
||||||
VALGRIND_MAKE_MEM_DEFINED(&ret, sizeof(ret));
|
VALGRIND_MAKE_MEM_DEFINED(&ret, sizeof(ret));
|
||||||
CHECK(ret == 1);
|
CHECK(ret == 1);
|
||||||
|
|
||||||
/* Test context randomisation. Do this last because it leaves the context tainted. */
|
|
||||||
VALGRIND_MAKE_MEM_UNDEFINED(key, 32);
|
|
||||||
ret = secp256k1_context_randomize(ctx, key);
|
|
||||||
VALGRIND_MAKE_MEM_DEFINED(&ret, sizeof(ret));
|
|
||||||
CHECK(ret);
|
|
||||||
|
|
||||||
/* Test keypair_create and keypair_xonly_tweak_add. */
|
/* Test keypair_create and keypair_xonly_tweak_add. */
|
||||||
#ifdef ENABLE_MODULE_EXTRAKEYS
|
#ifdef ENABLE_MODULE_EXTRAKEYS
|
||||||
VALGRIND_MAKE_MEM_UNDEFINED(key, 32);
|
VALGRIND_MAKE_MEM_UNDEFINED(key, 32);
|
||||||
|
@ -157,7 +170,4 @@ int main(void) {
|
||||||
VALGRIND_MAKE_MEM_DEFINED(&ret, sizeof(ret));
|
VALGRIND_MAKE_MEM_DEFINED(&ret, sizeof(ret));
|
||||||
CHECK(ret == 1);
|
CHECK(ret == 1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
secp256k1_context_destroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue