diff --git a/README.third_party.md b/README.third_party.md index e388103cf16..5440c873e0a 100644 --- a/README.third_party.md +++ b/README.third_party.md @@ -47,7 +47,7 @@ a notice will be included in | [jbeder/yaml-cpp] | MIT | 0.6.3 | | ✗ | | [JSON-Schema-Test-Suite] | Unknown License | Unknown | | | | [libmongocrypt] | Apache-2.0 | 1.14.0 | ✗ | ✗ | -| [librdkafka - the Apache Kafka C/C++ client library] | BSD-3-Clause, Xmlproc License, ISC, MIT, Public Domain, Zlib, BSD-2-Clause, Andreas Stolcke License | 2.0.2 | | ✗ | +| [librdkafka - the Apache Kafka C/C++ client library] | BSD-3-Clause, Xmlproc License, ISC, MIT, Public Domain, Zlib, BSD-2-Clause, Andreas Stolcke License | 2.11.0 | | ✗ | | [LibTomCrypt] | WTFPL, Public Domain | 1.18.2 | ✗ | ✗ | | [libunwind/libunwind] | MIT | v1.8.1 | | ✗ | | [linenoise] | BSD-2-Clause | Unknown | | ✗ | diff --git a/sbom.json b/sbom.json index f66a2d21fd7..6b6d44438f7 100644 --- a/sbom.json +++ b/sbom.json @@ -1023,7 +1023,7 @@ "name": "Organization: github" }, "name": "librdkafka - the Apache Kafka C/C++ client library", - "version": "2.0.2", + "version": "2.11.0", "licenses": [ { "license": { @@ -1066,7 +1066,7 @@ } } ], - "purl": "pkg:github/edenhill/librdkafka@v2.0.2", + "purl": "pkg:github/edenhill/librdkafka@v2.11.0", "properties": [ { "name": "internal:team_responsible", diff --git a/src/third_party/librdkafka/BUILD.bazel b/src/third_party/librdkafka/BUILD.bazel index ba16c0febbf..cbfc3f8dd91 100644 --- a/src/third_party/librdkafka/BUILD.bazel +++ b/src/third_party/librdkafka/BUILD.bazel @@ -40,8 +40,15 @@ mongo_cc_library( "dist/src/lz4.c", "dist/src/lz4frame.c", "dist/src/lz4hc.c", + "dist/src/nanopb/pb_common.c", + "dist/src/nanopb/pb_decode.c", + "dist/src/nanopb/pb_encode.c", + "dist/src/opentelemetry/common.pb.c", + "dist/src/opentelemetry/metrics.pb.c", + "dist/src/opentelemetry/resource.pb.c", "dist/src/rdaddr.c", "dist/src/rdavl.c", + "dist/src/rdbase64.c", "dist/src/rdbuf.c", "dist/src/rdcrc32.c", "dist/src/rddl.c", @@ -95,6 +102,9 @@ mongo_cc_library( "dist/src/rdkafka_ssl.c", "dist/src/rdkafka_sticky_assignor.c", "dist/src/rdkafka_subscription.c", + "dist/src/rdkafka_telemetry.c", + "dist/src/rdkafka_telemetry_decode.c", + "dist/src/rdkafka_telemetry_encode.c", "dist/src/rdkafka_timer.c", "dist/src/rdkafka_topic.c", "dist/src/rdkafka_transport.c", @@ -127,6 +137,7 @@ mongo_cc_library( copts = [ "-Wno-array-bounds", "-Wno-unused-variable", + "-Wno-enum-conversion", "-Wno-implicit-fallthrough", "-Wno-unused-but-set-variable", "-I$(GENDIR)/src/third_party/librdkafka/dist/FAKE", diff --git a/src/third_party/librdkafka/dist/LICENSE b/src/third_party/librdkafka/dist/LICENSE index 193ffaae283..660e3cfb005 100644 --- a/src/third_party/librdkafka/dist/LICENSE +++ b/src/third_party/librdkafka/dist/LICENSE @@ -1,6 +1,7 @@ librdkafka - Apache Kafka C driver library -Copyright (c) 2012-2020, Magnus Edenhill +Copyright (c) 2012-2022, Magnus Edenhill + 2023, Confluent Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/LICENSE.lz4 b/src/third_party/librdkafka/dist/LICENSE.lz4 index f57dbc6baed..067a0d15b1d 100644 --- a/src/third_party/librdkafka/dist/LICENSE.lz4 +++ b/src/third_party/librdkafka/dist/LICENSE.lz4 @@ -1,7 +1,7 @@ -src/rdxxhash.[ch] src/lz4*.[ch]: git@github.com:lz4/lz4.git e2827775ee80d2ef985858727575df31fc60f1f3 +src/rdxxhash.[ch] src/lz4*.[ch]: git@github.com:lz4/lz4.git 5ff839680134437dbf4678f3d0c7b371d84f4964 LZ4 Library -Copyright (c) 2011-2016, Yann Collet +Copyright (c) 2011-2020, Yann Collet All rights reserved. Redistribution and use in source and binary forms, with or without modification, diff --git a/src/third_party/librdkafka/dist/LICENSE.nanopb b/src/third_party/librdkafka/dist/LICENSE.nanopb new file mode 100644 index 00000000000..497ec8cd797 --- /dev/null +++ b/src/third_party/librdkafka/dist/LICENSE.nanopb @@ -0,0 +1,22 @@ +For files in src/nanopb : https://github.com/nanopb/nanopb/blob/8ef41e0ebd45daaf19459a011f67e66224b247cd/LICENSE.txt + +Copyright (c) 2011 Petteri Aimonen + +This software is provided 'as-is', without any express or +implied warranty. In no event will the authors be held liable +for any damages arising from the use of this software. + +Permission is granted to anyone to use this software for any +purpose, including commercial applications, and to alter it and +redistribute it freely, subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you + must not claim that you wrote the original software. If you use + this software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + +2. Altered source versions must be plainly marked as such, and + must not be misrepresented as being the original software. + +3. This notice may not be removed or altered from any source + distribution. diff --git a/src/third_party/librdkafka/dist/LICENSE.opentelemetry b/src/third_party/librdkafka/dist/LICENSE.opentelemetry new file mode 100644 index 00000000000..819ea6a0eb7 --- /dev/null +++ b/src/third_party/librdkafka/dist/LICENSE.opentelemetry @@ -0,0 +1,203 @@ +For files in src/opentelemetry: https://github.com/open-telemetry/opentelemetry-proto/blob/81a296f9dba23e32d77f46d58c8ea4244a2157a6/LICENSE + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/src/third_party/librdkafka/dist/LICENSES.txt b/src/third_party/librdkafka/dist/LICENSES.txt index 1ab8a1dd4d7..1621ba0996d 100644 --- a/src/third_party/librdkafka/dist/LICENSES.txt +++ b/src/third_party/librdkafka/dist/LICENSES.txt @@ -2,7 +2,8 @@ LICENSE -------------------------------------------------------------- librdkafka - Apache Kafka C driver library -Copyright (c) 2012-2020, Magnus Edenhill +Copyright (c) 2012-2022, Magnus Edenhill + 2023, Confluent Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -140,10 +141,10 @@ THE SOFTWARE LICENSE.lz4 -------------------------------------------------------------- -src/rdxxhash.[ch] src/lz4*.[ch]: git@github.com:lz4/lz4.git e2827775ee80d2ef985858727575df31fc60f1f3 +src/rdxxhash.[ch] src/lz4*.[ch]: git@github.com:lz4/lz4.git 5ff839680134437dbf4678f3d0c7b371d84f4964 LZ4 Library -Copyright (c) 2011-2016, Yann Collet +Copyright (c) 2011-2020, Yann Collet All rights reserved. Redistribution and use in source and binary forms, with or without modification, @@ -197,6 +198,238 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +LICENSE.nanopb +-------------------------------------------------------------- +For files in src/nanopb : https://github.com/nanopb/nanopb/blob/8ef41e0ebd45daaf19459a011f67e66224b247cd/LICENSE.txt + +Copyright (c) 2011 Petteri Aimonen + +This software is provided 'as-is', without any express or +implied warranty. In no event will the authors be held liable +for any damages arising from the use of this software. + +Permission is granted to anyone to use this software for any +purpose, including commercial applications, and to alter it and +redistribute it freely, subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you + must not claim that you wrote the original software. If you use + this software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + +2. Altered source versions must be plainly marked as such, and + must not be misrepresented as being the original software. + +3. This notice may not be removed or altered from any source + distribution. + + +LICENSE.opentelemetry +-------------------------------------------------------------- +For files in src/opentelemetry: https://github.com/open-telemetry/opentelemetry-proto/blob/81a296f9dba23e32d77f46d58c8ea4244a2157a6/LICENSE + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ LICENSE.pycrc -------------------------------------------------------------- The following license applies to the files rdcrc32.c and rdcrc32.h which diff --git a/src/third_party/librdkafka/dist/platform/linux_aarch64/include/config.h b/src/third_party/librdkafka/dist/platform/linux_aarch64/include/config.h index 8864c504c50..c7cc74084fe 100644 --- a/src/third_party/librdkafka/dist/platform/linux_aarch64/include/config.h +++ b/src/third_party/librdkafka/dist/platform/linux_aarch64/include/config.h @@ -26,8 +26,6 @@ #define WITH_GCC 1 // gxx #define WITH_GXX 1 -// pkgconfig -#define WITH_PKGCONFIG 1 // install #define WITH_INSTALL 1 // gnuar @@ -51,21 +49,21 @@ // atomic_64 #define ATOMIC_OP(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST) // parseversion -#define RDKAFKA_VERSION_STR "2.0.2" +#define RDKAFKA_VERSION_STR "2.11.0" // parseversion -#define MKL_APP_VERSION "2.0.2" +#define MKL_APP_VERSION "2.11.0" +// c11threads +#define WITH_C11THREADS 1 // libdl #define WITH_LIBDL 1 // WITH_PLUGINS #define WITH_PLUGINS 1 -// zlib -#define WITH_ZLIB 1 // libssl #define WITH_SSL 1 +// libcrypto +#define OPENSSL_SUPPRESS_DEPRECATED "OPENSSL_SUPPRESS_DEPRECATED" // libsasl2 #define WITH_SASL_CYRUS 1 -// libzstd -#define WITH_ZSTD 1 // libcurl #define WITH_CURL 1 // WITH_HDRHISTOGRAM @@ -99,5 +97,5 @@ // getrusage #define HAVE_GETRUSAGE 1 // BUILT_WITH -#define BUILT_WITH "GCC GXX PKGCONFIG INSTALL GNULD LDS C11THREADS LIBDL PLUGINS ZLIB SSL SASL_CYRUS ZSTD CURL HDRHISTOGRAM SYSLOG SNAPPY SOCKEM SASL_SCRAM SASL_OAUTHBEARER OAUTHBEARER_OIDC" +#define BUILT_WITH "GCC GXX INSTALL GNULD LDS C11THREADS LIBDL PLUGINS SSL SASL_CYRUS CURL HDRHISTOGRAM SYSLOG SNAPPY SOCKEM SASL_SCRAM SASL_OAUTHBEARER OAUTHBEARER_OIDC" #endif /* _CONFIG_H_ */ diff --git a/src/third_party/librdkafka/dist/platform/linux_x86_64/include/config.h b/src/third_party/librdkafka/dist/platform/linux_x86_64/include/config.h index 65f1fc27db0..67c3f2e86dd 100644 --- a/src/third_party/librdkafka/dist/platform/linux_x86_64/include/config.h +++ b/src/third_party/librdkafka/dist/platform/linux_x86_64/include/config.h @@ -51,9 +51,11 @@ // atomic_64 #define ATOMIC_OP(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST) // parseversion -#define RDKAFKA_VERSION_STR "2.0.2" +#define RDKAFKA_VERSION_STR "2.11.0" // parseversion -#define MKL_APP_VERSION "2.0.2" +#define MKL_APP_VERSION "2.11.0" +// c11threads +#define WITH_C11THREADS 1 // libdl #define WITH_LIBDL 1 // WITH_PLUGINS @@ -62,6 +64,8 @@ #define WITH_ZLIB 1 // libssl #define WITH_SSL 1 +// libcrypto +#define OPENSSL_SUPPRESS_DEPRECATED "OPENSSL_SUPPRESS_DEPRECATED" // libsasl2 #define WITH_SASL_CYRUS 1 // libzstd @@ -101,5 +105,5 @@ // getrusage #define HAVE_GETRUSAGE 1 // BUILT_WITH -#define BUILT_WITH "GCC GXX PKGCONFIG INSTALL GNULD LDS LIBDL PLUGINS ZLIB SSL SASL_CYRUS ZSTD CURL HDRHISTOGRAM SYSLOG SNAPPY SOCKEM SASL_SCRAM SASL_OAUTHBEARER OAUTHBEARER_OIDC CRC32C_HW" +#define BUILT_WITH "GCC GXX PKGCONFIG INSTALL GNULD LDS C11THREADS LIBDL PLUGINS ZLIB SSL SASL_CYRUS ZSTD CURL HDRHISTOGRAM SYSLOG SNAPPY SOCKEM SASL_SCRAM SASL_OAUTHBEARER OAUTHBEARER_OIDC CRC32C_HW" #endif /* _CONFIG_H_ */ diff --git a/src/third_party/librdkafka/dist/src-cpp/ConfImpl.cpp b/src/third_party/librdkafka/dist/src-cpp/ConfImpl.cpp index 53d7b30c568..4f1f7090829 100644 --- a/src/third_party/librdkafka/dist/src-cpp/ConfImpl.cpp +++ b/src/third_party/librdkafka/dist/src-cpp/ConfImpl.cpp @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C/C++ library * - * 
Copyright (c) 2014 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src-cpp/ConsumerImpl.cpp b/src/third_party/librdkafka/dist/src-cpp/ConsumerImpl.cpp index b7f5e3b220a..a467acfb0da 100644 --- a/src/third_party/librdkafka/dist/src-cpp/ConsumerImpl.cpp +++ b/src/third_party/librdkafka/dist/src-cpp/ConsumerImpl.cpp @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2014 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src-cpp/HandleImpl.cpp b/src/third_party/librdkafka/dist/src-cpp/HandleImpl.cpp index 741fcafbc10..9e3924469aa 100644 --- a/src/third_party/librdkafka/dist/src-cpp/HandleImpl.cpp +++ b/src/third_party/librdkafka/dist/src-cpp/HandleImpl.cpp @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2014 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -130,7 +131,6 @@ int RdKafka::socket_cb_trampoline(int domain, return handle->socket_cb_->socket_cb(domain, type, protocol); } - int RdKafka::resolve_cb_trampoline(const char *node, const char *service, const struct addrinfo *hints, @@ -152,7 +152,6 @@ int RdKafka::connect_cb_trampoline(int sockfd, return handle->connect_cb_->connect_cb(sockfd, addr, addrlen, id); } - int RdKafka::open_cb_trampoline(const char *pathname, int flags, mode_t mode, @@ -426,6 +425,14 @@ rd_kafka_topic_partition_list_t *partitions_to_c_parts( rd_kafka_topic_partition_t *rktpar = rd_kafka_topic_partition_list_add( c_parts, tpi->topic_.c_str(), tpi->partition_); rktpar->offset = tpi->offset_; + if (tpi->metadata_.size()) { + void *metadata_p = mem_malloc(tpi->metadata_.size()); + memcpy(metadata_p, tpi->metadata_.data(), tpi->metadata_.size()); + rktpar->metadata = metadata_p; + rktpar->metadata_size = tpi->metadata_.size(); + } + if (tpi->leader_epoch_ != -1) + rd_kafka_topic_partition_set_leader_epoch(rktpar, tpi->leader_epoch_); } return c_parts; @@ -447,8 +454,13 @@ void update_partitions_from_c_parts( dynamic_cast<RdKafka::TopicPartitionImpl *>(partitions[j]); if (!strcmp(p->topic, pp->topic_.c_str()) && p->partition == pp->partition_) { - pp->offset_ = p->offset; - pp->err_ = static_cast<RdKafka::ErrorCode>(p->err); + pp->offset_ = p->offset; + pp->err_ = static_cast<RdKafka::ErrorCode>(p->err); + pp->leader_epoch_ = rd_kafka_topic_partition_get_leader_epoch(p); + if (p->metadata_size) { + unsigned char *metadata = (unsigned char *)p->metadata; + pp->metadata_.assign(metadata, metadata + p->metadata_size); + } } } } diff --git a/src/third_party/librdkafka/dist/src-cpp/HeadersImpl.cpp b/src/third_party/librdkafka/dist/src-cpp/HeadersImpl.cpp index b567ef36c00..2b29488dc53 100644 --- a/src/third_party/librdkafka/dist/src-cpp/HeadersImpl.cpp +++ b/src/third_party/librdkafka/dist/src-cpp/HeadersImpl.cpp @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2014 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill * All rights reserved.
* * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src-cpp/KafkaConsumerImpl.cpp b/src/third_party/librdkafka/dist/src-cpp/KafkaConsumerImpl.cpp index 6f3b81c727c..984710b214a 100644 --- a/src/third_party/librdkafka/dist/src-cpp/KafkaConsumerImpl.cpp +++ b/src/third_party/librdkafka/dist/src-cpp/KafkaConsumerImpl.cpp @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2015 Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src-cpp/MessageImpl.cpp b/src/third_party/librdkafka/dist/src-cpp/MessageImpl.cpp index c6d83150fd3..8261b1f6e1f 100644 --- a/src/third_party/librdkafka/dist/src-cpp/MessageImpl.cpp +++ b/src/third_party/librdkafka/dist/src-cpp/MessageImpl.cpp @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2014 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src-cpp/MetadataImpl.cpp b/src/third_party/librdkafka/dist/src-cpp/MetadataImpl.cpp index 62cbf9042ea..df58d4dbd70 100644 --- a/src/third_party/librdkafka/dist/src-cpp/MetadataImpl.cpp +++ b/src/third_party/librdkafka/dist/src-cpp/MetadataImpl.cpp @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2014 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src-cpp/ProducerImpl.cpp b/src/third_party/librdkafka/dist/src-cpp/ProducerImpl.cpp index 8300dfb3b60..88752156c11 100644 --- a/src/third_party/librdkafka/dist/src-cpp/ProducerImpl.cpp +++ b/src/third_party/librdkafka/dist/src-cpp/ProducerImpl.cpp @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2014 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src-cpp/QueueImpl.cpp b/src/third_party/librdkafka/dist/src-cpp/QueueImpl.cpp index 19ebce9d681..7148d72011d 100644 --- a/src/third_party/librdkafka/dist/src-cpp/QueueImpl.cpp +++ b/src/third_party/librdkafka/dist/src-cpp/QueueImpl.cpp @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2014 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src-cpp/RdKafka.cpp b/src/third_party/librdkafka/dist/src-cpp/RdKafka.cpp index b6cb33c288b..c7c41ec9846 100644 --- a/src/third_party/librdkafka/dist/src-cpp/RdKafka.cpp +++ b/src/third_party/librdkafka/dist/src-cpp/RdKafka.cpp @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2014 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src-cpp/TopicImpl.cpp b/src/third_party/librdkafka/dist/src-cpp/TopicImpl.cpp index bf9734df944..6868b5932d6 100644 --- a/src/third_party/librdkafka/dist/src-cpp/TopicImpl.cpp +++ b/src/third_party/librdkafka/dist/src-cpp/TopicImpl.cpp @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2014 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src-cpp/TopicPartitionImpl.cpp b/src/third_party/librdkafka/dist/src-cpp/TopicPartitionImpl.cpp index 90ef820bf66..d453d964257 100644 --- a/src/third_party/librdkafka/dist/src-cpp/TopicPartitionImpl.cpp +++ b/src/third_party/librdkafka/dist/src-cpp/TopicPartitionImpl.cpp @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2015 Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src-cpp/rdkafkacpp.h b/src/third_party/librdkafka/dist/src-cpp/rdkafkacpp.h index f46e8b1e4f1..9492650b540 100644 --- a/src/third_party/librdkafka/dist/src-cpp/rdkafkacpp.h +++ b/src/third_party/librdkafka/dist/src-cpp/rdkafkacpp.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2014-2022 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -120,7 +121,7 @@ namespace RdKafka { * @remark This value should only be used during compile time, * for runtime checks of version use RdKafka::version() */ -#define RD_KAFKA_VERSION 0x020002ff +#define RD_KAFKA_VERSION 0x020b00ff /** * @brief Returns the librdkafka version as integer. @@ -333,6 +334,13 @@ enum ErrorCode { ERR__NOOP = -141, /** No offset to automatically reset to */ ERR__AUTO_OFFSET_RESET = -140, + /** Partition log truncation detected */ + ERR__LOG_TRUNCATION = -139, + /** A different record in the batch was invalid + * and this message failed persisting. */ + ERR__INVALID_DIFFERENT_RECORD = -138, + /** Broker is going away but client isn't terminating */ + ERR__DESTROY_BROKER = -137, /** End internal error codes */ ERR__END = -100, @@ -550,7 +558,28 @@ enum ErrorCode { /** Unable to update finalized features due to server error */ ERR_FEATURE_UPDATE_FAILED = 96, /** Request principal deserialization failed during forwarding */ - ERR_PRINCIPAL_DESERIALIZATION_FAILURE = 97 + ERR_PRINCIPAL_DESERIALIZATION_FAILURE = 97, + /** Unknown Topic Id */ + ERR_UNKNOWN_TOPIC_ID = 100, + /** The member epoch is fenced by the group coordinator */ + ERR_FENCED_MEMBER_EPOCH = 110, + /** The instance ID is still used by another member in the + * consumer group */ + ERR_UNRELEASED_INSTANCE_ID = 111, + /** The assignor or its version range is not supported by the consumer + * group */ + ERR_UNSUPPORTED_ASSIGNOR = 112, + /** The member epoch is stale */ + ERR_STALE_MEMBER_EPOCH = 113, + /** Client sent a push telemetry request with an invalid or outdated + * subscription ID. */ + ERR_UNKNOWN_SUBSCRIPTION_ID = 117, + /** Client sent a push telemetry request larger than the maximum size + * the broker will accept. 
*/ + ERR_TELEMETRY_TOO_LARGE = 118, + /** Client metadata is stale, + * client should rebootstrap to obtain new metadata. */ + ERR_REBOOTSTRAP_REQUIRED = 129 }; @@ -2061,6 +2090,18 @@ class RD_EXPORT TopicPartition { /** @returns error code (if applicable) */ virtual ErrorCode err() const = 0; + + /** @brief Get partition leader epoch, or -1 if not known or relevant. */ + virtual int32_t get_leader_epoch() = 0; + + /** @brief Set partition leader epoch. */ + virtual void set_leader_epoch(int32_t leader_epoch) = 0; + + /** @brief Get partition metadata. */ + virtual std::vector<unsigned char> get_metadata() = 0; + + /** @brief Set partition metadata. */ + virtual void set_metadata(std::vector<unsigned char> &metadata) = 0; }; @@ -2118,6 +2159,11 @@ class RD_EXPORT Topic { * The offset will be committed (written) to the broker (or file) according * to \p auto.commit.interval.ms or next manual offset-less commit call. * + * @deprecated This API lacks support for partition leader epochs, which makes + * it at risk for unclean leader election log truncation issues. + * Use KafkaConsumer::offsets_store() or + * Message::offset_store() instead. + * * @remark \c enable.auto.offset.store must be set to \c false when using * this API. * @@ -2548,6 +2594,31 @@ class RD_EXPORT Message { /** @returns the broker id of the broker the message was produced to or * fetched from, or -1 if not known/applicable. */ virtual int32_t broker_id() const = 0; + + /** @returns the message's partition leader epoch at the time the message was + * fetched and if known, else -1. */ + virtual int32_t leader_epoch() const = 0; + + /** + * @brief Store offset +1 for the consumed message. + * + * The message offset + 1 will be committed to broker according + * to \c `auto.commit.interval.ms` or manual offset-less commit() + * + * @warning This method may only be called for partitions that are currently + * assigned. + * Non-assigned partitions will fail with ERR__STATE. + * + * @warning Avoid storing offsets after calling seek() (et.al) as + * this may later interfere with resuming a paused partition, instead + * store offsets prior to calling seek. + * + * @remark \c `enable.auto.offset.store` must be set to "false" when using + * this API. + * + * @returns NULL on success or an error object on failure. + */ + virtual Error *offset_store() = 0; }; /**@}*/ @@ -2948,6 +3019,9 @@ class RD_EXPORT KafkaConsumer : public virtual Handle { * @remark \c enable.auto.offset.store must be set to \c false when using * this API. * + * @remark The leader epoch, if set, will be used to fence outdated partition + * leaders. See TopicPartition::set_leader_epoch(). + * * @returns RdKafka::ERR_NO_ERROR on success, or * RdKafka::ERR___UNKNOWN_PARTITION if none of the offsets could * be stored, or
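The rdkafkacpp.h hunks above introduce per-message leader epochs, a `Message::offset_store()` replacement for the now-deprecated `Topic::offset_store()`, and new error codes such as `ERR__LOG_TRUNCATION`. A minimal consumer-loop sketch of how these fit together — not part of this patch, assuming an already-constructed `RdKafka::KafkaConsumer` configured with `enable.auto.offset.store=false` and the conventional `librdkafka/rdkafkacpp.h` include path; the helper name `poll_once` is hypothetical:

```cpp
#include <iostream>
#include <librdkafka/rdkafkacpp.h>

/* Poll one message and store its position with the epoch-aware API. */
void poll_once(RdKafka::KafkaConsumer *consumer) {
  RdKafka::Message *msg = consumer->consume(1000 /* timeout_ms */);
  switch (msg->err()) {
  case RdKafka::ERR_NO_ERROR: {
    /* leader_epoch() is new in this upgrade; -1 means "not known". */
    std::cout << "offset " << msg->offset() << ", leader epoch "
              << msg->leader_epoch() << "\n";
    /* Epoch-aware replacement for the deprecated Topic::offset_store();
     * requires enable.auto.offset.store=false. */
    RdKafka::Error *err = msg->offset_store();
    if (err) {
      std::cerr << "offset_store: " << err->str() << "\n";
      delete err;
    }
    break;
  }
  case RdKafka::ERR__LOG_TRUNCATION:
    /* New error code: truncation detected via the stored leader epoch. */
    std::cerr << "partition log truncated: " << msg->errstr() << "\n";
    break;
  default:
    break;
  }
  delete msg;
}
```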
diff --git a/src/third_party/librdkafka/dist/src-cpp/rdkafkacpp_int.h b/src/third_party/librdkafka/dist/src-cpp/rdkafkacpp_int.h index d7db13e2577..f3b48059a0f 100644 --- a/src/third_party/librdkafka/dist/src-cpp/rdkafkacpp_int.h +++ b/src/third_party/librdkafka/dist/src-cpp/rdkafkacpp_int.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C/C++ library * - * Copyright (c) 2014 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -550,6 +551,21 @@ class MessageImpl : public Message { return rd_kafka_message_broker_id(rkmessage_); } + int32_t leader_epoch() const { + return rd_kafka_message_leader_epoch(rkmessage_); + } + + + Error *offset_store() { + rd_kafka_error_t *c_error; + + c_error = rd_kafka_offset_store_message(rkmessage_); + + if (c_error) + return new ErrorImpl(c_error); + else + return NULL; + } RdKafka::Topic *topic_; rd_kafka_message_t *rkmessage_; @@ -759,7 +775,6 @@ class ConfImpl : public Conf { return Conf::CONF_OK; } - Conf::ConfResult set(const std::string &name, OpenCb *open_cb, std::string &errstr) { @@ -926,6 +941,20 @@ class ConfImpl : public Conf { return Conf::CONF_OK; } + Conf::ConfResult get(ResolveCb *&resolve_cb) const { + if (!rk_conf_) + return Conf::CONF_INVALID; + resolve_cb = this->resolve_cb_; + return Conf::CONF_OK; + } + + Conf::ConfResult get(ConnectCb *&connect_cb) const { + if (!rk_conf_) + return Conf::CONF_INVALID; + connect_cb = this->connect_cb_; + return Conf::CONF_OK; + } + Conf::ConfResult get( OAuthBearerTokenRefreshCb *&oauthbearer_token_refresh_cb) const { if (!rk_conf_) @@ -962,20 +991,6 @@ class ConfImpl : public Conf { return Conf::CONF_OK; } - Conf::ConfResult get(ResolveCb *&resolve_cb) const { - if (!rk_conf_) - return Conf::CONF_INVALID; - resolve_cb = this->resolve_cb_; - return Conf::CONF_OK; - } - - Conf::ConfResult get(ConnectCb *&connect_cb) const { - if (!rk_conf_) - return Conf::CONF_INVALID; - connect_cb = this->connect_cb_; - return Conf::CONF_OK; - } - Conf::ConfResult get(OpenCb *&open_cb) const { if (!rk_conf_) return Conf::CONF_INVALID; open_cb = this->open_cb_; @@ -1294,22 +1309,28 @@ class TopicPartitionImpl : public TopicPartition { topic_(topic), partition_(partition), offset_(RdKafka::Topic::OFFSET_INVALID), - err_(ERR_NO_ERROR) { + err_(ERR_NO_ERROR), + leader_epoch_(-1) { } TopicPartitionImpl(const std::string &topic, int partition, int64_t offset) : topic_(topic), partition_(partition), offset_(offset), - err_(ERR_NO_ERROR) { + err_(ERR_NO_ERROR), + leader_epoch_(-1) { } TopicPartitionImpl(const rd_kafka_topic_partition_t *c_part) { - topic_ = std::string(c_part->topic); - partition_ = c_part->partition; - offset_ = c_part->offset; - err_ = static_cast<ErrorCode>(c_part->err); - // FIXME: metadata + topic_ = std::string(c_part->topic); + partition_ = c_part->partition; + offset_ = c_part->offset; + err_ = static_cast<ErrorCode>(c_part->err); + leader_epoch_ = rd_kafka_topic_partition_get_leader_epoch(c_part); + if (c_part->metadata_size > 0) { + unsigned char *metadata = (unsigned char *)c_part->metadata; + metadata_.assign(metadata, metadata + c_part->metadata_size); + } } static void destroy(std::vector<TopicPartition *> &partitions); @@ -1333,6 +1354,22 @@ class TopicPartitionImpl : public TopicPartition { offset_ = offset; } + int32_t get_leader_epoch() { + return leader_epoch_; + } + + void set_leader_epoch(int32_t leader_epoch) { + leader_epoch_ = leader_epoch; + } + + std::vector<unsigned char> get_metadata() { + return metadata_; + } + + void set_metadata(std::vector<unsigned char> &metadata) { + metadata_ = metadata; + } + std::ostream &operator<<(std::ostream &ostrm) const { return ostrm << topic_ << " [" << partition_ << "]"; } @@ -1341,6 +1378,8 @@ int partition_; int64_t offset_; ErrorCode err_; + int32_t leader_epoch_; + std::vector<unsigned char> metadata_; };
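`TopicPartitionImpl` above now carries `leader_epoch_` and `metadata_` through the C/C++ boundary (see the `partitions_to_c_parts()` and `update_partitions_from_c_parts()` hunks in HandleImpl.cpp earlier in this patch). A sketch of how an application might drive the new `TopicPartition` setters when storing a position by hand — the helper name, inputs, and metadata payload are hypothetical, not part of this patch:

```cpp
#include <string>
#include <vector>
#include <librdkafka/rdkafkacpp.h>

/* Store a consumed position, fencing stale leaders via the epoch. */
RdKafka::ErrorCode store_position(RdKafka::KafkaConsumer *consumer,
                                  const std::string &topic,
                                  int partition,
                                  int64_t next_offset,
                                  int32_t leader_epoch) {
  RdKafka::TopicPartition *tp =
      RdKafka::TopicPartition::create(topic, partition, next_offset);
  tp->set_leader_epoch(leader_epoch); /* used to fence outdated leaders */

  /* Optional application metadata; copied into the C partition list
   * by partitions_to_c_parts() via mem_malloc(). */
  std::vector<unsigned char> meta = {'p', 'o', 's'};
  tp->set_metadata(meta);

  std::vector<RdKafka::TopicPartition *> offsets = {tp};
  RdKafka::ErrorCode err = consumer->offsets_store(offsets);
  RdKafka::TopicPartition::destroy(offsets);
  return err;
}
```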
diff --git a/src/third_party/librdkafka/dist/src/README.lz4.md b/src/third_party/librdkafka/dist/src/README.lz4.md new file mode 100644 index 00000000000..5ed331c44bb --- /dev/null +++ b/src/third_party/librdkafka/dist/src/README.lz4.md @@ -0,0 +1,30 @@ +# Instructions for Updating KLZ4 Version + +This document describes the steps to update the bundled lz4 version, that is, +the version used when `./configure` is run with `--disable-lz4-ext`. + +1. For each file in the [lz4 repository's](https://github.com/lz4/lz4/) `lib` + directory (checked out to the appropriate version tag), copy it into the + librdkafka `src` directory, overwriting the previous files. +2. Copy `xxhash.h` and `xxhash.c` files, and rename them to `rdxxhash.h` and + `rdxxhash.c`, respectively, replacing the previous files. Change any + `#include`s of `xxhash.h` to `rdxxhash.h`. +3. Replace the `#else` block of the + `#if defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)` + with the following code, including the comment: + ```c + #else + /* NOTE: While upgrading the lz4 version, replace the original `#else` block + * in the code with this block, and retain this comment. */ + struct rdkafka_s; + extern void *rd_kafka_mem_malloc(struct rdkafka_s *rk, size_t s); + extern void *rd_kafka_mem_calloc(struct rdkafka_s *rk, size_t n, size_t s); + extern void rd_kafka_mem_free(struct rdkafka_s *rk, void *p); + # define ALLOC(s) rd_kafka_mem_malloc(NULL, s) + # define ALLOC_AND_ZERO(s) rd_kafka_mem_calloc(NULL, 1, s) + # define FREEMEM(p) rd_kafka_mem_free(NULL, p) + #endif + ``` +4. Change version mentioned for lz4 in `configure.self`. +5. Run `./configure` with `--disable-lz4-ext` option, make and run test 0017. +6. Update CHANGELOG.md and both the lz4 LICENSE and the combined LICENSE. diff --git a/src/third_party/librdkafka/dist/src/cJSON.c b/src/third_party/librdkafka/dist/src/cJSON.c deleted file mode 100644 index 9aec18469c1..00000000000 --- a/src/third_party/librdkafka/dist/src/cJSON.c +++ /dev/null @@ -1,2834 +0,0 @@ -/* - Copyright (c) 2009-2017 Dave Gamble and cJSON contributors - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -/* cJSON */ -/* JSON parser in C.
*/ - -/* disable warnings about old C89 functions in MSVC */ -#if !defined(_CRT_SECURE_NO_DEPRECATE) && defined(_MSC_VER) -#define _CRT_SECURE_NO_DEPRECATE -#endif - -#ifdef __GNUC__ -#pragma GCC visibility push(default) -#endif -#if defined(_MSC_VER) -#pragma warning(push) -/* disable warning about single line comments in system headers */ -#pragma warning(disable : 4001) -#endif - -#include <string.h> -#include <stdio.h> -#include <math.h> -#include <stdlib.h> -#include <limits.h> -#include <ctype.h> -#include <float.h> - -#ifdef ENABLE_LOCALES -#include <locale.h> -#endif - -#if defined(_MSC_VER) -#pragma warning(pop) -#endif -#ifdef __GNUC__ -#pragma GCC visibility pop -#endif - -#include "cJSON.h" - -/* define our own boolean type */ -#ifdef true -#undef true -#endif -#define true ((cJSON_bool)1) - -#ifdef false -#undef false -#endif -#define false ((cJSON_bool)0) - -/* define isnan and isinf for ANSI C, if in C99 or above, isnan and isinf has - * been defined in math.h */ -#ifndef isinf -#define isinf(d) (isnan((d - d)) && !isnan(d)) -#endif -#ifndef isnan -#define isnan(d) (d != d) -#endif - -#ifndef NAN -#define NAN 0.0 / 0.0 -#endif - -typedef struct { - const unsigned char *json; - size_t position; -} error; -static error global_error = {NULL, 0}; - -CJSON_PUBLIC(const char *) cJSON_GetErrorPtr(void) { - return (const char *)(global_error.json + global_error.position); -} - -CJSON_PUBLIC(char *) cJSON_GetStringValue(const cJSON *const item) { - if (!cJSON_IsString(item)) { - return NULL; - } - - return item->valuestring; -} - -CJSON_PUBLIC(double) cJSON_GetNumberValue(const cJSON *const item) { - if (!cJSON_IsNumber(item)) { - return (double)NAN; - } - - return item->valuedouble; -} - -/* This is a safeguard to prevent copy-pasters from using incompatible C and - * header files */ -#if (CJSON_VERSION_MAJOR != 1) || (CJSON_VERSION_MINOR != 7) || \ - (CJSON_VERSION_PATCH != 14) -#error cJSON.h and cJSON.c have different versions. Make sure that both have the same. -#endif - -CJSON_PUBLIC(const char *) cJSON_Version(void) { - static char version[15]; - sprintf(version, "%i.%i.%i", CJSON_VERSION_MAJOR, CJSON_VERSION_MINOR, - CJSON_VERSION_PATCH); - - return version; -} - -/* Case insensitive string comparison, doesn't consider two NULL pointers equal - * though */ -static int case_insensitive_strcmp(const unsigned char *string1, - const unsigned char *string2) { - if ((string1 == NULL) || (string2 == NULL)) { - return 1; - } - - if (string1 == string2) { - return 0; - } - - for (; tolower(*string1) == tolower(*string2); - (void)string1++, string2++) { - if (*string1 == '\0') { - return 0; - } - } - - return tolower(*string1) - tolower(*string2); -} - -typedef struct internal_hooks { - void *(CJSON_CDECL *allocate)(size_t size); - void(CJSON_CDECL *deallocate)(void *pointer); - void *(CJSON_CDECL *reallocate)(void *pointer, size_t size); -} internal_hooks; - -#if defined(_MSC_VER) -/* work around MSVC error C2322: '...' address of dllimport '...'
- */
-static void *CJSON_CDECL internal_malloc(size_t size) {
-        return malloc(size);
-}
-static void CJSON_CDECL internal_free(void *pointer) {
-        free(pointer);
-}
-static void *CJSON_CDECL internal_realloc(void *pointer, size_t size) {
-        return realloc(pointer, size);
-}
-#else
-#define internal_malloc malloc
-#define internal_free free
-#define internal_realloc realloc
-#endif
-
-/* strlen of character literals resolved at compile time */
-#define static_strlen(string_literal) (sizeof(string_literal) - sizeof(""))
-
-static internal_hooks global_hooks = {internal_malloc, internal_free,
-                                      internal_realloc};
-
-static unsigned char *cJSON_strdup(const unsigned char *string,
-                                   const internal_hooks *const hooks) {
-        size_t length = 0;
-        unsigned char *copy = NULL;
-
-        if (string == NULL) {
-                return NULL;
-        }
-
-        length = strlen((const char *)string) + sizeof("");
-        copy = (unsigned char *)hooks->allocate(length);
-        if (copy == NULL) {
-                return NULL;
-        }
-        memcpy(copy, string, length);
-
-        return copy;
-}
-
-CJSON_PUBLIC(void) cJSON_InitHooks(cJSON_Hooks *hooks) {
-        if (hooks == NULL) {
-                /* Reset hooks */
-                global_hooks.allocate = malloc;
-                global_hooks.deallocate = free;
-                global_hooks.reallocate = realloc;
-                return;
-        }
-
-        global_hooks.allocate = malloc;
-        if (hooks->malloc_fn != NULL) {
-                global_hooks.allocate = hooks->malloc_fn;
-        }
-
-        global_hooks.deallocate = free;
-        if (hooks->free_fn != NULL) {
-                global_hooks.deallocate = hooks->free_fn;
-        }
-
-        /* use realloc only if both free and malloc are used */
-        global_hooks.reallocate = NULL;
-        if ((global_hooks.allocate == malloc) &&
-            (global_hooks.deallocate == free)) {
-                global_hooks.reallocate = realloc;
-        }
-}
-
-/* Internal constructor. */
-static cJSON *cJSON_New_Item(const internal_hooks *const hooks) {
-        cJSON *node = (cJSON *)hooks->allocate(sizeof(cJSON));
-        if (node) {
-                memset(node, '\0', sizeof(cJSON));
-        }
-
-        return node;
-}
-
-/* Delete a cJSON structure. */
-CJSON_PUBLIC(void) cJSON_Delete(cJSON *item) {
-        cJSON *next = NULL;
-        while (item != NULL) {
-                next = item->next;
-                if (!(item->type & cJSON_IsReference) &&
-                    (item->child != NULL)) {
-                        cJSON_Delete(item->child);
-                }
-                if (!(item->type & cJSON_IsReference) &&
-                    (item->valuestring != NULL)) {
-                        global_hooks.deallocate(item->valuestring);
-                }
-                if (!(item->type & cJSON_StringIsConst) &&
-                    (item->string != NULL)) {
-                        global_hooks.deallocate(item->string);
-                }
-                global_hooks.deallocate(item);
-                item = next;
-        }
-}
-
-/* get the decimal point character of the current locale */
-static unsigned char get_decimal_point(void) {
-#ifdef ENABLE_LOCALES
-        struct lconv *lconv = localeconv();
-        return (unsigned char)lconv->decimal_point[0];
-#else
-        return '.';
-#endif
-}
-
-typedef struct {
-        const unsigned char *content;
-        size_t length;
-        size_t offset;
-        size_t depth; /* How deeply nested (in arrays/objects) is the input at
-                         the current offset. */
-        internal_hooks hooks;
-} parse_buffer;
-
-/* check if the given size is left to read in a given parse buffer (starting
- * with 1) */
-#define can_read(buffer, size) \
-        ((buffer != NULL) && (((buffer)->offset + size) <= (buffer)->length))
-/* check if the buffer can be accessed at the given index (starting with 0) */
-#define can_access_at_index(buffer, index) \
-        ((buffer != NULL) && (((buffer)->offset + index) < (buffer)->length))
-#define cannot_access_at_index(buffer, index) \
-        (!can_access_at_index(buffer, index))
-/* get a pointer to the buffer at the position */
-#define buffer_at_offset(buffer) ((buffer)->content + (buffer)->offset)
-
-/* Parse the input text to generate a number, and populate the result into item.
- */
-static cJSON_bool parse_number(cJSON *const item,
-                               parse_buffer *const input_buffer) {
-        double number = 0;
-        unsigned char *after_end = NULL;
-        unsigned char number_c_string[64];
-        unsigned char decimal_point = get_decimal_point();
-        size_t i = 0;
-
-        if ((input_buffer == NULL) || (input_buffer->content == NULL)) {
-                return false;
-        }
-
-        /* copy the number into a temporary buffer and replace '.' with the
-         * decimal point of the current locale (for strtod)
-         * This also takes care of '\0' not necessarily being available for
-         * marking the end of the input */
-        for (i = 0; (i < (sizeof(number_c_string) - 1)) &&
-                    can_access_at_index(input_buffer, i);
-             i++) {
-                switch (buffer_at_offset(input_buffer)[i]) {
-                case '0':
-                case '1':
-                case '2':
-                case '3':
-                case '4':
-                case '5':
-                case '6':
-                case '7':
-                case '8':
-                case '9':
-                case '+':
-                case '-':
-                case 'e':
-                case 'E':
-                        number_c_string[i] = buffer_at_offset(input_buffer)[i];
-                        break;
-
-                case '.':
-                        number_c_string[i] = decimal_point;
-                        break;
-
-                default:
-                        goto loop_end;
-                }
-        }
-loop_end:
-        number_c_string[i] = '\0';
-
-        number = strtod((const char *)number_c_string, (char **)&after_end);
-        if (number_c_string == after_end) {
-                return false; /* parse_error */
-        }
-
-        item->valuedouble = number;
-
-        /* use saturation in case of overflow */
-        if (number >= INT_MAX) {
-                item->valueint = INT_MAX;
-        } else if (number <= (double)INT_MIN) {
-                item->valueint = INT_MIN;
-        } else {
-                item->valueint = (int)number;
-        }
-
-        item->type = cJSON_Number;
-
-        input_buffer->offset += (size_t)(after_end - number_c_string);
-        return true;
-}
-
-/* don't ask me, but the original cJSON_SetNumberValue returns an integer or
- * double */
-CJSON_PUBLIC(double) cJSON_SetNumberHelper(cJSON *object, double number) {
-        if (number >= INT_MAX) {
-                object->valueint = INT_MAX;
-        } else if (number <= (double)INT_MIN) {
-                object->valueint = INT_MIN;
-        } else {
-                object->valueint = (int)number;
-        }
-
-        return object->valuedouble = number;
-}
-
-CJSON_PUBLIC(char *)
-cJSON_SetValuestring(cJSON *object, const char *valuestring) {
-        char *copy = NULL;
-        /* if object's type is not cJSON_String or is cJSON_IsReference, it
-         * should not set valuestring */
-        if (!(object->type & cJSON_String) ||
-            (object->type & cJSON_IsReference)) {
-                return NULL;
-        }
-        if (strlen(valuestring) <= strlen(object->valuestring)) {
-                strcpy(object->valuestring, valuestring);
-                return object->valuestring;
-        }
-        copy = (char *)cJSON_strdup((const unsigned char *)valuestring,
-                                    &global_hooks);
-        if (copy == NULL) {
-                return NULL;
-        }
-        if (object->valuestring != NULL) {
-                cJSON_free(object->valuestring);
-        }
-        object->valuestring = copy;
-
-        return copy;
-}
-
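The deleted parse_number/cJSON_SetNumberHelper pair above clamps valueint to INT_MAX/INT_MIN rather than letting an out-of-range double-to-int cast invoke undefined behavior. A minimal standalone sketch of that clamping (a hypothetical helper, not part of the removed file):

    #include <limits.h>

    /* sketch: saturate a double into an int the way cJSON fills valueint */
    static int saturate_to_int(double number) {
            if (number >= INT_MAX) {
                    return INT_MAX; /* clamp high */
            }
            if (number <= (double)INT_MIN) {
                    return INT_MIN; /* clamp low */
            }
            return (int)number; /* in-range, safe narrowing */
    }
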
-typedef struct {
-        unsigned char *buffer;
-        size_t length;
-        size_t offset;
-        size_t depth; /* current nesting depth (for formatted printing) */
-        cJSON_bool noalloc;
-        cJSON_bool format; /* is this print a formatted print */
-        internal_hooks hooks;
-} printbuffer;
-
-/* realloc printbuffer if necessary to have at least "needed" bytes more */
-static unsigned char *ensure(printbuffer *const p, size_t needed) {
-        unsigned char *newbuffer = NULL;
-        size_t newsize = 0;
-
-        if ((p == NULL) || (p->buffer == NULL)) {
-                return NULL;
-        }
-
-        if ((p->length > 0) && (p->offset >= p->length)) {
-                /* make sure that offset is valid */
-                return NULL;
-        }
-
-        if (needed > INT_MAX) {
-                /* sizes bigger than INT_MAX are currently not supported */
-                return NULL;
-        }
-
-        needed += p->offset + 1;
-        if (needed <= p->length) {
-                return p->buffer + p->offset;
-        }
-
-        if (p->noalloc) {
-                return NULL;
-        }
-
-        /* calculate new buffer size */
-        if (needed > (INT_MAX / 2)) {
-                /* overflow of int, use INT_MAX if possible */
-                if (needed <= INT_MAX) {
-                        newsize = INT_MAX;
-                } else {
-                        return NULL;
-                }
-        } else {
-                newsize = needed * 2;
-        }
-
-        if (p->hooks.reallocate != NULL) {
-                /* reallocate with realloc if available */
-                newbuffer =
-                    (unsigned char *)p->hooks.reallocate(p->buffer, newsize);
-                if (newbuffer == NULL) {
-                        p->hooks.deallocate(p->buffer);
-                        p->length = 0;
-                        p->buffer = NULL;
-
-                        return NULL;
-                }
-        } else {
-                /* otherwise reallocate manually */
-                newbuffer = (unsigned char *)p->hooks.allocate(newsize);
-                if (!newbuffer) {
-                        p->hooks.deallocate(p->buffer);
-                        p->length = 0;
-                        p->buffer = NULL;
-
-                        return NULL;
-                }
-                if (newbuffer) {
-                        memcpy(newbuffer, p->buffer, p->offset + 1);
-                }
-                p->hooks.deallocate(p->buffer);
-        }
-        p->length = newsize;
-        p->buffer = newbuffer;
-
-        return newbuffer + p->offset;
-}
-
-/* calculate the new length of the string in a printbuffer and update the offset
- */
-static void update_offset(printbuffer *const buffer) {
-        const unsigned char *buffer_pointer = NULL;
-        if ((buffer == NULL) || (buffer->buffer == NULL)) {
-                return;
-        }
-        buffer_pointer = buffer->buffer + buffer->offset;
-
-        buffer->offset += strlen((const char *)buffer_pointer);
-}
-
-/* securely comparison of floating-point variables */
-static cJSON_bool compare_double(double a, double b) {
-        double maxVal = fabs(a) > fabs(b) ? fabs(a) : fabs(b);
-        return (fabs(a - b) <= maxVal * DBL_EPSILON);
-}
-
-/* Render the number nicely from the given item into a string. */
-static cJSON_bool print_number(const cJSON *const item,
-                               printbuffer *const output_buffer) {
-        unsigned char *output_pointer = NULL;
-        double d = item->valuedouble;
-        int length = 0;
-        size_t i = 0;
-        unsigned char number_buffer[26] = {
-            0}; /* temporary buffer to print the number into */
-        unsigned char decimal_point = get_decimal_point();
-        double test = 0.0;
-
-        if (output_buffer == NULL) {
-                return false;
-        }
-
-        /* This checks for NaN and Infinity */
-        if (isnan(d) || isinf(d)) {
-                length = sprintf((char *)number_buffer, "null");
-        } else {
-                /* Try 15 decimal places of precision to avoid nonsignificant
-                 * nonzero digits */
-                length = sprintf((char *)number_buffer, "%1.15g", d);
-
-                /* Check whether the original double can be recovered */
-                if ((sscanf((char *)number_buffer, "%lg", &test) != 1) ||
-                    !compare_double((double)test, d)) {
-                        /* If not, print with 17 decimal places of precision */
-                        length = sprintf((char *)number_buffer, "%1.17g", d);
-                }
-        }
-
-        /* sprintf failed or buffer overrun occurred */
-        if ((length < 0) || (length > (int)(sizeof(number_buffer) - 1))) {
-                return false;
-        }
-
-        /* reserve appropriate space in the output */
-        output_pointer = ensure(output_buffer, (size_t)length + sizeof(""));
-        if (output_pointer == NULL) {
-                return false;
-        }
-
-        /* copy the printed number to the output and replace locale
-         * dependent decimal point with '.' */
-        for (i = 0; i < ((size_t)length); i++) {
-                if (number_buffer[i] == decimal_point) {
-                        output_pointer[i] = '.';
-                        continue;
-                }
-
-                output_pointer[i] = number_buffer[i];
-        }
-        output_pointer[i] = '\0';
-
-        output_buffer->offset += (size_t)length;
-
-        return true;
-}
-
-/* parse 4 digit hexadecimal number */
-static unsigned parse_hex4(const unsigned char *const input) {
-        unsigned int h = 0;
-        size_t i = 0;
-
-        for (i = 0; i < 4; i++) {
-                /* parse digit */
-                if ((input[i] >= '0') && (input[i] <= '9')) {
-                        h += (unsigned int)input[i] - '0';
-                } else if ((input[i] >= 'A') && (input[i] <= 'F')) {
-                        h += (unsigned int)10 + input[i] - 'A';
-                } else if ((input[i] >= 'a') && (input[i] <= 'f')) {
-                        h += (unsigned int)10 + input[i] - 'a';
-                } else /* invalid */
-                {
-                        return 0;
-                }
-
-                if (i < 3) {
-                        /* shift left to make place for the next nibble */
-                        h = h << 4;
-                }
-        }
-
-        return h;
-}
-
-/* converts a UTF-16 literal to UTF-8
- * A literal can be one or two sequences of the form \uXXXX */
-static unsigned char
-utf16_literal_to_utf8(const unsigned char *const input_pointer,
-                      const unsigned char *const input_end,
-                      unsigned char **output_pointer) {
-        long unsigned int codepoint = 0;
-        unsigned int first_code = 0;
-        const unsigned char *first_sequence = input_pointer;
-        unsigned char utf8_length = 0;
-        unsigned char utf8_position = 0;
-        unsigned char sequence_length = 0;
-        unsigned char first_byte_mark = 0;
-
-        if ((input_end - first_sequence) < 6) {
-                /* input ends unexpectedly */
-                goto fail;
-        }
-
-        /* get the first utf16 sequence */
-        first_code = parse_hex4(first_sequence + 2);
-
-        /* check that the code is valid */
-        if (((first_code >= 0xDC00) && (first_code <= 0xDFFF))) {
-                goto fail;
-        }
-
-        /* UTF16 surrogate pair */
-        if ((first_code >= 0xD800) && (first_code <= 0xDBFF)) {
-                const unsigned char *second_sequence = first_sequence + 6;
-                unsigned int second_code = 0;
-                sequence_length = 12; /* \uXXXX\uXXXX */
-
-                if ((input_end - second_sequence) < 6) {
-                        /* input ends unexpectedly */
-                        goto fail;
-                }
-
-                if ((second_sequence[0] != '\\') ||
-                    (second_sequence[1] != 'u')) {
-                        /* missing second half of the surrogate pair */
-                        goto fail;
-                }
-
-                /* get the second utf16 sequence */
-                second_code = parse_hex4(second_sequence + 2);
-                /* check that the code is valid */
-                if ((second_code < 0xDC00) || (second_code > 0xDFFF)) {
-                        /* invalid second half of the surrogate pair */
-                        goto fail;
-                }
-
-
-                /* calculate the unicode codepoint from the surrogate pair */
-                codepoint = 0x10000 + (((first_code & 0x3FF) << 10) |
-                                       (second_code & 0x3FF));
-        } else {
-                sequence_length = 6; /* \uXXXX */
-                codepoint = first_code;
-        }
-
-        /* encode as UTF-8
-         * takes at maximum 4 bytes to encode:
-         * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
-        if (codepoint < 0x80) {
-                /* normal ascii, encoding 0xxxxxxx */
-                utf8_length = 1;
-        } else if (codepoint < 0x800) {
-                /* two bytes, encoding 110xxxxx 10xxxxxx */
-                utf8_length = 2;
-                first_byte_mark = 0xC0; /* 11000000 */
-        } else if (codepoint < 0x10000) {
-                /* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */
-                utf8_length = 3;
-                first_byte_mark = 0xE0; /* 11100000 */
-        } else if (codepoint <= 0x10FFFF) {
-                /* four bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx */
-                utf8_length = 4;
-                first_byte_mark = 0xF0; /* 11110000 */
-        } else {
-                /* invalid unicode codepoint */
-                goto fail;
-        }
-
-        /* encode as utf8 */
-        for (utf8_position = (unsigned char)(utf8_length - 1);
-             utf8_position > 0; utf8_position--) {
-                /* 10xxxxxx */
-                (*output_pointer)[utf8_position] =
-                    (unsigned char)((codepoint | 0x80) & 0xBF);
-                codepoint >>= 6;
-        }
-        /* encode first byte */
-        if (utf8_length > 1) {
-                (*output_pointer)[0] =
-                    (unsigned char)((codepoint | first_byte_mark) & 0xFF);
-        } else {
-                (*output_pointer)[0] = (unsigned char)(codepoint & 0x7F);
-        }
-
-        *output_pointer += utf8_length;
-
-        return sequence_length;
-
-fail:
-        return 0;
-}
-
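utf16_literal_to_utf8 above folds a \uXXXX\uXXXX surrogate pair into a single codepoint before emitting UTF-8. A standalone sketch of just the codepoint arithmetic (combine_surrogates is a hypothetical helper assuming already-validated surrogates, not part of the removed file):

    #include <assert.h>

    /* sketch: combine a UTF-16 surrogate pair into a Unicode codepoint */
    static unsigned long combine_surrogates(unsigned first, unsigned second) {
            /* first in [0xD800,0xDBFF], second in [0xDC00,0xDFFF] */
            return 0x10000 + (((first & 0x3FF) << 10) | (second & 0x3FF));
    }

    int main(void) {
            /* "\uD83D\uDE00" encodes U+1F600 */
            assert(combine_surrogates(0xD83D, 0xDE00) == 0x1F600);
            return 0;
    }
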
-/* Parse the input text into an unescaped cinput, and populate item. */
-static cJSON_bool parse_string(cJSON *const item,
-                               parse_buffer *const input_buffer) {
-        const unsigned char *input_pointer = buffer_at_offset(input_buffer) + 1;
-        const unsigned char *input_end = buffer_at_offset(input_buffer) + 1;
-        unsigned char *output_pointer = NULL;
-        unsigned char *output = NULL;
-
-        /* not a string */
-        if (buffer_at_offset(input_buffer)[0] != '\"') {
-                goto fail;
-        }
-
-        {
-                /* calculate approximate size of the output (overestimate) */
-                size_t allocation_length = 0;
-                size_t skipped_bytes = 0;
-                while (((size_t)(input_end - input_buffer->content) <
-                        input_buffer->length) &&
-                       (*input_end != '\"')) {
-                        /* is escape sequence */
-                        if (input_end[0] == '\\') {
-                                if ((size_t)(input_end + 1 -
-                                             input_buffer->content) >=
-                                    input_buffer->length) {
-                                        /* prevent buffer overflow when last
-                                         * input character is a backslash */
-                                        goto fail;
-                                }
-                                skipped_bytes++;
-                                input_end++;
-                        }
-                        input_end++;
-                }
-                if (((size_t)(input_end - input_buffer->content) >=
-                     input_buffer->length) ||
-                    (*input_end != '\"')) {
-                        goto fail; /* string ended unexpectedly */
-                }
-
-                /* This is at most how much we need for the output */
-                allocation_length =
-                    (size_t)(input_end - buffer_at_offset(input_buffer)) -
-                    skipped_bytes;
-                output = (unsigned char *)input_buffer->hooks.allocate(
-                    allocation_length + sizeof(""));
-                if (output == NULL) {
-                        goto fail; /* allocation failure */
-                }
-        }
-
-        output_pointer = output;
-        /* loop through the string literal */
-        while (input_pointer < input_end) {
-                if (*input_pointer != '\\') {
-                        *output_pointer++ = *input_pointer++;
-                }
-                /* escape sequence */
-                else {
-                        unsigned char sequence_length = 2;
-                        if ((input_end - input_pointer) < 1) {
-                                goto fail;
-                        }
-
-                        switch (input_pointer[1]) {
-                        case 'b':
-                                *output_pointer++ = '\b';
-                                break;
-                        case 'f':
-                                *output_pointer++ = '\f';
-                                break;
-                        case 'n':
-                                *output_pointer++ = '\n';
-                                break;
-                        case 'r':
-                                *output_pointer++ = '\r';
-                                break;
-                        case 't':
-                                *output_pointer++ = '\t';
-                                break;
-                        case '\"':
-                        case '\\':
-                        case '/':
-                                *output_pointer++ = input_pointer[1];
-                                break;
-
-                        /* UTF-16 literal */
-                        case 'u':
-                                sequence_length = utf16_literal_to_utf8(
-                                    input_pointer, input_end, &output_pointer);
-                                if (sequence_length == 0) {
-                                        /* failed to convert UTF16-literal to
-                                         * UTF-8 */
-                                        goto fail;
-                                }
-                                break;
-
-                        default:
-                                goto fail;
-                        }
-                        input_pointer += sequence_length;
-                }
-        }
-
-        /* zero terminate the output */
-        *output_pointer = '\0';
-
-        item->type = cJSON_String;
-        item->valuestring = (char *)output;
-
-        input_buffer->offset = (size_t)(input_end - input_buffer->content);
-        input_buffer->offset++;
-
-        return true;
-
-fail:
-        if (output != NULL) {
-                input_buffer->hooks.deallocate(output);
-        }
-
-        if (input_pointer != NULL) {
-                input_buffer->offset =
-                    (size_t)(input_pointer - input_buffer->content);
-        }
-
-        return false;
-}
-
-/* Render the cstring provided to an escaped version that can be printed. */
-static cJSON_bool print_string_ptr(const unsigned char *const input,
-                                   printbuffer *const output_buffer) {
-        const unsigned char *input_pointer = NULL;
-        unsigned char *output = NULL;
-        unsigned char *output_pointer = NULL;
-        size_t output_length = 0;
-        /* numbers of additional characters needed for escaping */
-        size_t escape_characters = 0;
-
-        if (output_buffer == NULL) {
-                return false;
-        }
-
-        /* empty string */
-        if (input == NULL) {
-                output = ensure(output_buffer, sizeof("\"\""));
-                if (output == NULL) {
-                        return false;
-                }
-                strcpy((char *)output, "\"\"");
-
-                return true;
-        }
-
-        /* set "flag" to 1 if something needs to be escaped */
-        for (input_pointer = input; *input_pointer; input_pointer++) {
-                switch (*input_pointer) {
-                case '\"':
-                case '\\':
-                case '\b':
-                case '\f':
-                case '\n':
-                case '\r':
-                case '\t':
-                        /* one character escape sequence */
-                        escape_characters++;
-                        break;
-                default:
-                        if (*input_pointer < 32) {
-                                /* UTF-16 escape sequence uXXXX */
-                                escape_characters += 5;
-                        }
-                        break;
-                }
-        }
-        output_length = (size_t)(input_pointer - input) + escape_characters;
-
-        output = ensure(output_buffer, output_length + sizeof("\"\""));
-        if (output == NULL) {
-                return false;
-        }
-
-        /* no characters have to be escaped */
-        if (escape_characters == 0) {
-                output[0] = '\"';
-                memcpy(output + 1, input, output_length);
-                output[output_length + 1] = '\"';
-                output[output_length + 2] = '\0';
-
-                return true;
-        }
-
-        output[0] = '\"';
-        output_pointer = output + 1;
-        /* copy the string */
-        for (input_pointer = input; *input_pointer != '\0';
-             (void)input_pointer++, output_pointer++) {
-                if ((*input_pointer > 31) && (*input_pointer != '\"') &&
-                    (*input_pointer != '\\')) {
-                        /* normal character, copy */
-                        *output_pointer = *input_pointer;
-                } else {
-                        /* character needs to be escaped */
-                        *output_pointer++ = '\\';
-                        switch (*input_pointer) {
-                        case '\\':
-                                *output_pointer = '\\';
-                                break;
-                        case '\"':
-                                *output_pointer = '\"';
-                                break;
-                        case '\b':
-                                *output_pointer = 'b';
-                                break;
-                        case '\f':
-                                *output_pointer = 'f';
-                                break;
-                        case '\n':
-                                *output_pointer = 'n';
-                                break;
-                        case '\r':
-                                *output_pointer = 'r';
-                                break;
-                        case '\t':
-                                *output_pointer = 't';
-                                break;
-                        default:
-                                /* escape and print as unicode codepoint */
-                                sprintf((char *)output_pointer, "u%04x",
-                                        *input_pointer);
-                                output_pointer += 4;
-                                break;
-                        }
-                }
-        }
-        output[output_length + 1] = '\"';
-        output[output_length + 2] = '\0';
-
-        return true;
-}
-
-/* Invoke print_string_ptr (which is useful) on an item. */
-static cJSON_bool print_string(const cJSON *const item, printbuffer *const p) {
-        return print_string_ptr((unsigned char *)item->valuestring, p);
-}
-
-/* Predeclare these prototypes. */
-static cJSON_bool parse_value(cJSON *const item,
-                              parse_buffer *const input_buffer);
-static cJSON_bool print_value(const cJSON *const item,
-                              printbuffer *const output_buffer);
-static cJSON_bool parse_array(cJSON *const item,
-                              parse_buffer *const input_buffer);
-static cJSON_bool print_array(const cJSON *const item,
-                              printbuffer *const output_buffer);
-static cJSON_bool parse_object(cJSON *const item,
-                               parse_buffer *const input_buffer);
-static cJSON_bool print_object(const cJSON *const item,
-                               printbuffer *const output_buffer);
-
-/* Utility to jump whitespace and cr/lf */
-static parse_buffer *buffer_skip_whitespace(parse_buffer *const buffer) {
-        if ((buffer == NULL) || (buffer->content == NULL)) {
-                return NULL;
-        }
-
-        if (cannot_access_at_index(buffer, 0)) {
-                return buffer;
-        }
-
-        while (can_access_at_index(buffer, 0) &&
-               (buffer_at_offset(buffer)[0] <= 32)) {
-                buffer->offset++;
-        }
-
-        if (buffer->offset == buffer->length) {
-                buffer->offset--;
-        }
-
-        return buffer;
-}
-
-/* skip the UTF-8 BOM (byte order mark) if it is at the beginning of a buffer */
-static parse_buffer *skip_utf8_bom(parse_buffer *const buffer) {
-        if ((buffer == NULL) || (buffer->content == NULL) ||
-            (buffer->offset != 0)) {
-                return NULL;
-        }
-
-        if (can_access_at_index(buffer, 4) &&
-            (strncmp((const char *)buffer_at_offset(buffer), "\xEF\xBB\xBF",
-                     3) == 0)) {
-                buffer->offset += 3;
-        }
-
-        return buffer;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_ParseWithOpts(const char *value,
-                    const char **return_parse_end,
-                    cJSON_bool require_null_terminated) {
-        size_t buffer_length;
-
-        if (NULL == value) {
-                return NULL;
-        }
-
-        /* Adding null character size due to require_null_terminated. */
-        buffer_length = strlen(value) + sizeof("");
-
-        return cJSON_ParseWithLengthOpts(value, buffer_length, return_parse_end,
-                                         require_null_terminated);
-}
-
-/* Parse an object - create a new root, and populate. */
-CJSON_PUBLIC(cJSON *)
-cJSON_ParseWithLengthOpts(const char *value,
-                          size_t buffer_length,
-                          const char **return_parse_end,
-                          cJSON_bool require_null_terminated) {
-        parse_buffer buffer = {0, 0, 0, 0, {0, 0, 0}};
-        cJSON *item = NULL;
-
-        /* reset error position */
-        global_error.json = NULL;
-        global_error.position = 0;
-
-        if (value == NULL || 0 == buffer_length) {
-                goto fail;
-        }
-
-        buffer.content = (const unsigned char *)value;
-        buffer.length = buffer_length;
-        buffer.offset = 0;
-        buffer.hooks = global_hooks;
-
-        item = cJSON_New_Item(&global_hooks);
-        if (item == NULL) /* memory fail */
-        {
-                goto fail;
-        }
-
-        if (!parse_value(item,
-                         buffer_skip_whitespace(skip_utf8_bom(&buffer)))) {
-                /* parse failure. ep is set. */
-                goto fail;
-        }
-
-        /* if we require null-terminated JSON without appended garbage, skip and
-         * then check for a null terminator */
-        if (require_null_terminated) {
-                buffer_skip_whitespace(&buffer);
-                if ((buffer.offset >= buffer.length) ||
-                    buffer_at_offset(&buffer)[0] != '\0') {
-                        goto fail;
-                }
-        }
-        if (return_parse_end) {
-                *return_parse_end = (const char *)buffer_at_offset(&buffer);
-        }
-
-        return item;
-
-fail:
-        if (item != NULL) {
-                cJSON_Delete(item);
-        }
-
-        if (value != NULL) {
-                error local_error;
-                local_error.json = (const unsigned char *)value;
-                local_error.position = 0;
-
-                if (buffer.offset < buffer.length) {
-                        local_error.position = buffer.offset;
-                } else if (buffer.length > 0) {
-                        local_error.position = buffer.length - 1;
-                }
-
-                if (return_parse_end != NULL) {
-                        *return_parse_end = (const char *)local_error.json +
-                                            local_error.position;
-                }
-
-                global_error = local_error;
-        }
-
-        return NULL;
-}
-
-/* Default options for cJSON_Parse */
-CJSON_PUBLIC(cJSON *) cJSON_Parse(const char *value) {
-        return cJSON_ParseWithOpts(value, 0, 0);
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_ParseWithLength(const char *value, size_t buffer_length) {
-        return cJSON_ParseWithLengthOpts(value, buffer_length, 0, 0);
-}
-
-#define cjson_min(a, b) (((a) < (b)) ? (a) : (b))
-
-static unsigned char *print(const cJSON *const item,
-                            cJSON_bool format,
-                            const internal_hooks *const hooks) {
-        static const size_t default_buffer_size = 256;
-        printbuffer buffer[1];
-        unsigned char *printed = NULL;
-
-        memset(buffer, 0, sizeof(buffer));
-
-        /* create buffer */
-        buffer->buffer = (unsigned char *)hooks->allocate(default_buffer_size);
-        buffer->length = default_buffer_size;
-        buffer->format = format;
-        buffer->hooks = *hooks;
-        if (buffer->buffer == NULL) {
-                goto fail;
-        }
-
-        /* print the value */
-        if (!print_value(item, buffer)) {
-                goto fail;
-        }
-        update_offset(buffer);
-
-        /* check if reallocate is available */
-        if (hooks->reallocate != NULL) {
-                printed = (unsigned char *)hooks->reallocate(
-                    buffer->buffer, buffer->offset + 1);
-                if (printed == NULL) {
-                        goto fail;
-                }
-                buffer->buffer = NULL;
-        } else /* otherwise copy the JSON over to a new buffer */
-        {
-                printed = (unsigned char *)hooks->allocate(buffer->offset + 1);
-                if (printed == NULL) {
-                        goto fail;
-                }
-                memcpy(printed, buffer->buffer,
-                       cjson_min(buffer->length, buffer->offset + 1));
-                printed[buffer->offset] = '\0'; /* just to be sure */
-
-                /* free the buffer */
-                hooks->deallocate(buffer->buffer);
-        }
-
-        return printed;
-
-fail:
-        if (buffer->buffer != NULL) {
-                hooks->deallocate(buffer->buffer);
-        }
-
-        if (printed != NULL) {
-                hooks->deallocate(printed);
-        }
-
-        return NULL;
-}
-
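The parse and print halves above meet in the public entry points that follow. A minimal round-trip usage sketch against this copy's public API (the JSON literal is illustrative):

    #include <stdio.h>
    #include "cJSON.h"

    int main(void) {
            cJSON *root = cJSON_Parse("{\"name\":\"kafka\",\"port\":9092}");
            char *pretty = NULL;
            if (root == NULL) {
                    /* cJSON_GetErrorPtr points into the original input */
                    fprintf(stderr, "parse error near: %s\n", cJSON_GetErrorPtr());
                    return 1;
            }
            pretty = cJSON_Print(root); /* formatted; caller must free */
            if (pretty != NULL) {
                    puts(pretty);
                    cJSON_free(pretty); /* free with the matching hook */
            }
            cJSON_Delete(root);
            return 0;
    }
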
-/* Render a cJSON item/entity/structure to text. */
-CJSON_PUBLIC(char *) cJSON_Print(const cJSON *item) {
-        return (char *)print(item, true, &global_hooks);
-}
-
-CJSON_PUBLIC(char *) cJSON_PrintUnformatted(const cJSON *item) {
-        return (char *)print(item, false, &global_hooks);
-}
-
-CJSON_PUBLIC(char *)
-cJSON_PrintBuffered(const cJSON *item, int prebuffer, cJSON_bool fmt) {
-        printbuffer p = {0, 0, 0, 0, 0, 0, {0, 0, 0}};
-
-        if (prebuffer < 0) {
-                return NULL;
-        }
-
-        p.buffer = (unsigned char *)global_hooks.allocate((size_t)prebuffer);
-        if (!p.buffer) {
-                return NULL;
-        }
-
-        p.length = (size_t)prebuffer;
-        p.offset = 0;
-        p.noalloc = false;
-        p.format = fmt;
-        p.hooks = global_hooks;
-
-        if (!print_value(item, &p)) {
-                global_hooks.deallocate(p.buffer);
-                return NULL;
-        }
-
-        return (char *)p.buffer;
-}
-
-CJSON_PUBLIC(cJSON_bool)
-cJSON_PrintPreallocated(cJSON *item,
-                        char *buffer,
-                        const int length,
-                        const cJSON_bool format) {
-        printbuffer p = {0, 0, 0, 0, 0, 0, {0, 0, 0}};
-
-        if ((length < 0) || (buffer == NULL)) {
-                return false;
-        }
-
-        p.buffer = (unsigned char *)buffer;
-        p.length = (size_t)length;
-        p.offset = 0;
-        p.noalloc = true;
-        p.format = format;
-        p.hooks = global_hooks;
-
-        return print_value(item, &p);
-}
-
-/* Parser core - when encountering text, process appropriately. */
-static cJSON_bool parse_value(cJSON *const item,
-                              parse_buffer *const input_buffer) {
-        if ((input_buffer == NULL) || (input_buffer->content == NULL)) {
-                return false; /* no input */
-        }
-
-        /* parse the different types of values */
-        /* null */
-        if (can_read(input_buffer, 4) &&
-            (strncmp((const char *)buffer_at_offset(input_buffer), "null", 4) ==
-             0)) {
-                item->type = cJSON_NULL;
-                input_buffer->offset += 4;
-                return true;
-        }
-        /* false */
-        if (can_read(input_buffer, 5) &&
-            (strncmp((const char *)buffer_at_offset(input_buffer), "false",
-                     5) == 0)) {
-                item->type = cJSON_False;
-                input_buffer->offset += 5;
-                return true;
-        }
-        /* true */
-        if (can_read(input_buffer, 4) &&
-            (strncmp((const char *)buffer_at_offset(input_buffer), "true", 4) ==
-             0)) {
-                item->type = cJSON_True;
-                item->valueint = 1;
-                input_buffer->offset += 4;
-                return true;
-        }
-        /* string */
-        if (can_access_at_index(input_buffer, 0) &&
-            (buffer_at_offset(input_buffer)[0] == '\"')) {
-                return parse_string(item, input_buffer);
-        }
-        /* number */
-        if (can_access_at_index(input_buffer, 0) &&
-            ((buffer_at_offset(input_buffer)[0] == '-') ||
-             ((buffer_at_offset(input_buffer)[0] >= '0') &&
-              (buffer_at_offset(input_buffer)[0] <= '9')))) {
-                return parse_number(item, input_buffer);
-        }
-        /* array */
-        if (can_access_at_index(input_buffer, 0) &&
-            (buffer_at_offset(input_buffer)[0] == '[')) {
-                return parse_array(item, input_buffer);
-        }
-        /* object */
-        if (can_access_at_index(input_buffer, 0) &&
-            (buffer_at_offset(input_buffer)[0] == '{')) {
-                return parse_object(item, input_buffer);
-        }
-
-        return false;
-}
-
-/* Render a value to text. */
-static cJSON_bool print_value(const cJSON *const item,
-                              printbuffer *const output_buffer) {
-        unsigned char *output = NULL;
-
-        if ((item == NULL) || (output_buffer == NULL)) {
-                return false;
-        }
-
-        switch ((item->type) & 0xFF) {
-        case cJSON_NULL:
-                output = ensure(output_buffer, 5);
-                if (output == NULL) {
-                        return false;
-                }
-                strcpy((char *)output, "null");
-                return true;
-
-        case cJSON_False:
-                output = ensure(output_buffer, 6);
-                if (output == NULL) {
-                        return false;
-                }
-                strcpy((char *)output, "false");
-                return true;
-
-        case cJSON_True:
-                output = ensure(output_buffer, 5);
-                if (output == NULL) {
-                        return false;
-                }
-                strcpy((char *)output, "true");
-                return true;
-
-        case cJSON_Number:
-                return print_number(item, output_buffer);
-
-        case cJSON_Raw: {
-                size_t raw_length = 0;
-                if (item->valuestring == NULL) {
-                        return false;
-                }
-
-                raw_length = strlen(item->valuestring) + sizeof("");
-                output = ensure(output_buffer, raw_length);
-                if (output == NULL) {
-                        return false;
-                }
-                memcpy(output, item->valuestring, raw_length);
-                return true;
-        }
-
-        case cJSON_String:
-                return print_string(item, output_buffer);
-
-        case cJSON_Array:
-                return print_array(item, output_buffer);
-
-        case cJSON_Object:
-                return print_object(item, output_buffer);
-
-        default:
-                return false;
-        }
-}
-
-/* Build an array from input text. */
-static cJSON_bool parse_array(cJSON *const item,
-                              parse_buffer *const input_buffer) {
-        cJSON *head = NULL; /* head of the linked list */
-        cJSON *current_item = NULL;
-
-        if (input_buffer->depth >= CJSON_NESTING_LIMIT) {
-                return false; /* to deeply nested */
-        }
-        input_buffer->depth++;
-
-        if (buffer_at_offset(input_buffer)[0] != '[') {
-                /* not an array */
-                goto fail;
-        }
-
-        input_buffer->offset++;
-        buffer_skip_whitespace(input_buffer);
-        if (can_access_at_index(input_buffer, 0) &&
-            (buffer_at_offset(input_buffer)[0] == ']')) {
-                /* empty array */
-                goto success;
-        }
-
-        /* check if we skipped to the end of the buffer */
-        if (cannot_access_at_index(input_buffer, 0)) {
-                input_buffer->offset--;
-                goto fail;
-        }
-
-        /* step back to character in front of the first element */
-        input_buffer->offset--;
-        /* loop through the comma separated array elements */
-        do {
-                /* allocate next item */
-                cJSON *new_item = cJSON_New_Item(&(input_buffer->hooks));
-                if (new_item == NULL) {
-                        goto fail; /* allocation failure */
-                }
-
-                /* attach next item to list */
-                if (head == NULL) {
-                        /* start the linked list */
-                        current_item = head = new_item;
-                } else {
-                        /* add to the end and advance */
-                        current_item->next = new_item;
-                        new_item->prev = current_item;
-                        current_item = new_item;
-                }
-
-                /* parse next value */
-                input_buffer->offset++;
-                buffer_skip_whitespace(input_buffer);
-                if (!parse_value(current_item, input_buffer)) {
-                        goto fail; /* failed to parse value */
-                }
-                buffer_skip_whitespace(input_buffer);
-        } while (can_access_at_index(input_buffer, 0) &&
-                 (buffer_at_offset(input_buffer)[0] == ','));
-
-        if (cannot_access_at_index(input_buffer, 0) ||
-            buffer_at_offset(input_buffer)[0] != ']') {
-                goto fail; /* expected end of array */
-        }
-
-success:
-        input_buffer->depth--;
-
-        if (head != NULL) {
-                head->prev = current_item;
-        }
-
-        item->type = cJSON_Array;
-        item->child = head;
-
-        input_buffer->offset++;
-
-        return true;
-
-fail:
-        if (head != NULL) {
-                cJSON_Delete(head);
-        }
-
-        return false;
-}
-
-/* Render an array to text */
-static cJSON_bool print_array(const cJSON *const item,
-                              printbuffer *const output_buffer) {
-        unsigned char *output_pointer = NULL;
-        size_t length = 0;
-        cJSON *current_element = item->child;
-
-        if (output_buffer == NULL) {
-                return false;
-        }
-
-        /* Compose the output array. */
-        /* opening square bracket */
-        output_pointer = ensure(output_buffer, 1);
-        if (output_pointer == NULL) {
-                return false;
-        }
-
-        *output_pointer = '[';
-        output_buffer->offset++;
-        output_buffer->depth++;
-
-        while (current_element != NULL) {
-                if (!print_value(current_element, output_buffer)) {
-                        return false;
-                }
-                update_offset(output_buffer);
-                if (current_element->next) {
-                        length = (size_t)(output_buffer->format ? 2 : 1);
-                        output_pointer = ensure(output_buffer, length + 1);
-                        if (output_pointer == NULL) {
-                                return false;
-                        }
-                        *output_pointer++ = ',';
-                        if (output_buffer->format) {
-                                *output_pointer++ = ' ';
-                        }
-                        *output_pointer = '\0';
-                        output_buffer->offset += length;
-                }
-                current_element = current_element->next;
-        }
-
-        output_pointer = ensure(output_buffer, 2);
-        if (output_pointer == NULL) {
-                return false;
-        }
-        *output_pointer++ = ']';
-        *output_pointer = '\0';
-        output_buffer->depth--;
-
-        return true;
-}
-
-/* Build an object from the text. */
-static cJSON_bool parse_object(cJSON *const item,
-                               parse_buffer *const input_buffer) {
-        cJSON *head = NULL; /* linked list head */
-        cJSON *current_item = NULL;
-
-        if (input_buffer->depth >= CJSON_NESTING_LIMIT) {
-                return false; /* to deeply nested */
-        }
-        input_buffer->depth++;
-
-        if (cannot_access_at_index(input_buffer, 0) ||
-            (buffer_at_offset(input_buffer)[0] != '{')) {
-                goto fail; /* not an object */
-        }
-
-        input_buffer->offset++;
-        buffer_skip_whitespace(input_buffer);
-        if (can_access_at_index(input_buffer, 0) &&
-            (buffer_at_offset(input_buffer)[0] == '}')) {
-                goto success; /* empty object */
-        }
-
-        /* check if we skipped to the end of the buffer */
-        if (cannot_access_at_index(input_buffer, 0)) {
-                input_buffer->offset--;
-                goto fail;
-        }
-
-        /* step back to character in front of the first element */
-        input_buffer->offset--;
-        /* loop through the comma separated array elements */
-        do {
-                /* allocate next item */
-                cJSON *new_item = cJSON_New_Item(&(input_buffer->hooks));
-                if (new_item == NULL) {
-                        goto fail; /* allocation failure */
-                }
-
-                /* attach next item to list */
-                if (head == NULL) {
-                        /* start the linked list */
-                        current_item = head = new_item;
-                } else {
-                        /* add to the end and advance */
-                        current_item->next = new_item;
-                        new_item->prev = current_item;
-                        current_item = new_item;
-                }
-
-                /* parse the name of the child */
-                input_buffer->offset++;
-                buffer_skip_whitespace(input_buffer);
-                if (!parse_string(current_item, input_buffer)) {
-                        goto fail; /* failed to parse name */
-                }
-                buffer_skip_whitespace(input_buffer);
-
-                /* swap valuestring and string, because we parsed the name */
-                current_item->string = current_item->valuestring;
-                current_item->valuestring = NULL;
-
-                if (cannot_access_at_index(input_buffer, 0) ||
-                    (buffer_at_offset(input_buffer)[0] != ':')) {
-                        goto fail; /* invalid object */
-                }
-
-                /* parse the value */
-                input_buffer->offset++;
-                buffer_skip_whitespace(input_buffer);
-                if (!parse_value(current_item, input_buffer)) {
-                        goto fail; /* failed to parse value */
-                }
-                buffer_skip_whitespace(input_buffer);
-        } while (can_access_at_index(input_buffer, 0) &&
-                 (buffer_at_offset(input_buffer)[0] == ','));
-
-        if (cannot_access_at_index(input_buffer, 0) ||
-            (buffer_at_offset(input_buffer)[0] != '}')) {
-                goto fail; /* expected end of object */
-        }
-
-success:
-        input_buffer->depth--;
-
-        if (head != NULL) {
-                head->prev = current_item;
-        }
-
-        item->type = cJSON_Object;
-        item->child = head;
-
-        input_buffer->offset++;
-        return true;
-
-fail:
-        if (head != NULL) {
-                cJSON_Delete(head);
-        }
-
-        return false;
-}
-
-/* Render an object to text. */
-static cJSON_bool print_object(const cJSON *const item,
-                               printbuffer *const output_buffer) {
-        unsigned char *output_pointer = NULL;
-        size_t length = 0;
-        cJSON *current_item = item->child;
-
-        if (output_buffer == NULL) {
-                return false;
-        }
-
-        /* Compose the output: */
-        length = (size_t)(output_buffer->format ? 2 : 1); /* fmt: {\n */
-        output_pointer = ensure(output_buffer, length + 1);
-        if (output_pointer == NULL) {
-                return false;
-        }
-
-        *output_pointer++ = '{';
-        output_buffer->depth++;
-        if (output_buffer->format) {
-                *output_pointer++ = '\n';
-        }
-        output_buffer->offset += length;
-
-        while (current_item) {
-                if (output_buffer->format) {
-                        size_t i;
-                        output_pointer =
-                            ensure(output_buffer, output_buffer->depth);
-                        if (output_pointer == NULL) {
-                                return false;
-                        }
-                        for (i = 0; i < output_buffer->depth; i++) {
-                                *output_pointer++ = '\t';
-                        }
-                        output_buffer->offset += output_buffer->depth;
-                }
-
-                /* print key */
-                if (!print_string_ptr((unsigned char *)current_item->string,
-                                      output_buffer)) {
-                        return false;
-                }
-                update_offset(output_buffer);
-
-                length = (size_t)(output_buffer->format ? 2 : 1);
-                output_pointer = ensure(output_buffer, length);
-                if (output_pointer == NULL) {
-                        return false;
-                }
-                *output_pointer++ = ':';
-                if (output_buffer->format) {
-                        *output_pointer++ = '\t';
-                }
-                output_buffer->offset += length;
-
-                /* print value */
-                if (!print_value(current_item, output_buffer)) {
-                        return false;
-                }
-                update_offset(output_buffer);
-
-                /* print comma if not last */
-                length = ((size_t)(output_buffer->format ? 1 : 0) +
-                          (size_t)(current_item->next ? 1 : 0));
-                output_pointer = ensure(output_buffer, length + 1);
-                if (output_pointer == NULL) {
-                        return false;
-                }
-                if (current_item->next) {
-                        *output_pointer++ = ',';
-                }
-
-                if (output_buffer->format) {
-                        *output_pointer++ = '\n';
-                }
-                *output_pointer = '\0';
-                output_buffer->offset += length;
-
-                current_item = current_item->next;
-        }
-
-        output_pointer =
-            ensure(output_buffer,
-                   output_buffer->format ? (output_buffer->depth + 1) : 2);
-        if (output_pointer == NULL) {
-                return false;
-        }
-        if (output_buffer->format) {
-                size_t i;
-                for (i = 0; i < (output_buffer->depth - 1); i++) {
-                        *output_pointer++ = '\t';
-                }
-        }
-        *output_pointer++ = '}';
-        *output_pointer = '\0';
-        output_buffer->depth--;
-
-        return true;
-}
-
-/* Get Array size/item / object item. */
-CJSON_PUBLIC(int) cJSON_GetArraySize(const cJSON *array) {
-        cJSON *child = NULL;
-        size_t size = 0;
-
-        if (array == NULL) {
-                return 0;
-        }
-
-        child = array->child;
-
-        while (child != NULL) {
-                size++;
-                child = child->next;
-        }
-
-        /* FIXME: Can overflow here. Cannot be fixed without breaking the API */
-
-        return (int)size;
-}
-
-static cJSON *get_array_item(const cJSON *array, size_t index) {
-        cJSON *current_child = NULL;
-
-        if (array == NULL) {
-                return NULL;
-        }
-
-        current_child = array->child;
-        while ((current_child != NULL) && (index > 0)) {
-                index--;
-                current_child = current_child->next;
-        }
-
-        return current_child;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int index) {
-        if (index < 0) {
-                return NULL;
-        }
-
-        return get_array_item(array, (size_t)index);
-}
-
-static cJSON *get_object_item(const cJSON *const object,
-                              const char *const name,
-                              const cJSON_bool case_sensitive) {
-        cJSON *current_element = NULL;
-
-        if ((object == NULL) || (name == NULL)) {
-                return NULL;
-        }
-
-        current_element = object->child;
-        if (case_sensitive) {
-                while ((current_element != NULL) &&
-                       (current_element->string != NULL) &&
-                       (strcmp(name, current_element->string) != 0)) {
-                        current_element = current_element->next;
-                }
-        } else {
-                while ((current_element != NULL) &&
-                       (case_insensitive_strcmp(
-                            (const unsigned char *)name,
-                            (const unsigned char *)(current_element->string)) !=
-                        0)) {
-                        current_element = current_element->next;
-                }
-        }
-
-        if ((current_element == NULL) || (current_element->string == NULL)) {
-                return NULL;
-        }
-
-        return current_element;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_GetObjectItem(const cJSON *const object, const char *const string) {
-        return get_object_item(object, string, false);
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_GetObjectItemCaseSensitive(const cJSON *const object,
-                                 const char *const string) {
-        return get_object_item(object, string, true);
-}
-
-CJSON_PUBLIC(cJSON_bool)
-cJSON_HasObjectItem(const cJSON *object, const char *string) {
-        return cJSON_GetObjectItem(object, string) ? 1 : 0;
-}
-
-/* Utility for array list handling. */
-static void suffix_object(cJSON *prev, cJSON *item) {
-        prev->next = item;
-        item->prev = prev;
-}
-
-/* Utility for handling references. */
-static cJSON *create_reference(const cJSON *item,
-                               const internal_hooks *const hooks) {
-        cJSON *reference = NULL;
-        if (item == NULL) {
-                return NULL;
-        }
-
-        reference = cJSON_New_Item(hooks);
-        if (reference == NULL) {
-                return NULL;
-        }
-
-        memcpy(reference, item, sizeof(cJSON));
-        reference->string = NULL;
-        reference->type |= cJSON_IsReference;
-        reference->next = reference->prev = NULL;
-        return reference;
-}
-
-static cJSON_bool add_item_to_array(cJSON *array, cJSON *item) {
-        cJSON *child = NULL;
-
-        if ((item == NULL) || (array == NULL) || (array == item)) {
-                return false;
-        }
-
-        child = array->child;
-        /*
-         * To find the last item in array quickly, we use prev in array
-         */
-        if (child == NULL) {
-                /* list is empty, start new one */
-                array->child = item;
-                item->prev = item;
-                item->next = NULL;
-        } else {
-                /* append to the end */
-                if (child->prev) {
-                        suffix_object(child->prev, item);
-                        array->child->prev = item;
-                }
-        }
-
-        return true;
-}
-
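add_item_to_array above relies on this copy's list invariant: the first child's prev pointer always references the last element, so appends need no list walk. A short usage sketch against the public API (values are illustrative):

    #include "cJSON.h"

    int main(void) {
            cJSON *array = cJSON_CreateArray();
            cJSON *item = cJSON_CreateNumber(42);
            if (array != NULL && item != NULL) {
                    /* appended via the head->prev back-pointer, O(1) */
                    cJSON_AddItemToArray(array, item);
            }
            cJSON_Delete(array); /* frees any attached items too */
            return 0;
    }
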
-/* Add item to array/object. */
-CJSON_PUBLIC(cJSON_bool) cJSON_AddItemToArray(cJSON *array, cJSON *item) {
-        return add_item_to_array(array, item);
-}
-
-#if defined(__clang__) || \
-    (defined(__GNUC__) && \
-     ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 5))))
-#pragma GCC diagnostic push
-#endif
-#ifdef __GNUC__
-#pragma GCC diagnostic ignored "-Wcast-qual"
-#endif
-/* helper function to cast away const */
-static void *cast_away_const(const void *string) {
-        return (void *)string;
-}
-#if defined(__clang__) || \
-    (defined(__GNUC__) && \
-     ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 5))))
-#pragma GCC diagnostic pop
-#endif
-
-
-static cJSON_bool add_item_to_object(cJSON *const object,
-                                     const char *const string,
-                                     cJSON *const item,
-                                     const internal_hooks *const hooks,
-                                     const cJSON_bool constant_key) {
-        char *new_key = NULL;
-        int new_type = cJSON_Invalid;
-
-        if ((object == NULL) || (string == NULL) || (item == NULL) ||
-            (object == item)) {
-                return false;
-        }
-
-        if (constant_key) {
-                new_key = (char *)cast_away_const(string);
-                new_type = item->type | cJSON_StringIsConst;
-        } else {
-                new_key =
-                    (char *)cJSON_strdup((const unsigned char *)string, hooks);
-                if (new_key == NULL) {
-                        return false;
-                }
-
-                new_type = item->type & ~cJSON_StringIsConst;
-        }
-
-        if (!(item->type & cJSON_StringIsConst) && (item->string != NULL)) {
-                hooks->deallocate(item->string);
-        }
-
-        item->string = new_key;
-        item->type = new_type;
-
-        return add_item_to_array(object, item);
-}
-
-CJSON_PUBLIC(cJSON_bool)
-cJSON_AddItemToObject(cJSON *object, const char *string, cJSON *item) {
-        return add_item_to_object(object, string, item, &global_hooks, false);
-}
-
-/* Add an item to an object with constant string as key */
-CJSON_PUBLIC(cJSON_bool)
-cJSON_AddItemToObjectCS(cJSON *object, const char *string, cJSON *item) {
-        return add_item_to_object(object, string, item, &global_hooks, true);
-}
-
-CJSON_PUBLIC(cJSON_bool)
-cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item) {
-        if (array == NULL) {
-                return false;
-        }
-
-        return add_item_to_array(array, create_reference(item, &global_hooks));
-}
-
-CJSON_PUBLIC(cJSON_bool)
-cJSON_AddItemReferenceToObject(cJSON *object, const char *string, cJSON *item) {
-        if ((object == NULL) || (string == NULL)) {
-                return false;
-        }
-
-        return add_item_to_object(object, string,
-                                  create_reference(item, &global_hooks),
-                                  &global_hooks, false);
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_AddNullToObject(cJSON *const object, const char *const name) {
-        cJSON *null = cJSON_CreateNull();
-        if (add_item_to_object(object, name, null, &global_hooks, false)) {
-                return null;
-        }
-
-        cJSON_Delete(null);
-        return NULL;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_AddTrueToObject(cJSON *const object, const char *const name) {
-        cJSON *true_item = cJSON_CreateTrue();
-        if (add_item_to_object(object, name, true_item, &global_hooks, false)) {
-                return true_item;
-        }
-
-        cJSON_Delete(true_item);
-        return NULL;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_AddFalseToObject(cJSON *const object, const char *const name) {
-        cJSON *false_item = cJSON_CreateFalse();
-        if (add_item_to_object(object, name, false_item, &global_hooks,
-                               false)) {
-                return false_item;
-        }
-
-        cJSON_Delete(false_item);
-        return NULL;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_AddBoolToObject(cJSON *const object,
-                      const char *const name,
-                      const cJSON_bool boolean) {
-        cJSON *bool_item = cJSON_CreateBool(boolean);
-        if (add_item_to_object(object, name, bool_item, &global_hooks, false)) {
-                return bool_item;
-        }
-
-        cJSON_Delete(bool_item);
-        return NULL;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_AddNumberToObject(cJSON *const object,
-                        const char *const name,
-                        const double number) {
-        cJSON *number_item = cJSON_CreateNumber(number);
-        if (add_item_to_object(object, name, number_item, &global_hooks,
-                               false)) {
-                return number_item;
-        }
-
-        cJSON_Delete(number_item);
-        return NULL;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_AddStringToObject(cJSON *const object,
-                        const char *const name,
-                        const char *const string) {
-        cJSON *string_item = cJSON_CreateString(string);
-        if (add_item_to_object(object, name, string_item, &global_hooks,
-                               false)) {
-                return string_item;
-        }
-
-        cJSON_Delete(string_item);
-        return NULL;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_AddRawToObject(cJSON *const object,
-                     const char *const name,
-                     const char *const raw) {
-        cJSON *raw_item = cJSON_CreateRaw(raw);
-        if (add_item_to_object(object, name, raw_item, &global_hooks, false)) {
-                return raw_item;
-        }
-
-        cJSON_Delete(raw_item);
-        return NULL;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_AddObjectToObject(cJSON *const object, const char *const name) {
-        cJSON *object_item = cJSON_CreateObject();
-        if (add_item_to_object(object, name, object_item, &global_hooks,
-                               false)) {
-                return object_item;
-        }
-
-        cJSON_Delete(object_item);
-        return NULL;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_AddArrayToObject(cJSON *const object, const char *const name) {
-        cJSON *array = cJSON_CreateArray();
-        if (add_item_to_object(object, name, array, &global_hooks, false)) {
-                return array;
-        }
-
-        cJSON_Delete(array);
-        return NULL;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_DetachItemViaPointer(cJSON *parent, cJSON *const item) {
-        if ((parent == NULL) || (item == NULL)) {
-                return NULL;
-        }
-
-        if (item != parent->child) {
-                /* not the first element */
-                item->prev->next = item->next;
-        }
-        if (item->next != NULL) {
-                /* not the last element */
-                item->next->prev = item->prev;
-        }
-
-        if (item == parent->child) {
-                /* first element */
-                parent->child = item->next;
-        } else if (item->next == NULL) {
-                /* last element */
-                parent->child->prev = item->prev;
-        }
-
-        /* make sure the detached item doesn't point anywhere anymore */
-        item->prev = NULL;
-        item->next = NULL;
-
-        return item;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromArray(cJSON *array, int which) {
-        if (which < 0) {
-                return NULL;
-        }
-
-        return cJSON_DetachItemViaPointer(array,
-                                          get_array_item(array, (size_t)which));
-}
-
-CJSON_PUBLIC(void) cJSON_DeleteItemFromArray(cJSON *array, int which) {
-        cJSON_Delete(cJSON_DetachItemFromArray(array, which));
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_DetachItemFromObject(cJSON *object, const char *string) {
-        cJSON *to_detach = cJSON_GetObjectItem(object, string);
-
-        return cJSON_DetachItemViaPointer(object, to_detach);
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_DetachItemFromObjectCaseSensitive(cJSON *object, const char *string) {
-        cJSON *to_detach = cJSON_GetObjectItemCaseSensitive(object, string);
-
-        return cJSON_DetachItemViaPointer(object, to_detach);
-}
-
-CJSON_PUBLIC(void)
-cJSON_DeleteItemFromObject(cJSON *object, const char *string) {
-        cJSON_Delete(cJSON_DetachItemFromObject(object, string));
-}
-
-CJSON_PUBLIC(void)
-cJSON_DeleteItemFromObjectCaseSensitive(cJSON *object, const char *string) {
-        cJSON_Delete(cJSON_DetachItemFromObjectCaseSensitive(object, string));
-}
-
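The cJSON_Add*ToObject helpers above each create, attach, and return the new item, or clean up and return NULL on failure. A minimal object-building sketch using those helpers (key names and values are illustrative only):

    #include <stdio.h>
    #include "cJSON.h"

    int main(void) {
            cJSON *cfg = cJSON_CreateObject();
            char *json = NULL;
            if (cfg == NULL) {
                    return 1;
            }
            /* each helper returns the new item, or NULL on failure */
            cJSON_AddStringToObject(cfg, "client.id", "rdkafka");
            cJSON_AddNumberToObject(cfg, "retries", 3);
            cJSON_AddBoolToObject(cfg, "enable.idempotence", 1);
            json = cJSON_PrintUnformatted(cfg);
            if (json != NULL) {
                    puts(json);
                    cJSON_free(json);
            }
            cJSON_Delete(cfg);
            return 0;
    }
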
-/* Replace array/object items with new ones. */
-CJSON_PUBLIC(cJSON_bool)
-cJSON_InsertItemInArray(cJSON *array, int which, cJSON *newitem) {
-        cJSON *after_inserted = NULL;
-
-        if (which < 0) {
-                return false;
-        }
-
-        after_inserted = get_array_item(array, (size_t)which);
-        if (after_inserted == NULL) {
-                return add_item_to_array(array, newitem);
-        }
-
-        newitem->next = after_inserted;
-        newitem->prev = after_inserted->prev;
-        after_inserted->prev = newitem;
-        if (after_inserted == array->child) {
-                array->child = newitem;
-        } else {
-                newitem->prev->next = newitem;
-        }
-        return true;
-}
-
-CJSON_PUBLIC(cJSON_bool)
-cJSON_ReplaceItemViaPointer(cJSON *const parent,
-                            cJSON *const item,
-                            cJSON *replacement) {
-        if ((parent == NULL) || (replacement == NULL) || (item == NULL)) {
-                return false;
-        }
-
-        if (replacement == item) {
-                return true;
-        }
-
-        replacement->next = item->next;
-        replacement->prev = item->prev;
-
-        if (replacement->next != NULL) {
-                replacement->next->prev = replacement;
-        }
-        if (parent->child == item) {
-                if (parent->child->prev == parent->child) {
-                        replacement->prev = replacement;
-                }
-                parent->child = replacement;
-        } else { /*
-                  * To find the last item in array quickly, we use prev in
-                  * array. We can't modify the last item's next pointer where
-                  * this item was the parent's child
-                  */
-                if (replacement->prev != NULL) {
-                        replacement->prev->next = replacement;
-                }
-                if (replacement->next == NULL) {
-                        parent->child->prev = replacement;
-                }
-        }
-
-        item->next = NULL;
-        item->prev = NULL;
-        cJSON_Delete(item);
-
-        return true;
-}
-
-CJSON_PUBLIC(cJSON_bool)
-cJSON_ReplaceItemInArray(cJSON *array, int which, cJSON *newitem) {
-        if (which < 0) {
-                return false;
-        }
-
-        return cJSON_ReplaceItemViaPointer(
-            array, get_array_item(array, (size_t)which), newitem);
-}
-
-static cJSON_bool replace_item_in_object(cJSON *object,
-                                         const char *string,
-                                         cJSON *replacement,
-                                         cJSON_bool case_sensitive) {
-        if ((replacement == NULL) || (string == NULL)) {
-                return false;
-        }
-
-        /* replace the name in the replacement */
-        if (!(replacement->type & cJSON_StringIsConst) &&
-            (replacement->string != NULL)) {
-                cJSON_free(replacement->string);
-        }
-        replacement->string =
-            (char *)cJSON_strdup((const unsigned char *)string, &global_hooks);
-        replacement->type &= ~cJSON_StringIsConst;
-
-        return cJSON_ReplaceItemViaPointer(
-            object, get_object_item(object, string, case_sensitive),
-            replacement);
-}
-
-CJSON_PUBLIC(cJSON_bool)
-cJSON_ReplaceItemInObject(cJSON *object, const char *string, cJSON *newitem) {
-        return replace_item_in_object(object, string, newitem, false);
-}
-
-CJSON_PUBLIC(cJSON_bool)
-cJSON_ReplaceItemInObjectCaseSensitive(cJSON *object,
-                                       const char *string,
-                                       cJSON *newitem) {
-        return replace_item_in_object(object, string, newitem, true);
-}
-
-/* Create basic types: */
-CJSON_PUBLIC(cJSON *) cJSON_CreateNull(void) {
-        cJSON *item = cJSON_New_Item(&global_hooks);
-        if (item) {
-                item->type = cJSON_NULL;
-        }
-
-        return item;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_CreateTrue(void) {
-        cJSON *item = cJSON_New_Item(&global_hooks);
-        if (item) {
-                item->type = cJSON_True;
-        }
-
-        return item;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_CreateFalse(void) {
-        cJSON *item = cJSON_New_Item(&global_hooks);
-        if (item) {
-                item->type = cJSON_False;
-        }
-
-        return item;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_CreateBool(cJSON_bool boolean) {
-        cJSON *item = cJSON_New_Item(&global_hooks);
-        if (item) {
-                item->type = boolean ? cJSON_True : cJSON_False;
-        }
-
-        return item;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_CreateNumber(double num) {
-        cJSON *item = cJSON_New_Item(&global_hooks);
-        if (item) {
-                item->type = cJSON_Number;
-                item->valuedouble = num;
-
-                /* use saturation in case of overflow */
-                if (num >= INT_MAX) {
-                        item->valueint = INT_MAX;
-                } else if (num <= (double)INT_MIN) {
-                        item->valueint = INT_MIN;
-                } else {
-                        item->valueint = (int)num;
-                }
-        }
-
-        return item;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_CreateString(const char *string) {
-        cJSON *item = cJSON_New_Item(&global_hooks);
-        if (item) {
-                item->type = cJSON_String;
-                item->valuestring = (char *)cJSON_strdup(
-                    (const unsigned char *)string, &global_hooks);
-                if (!item->valuestring) {
-                        cJSON_Delete(item);
-                        return NULL;
-                }
-        }
-
-        return item;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_CreateStringReference(const char *string) {
-        cJSON *item = cJSON_New_Item(&global_hooks);
-        if (item != NULL) {
-                item->type = cJSON_String | cJSON_IsReference;
-                item->valuestring = (char *)cast_away_const(string);
-        }
-
-        return item;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_CreateObjectReference(const cJSON *child) {
-        cJSON *item = cJSON_New_Item(&global_hooks);
-        if (item != NULL) {
-                item->type = cJSON_Object | cJSON_IsReference;
-                item->child = (cJSON *)cast_away_const(child);
-        }
-
-        return item;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_CreateArrayReference(const cJSON *child) {
-        cJSON *item = cJSON_New_Item(&global_hooks);
-        if (item != NULL) {
-                item->type = cJSON_Array | cJSON_IsReference;
-                item->child = (cJSON *)cast_away_const(child);
-        }
-
-        return item;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_CreateRaw(const char *raw) {
-        cJSON *item = cJSON_New_Item(&global_hooks);
-        if (item) {
-                item->type = cJSON_Raw;
-                item->valuestring = (char *)cJSON_strdup(
-                    (const unsigned char *)raw, &global_hooks);
-                if (!item->valuestring) {
-                        cJSON_Delete(item);
-                        return NULL;
-                }
-        }
-
-        return item;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_CreateArray(void) {
-        cJSON *item = cJSON_New_Item(&global_hooks);
-        if (item) {
-                item->type = cJSON_Array;
-        }
-
-        return item;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_CreateObject(void) {
-        cJSON *item = cJSON_New_Item(&global_hooks);
-        if (item) {
-                item->type = cJSON_Object;
-        }
-
-        return item;
-}
-
-/* Create Arrays: */
-CJSON_PUBLIC(cJSON *) cJSON_CreateIntArray(const int *numbers, int count) {
-        size_t i = 0;
-        cJSON *n = NULL;
-        cJSON *p = NULL;
-        cJSON *a = NULL;
-
-        if ((count < 0) || (numbers == NULL)) {
-                return NULL;
-        }
-
-        a = cJSON_CreateArray();
-        for (i = 0; a && (i < (size_t)count); i++) {
-                n = cJSON_CreateNumber(numbers[i]);
-                if (!n) {
-                        cJSON_Delete(a);
-                        return NULL;
-                }
-                if (!i) {
-                        a->child = n;
-                } else {
-                        suffix_object(p, n);
-                }
-                p = n;
-        }
-        a->child->prev = n;
-
-        return a;
-}
-
-CJSON_PUBLIC(cJSON *) cJSON_CreateFloatArray(const float *numbers, int count) {
-        size_t i = 0;
-        cJSON *n = NULL;
-        cJSON *p = NULL;
-        cJSON *a = NULL;
-
-        if ((count < 0) || (numbers == NULL)) {
-                return NULL;
-        }
-
-        a = cJSON_CreateArray();
-
-        for (i = 0; a && (i < (size_t)count); i++) {
-                n = cJSON_CreateNumber((double)numbers[i]);
-                if (!n) {
-                        cJSON_Delete(a);
-                        return NULL;
-                }
-                if (!i) {
-                        a->child = n;
-                } else {
-                        suffix_object(p, n);
-                }
-                p = n;
-        }
-        a->child->prev = n;
-
-        return a;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_CreateDoubleArray(const double *numbers, int count) {
-        size_t i = 0;
-        cJSON *n = NULL;
-        cJSON *p = NULL;
-        cJSON *a = NULL;
-
-        if ((count < 0) || (numbers == NULL)) {
-                return NULL;
-        }
-
-        a = cJSON_CreateArray();
-
-        for (i = 0; a && (i < (size_t)count); i++) {
-                n = cJSON_CreateNumber(numbers[i]);
-                if (!n) {
-                        cJSON_Delete(a);
-                        return NULL;
-                }
-                if (!i) {
-                        a->child = n;
-                } else {
-                        suffix_object(p, n);
-                }
-                p = n;
-        }
-        a->child->prev = n;
-
-        return a;
-}
-
-CJSON_PUBLIC(cJSON *)
-cJSON_CreateStringArray(const char *const *strings, int count) {
-        size_t i = 0;
-        cJSON *n = NULL;
-        cJSON *p = NULL;
-        cJSON *a = NULL;
-
-        if ((count < 0) || (strings == NULL)) {
-                return NULL;
-        }
-
-        a = cJSON_CreateArray();
-
-        for (i = 0; a && (i < (size_t)count); i++) {
-                n = cJSON_CreateString(strings[i]);
-                if (!n) {
-                        cJSON_Delete(a);
-                        return NULL;
-                }
-                if (!i) {
-                        a->child = n;
-                } else {
-                        suffix_object(p, n);
-                }
-                p = n;
-        }
-        a->child->prev = n;
-
-        return a;
-}
-
-/* Duplication */
-CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cJSON_bool recurse) {
-        cJSON *newitem = NULL;
-        cJSON *child = NULL;
-        cJSON *next = NULL;
-        cJSON *newchild = NULL;
-
-        /* Bail on bad ptr */
-        if (!item) {
-                goto fail;
-        }
-        /* Create new item */
-        newitem = cJSON_New_Item(&global_hooks);
-        if (!newitem) {
-                goto fail;
-        }
-        /* Copy over all vars */
-        newitem->type = item->type & (~cJSON_IsReference);
-        newitem->valueint = item->valueint;
-        newitem->valuedouble = item->valuedouble;
-        if (item->valuestring) {
-                newitem->valuestring = (char *)cJSON_strdup(
-                    (unsigned char *)item->valuestring, &global_hooks);
-                if (!newitem->valuestring) {
-                        goto fail;
-                }
-        }
-        if (item->string) {
-                newitem->string =
-                    (item->type & cJSON_StringIsConst)
-                        ? item->string
-                        : (char *)cJSON_strdup((unsigned char *)item->string,
-                                               &global_hooks);
-                if (!newitem->string) {
-                        goto fail;
-                }
-        }
-        /* If non-recursive, then we're done! */
-        if (!recurse) {
-                return newitem;
-        }
-        /* Walk the ->next chain for the child. */
-        child = item->child;
-        while (child != NULL) {
-                newchild = cJSON_Duplicate(
-                    child, true); /* Duplicate (with recurse) each item in the
-                                     ->next chain */
-                if (!newchild) {
-                        goto fail;
-                }
-                if (next != NULL) {
-                        /* If newitem->child already set, then crosswire ->prev
-                         * and ->next and move on */
-                        next->next = newchild;
-                        newchild->prev = next;
-                        next = newchild;
-                } else {
-                        /* Set newitem->child and move to it */
-                        newitem->child = newchild;
-                        next = newchild;
-                }
-                child = child->next;
-        }
-        if (newitem && newitem->child) {
-                newitem->child->prev = newchild;
-        }
-
-        return newitem;
-
-fail:
-        if (newitem != NULL) {
-                cJSON_Delete(newitem);
-        }
-
-        return NULL;
-}
-
-static void skip_oneline_comment(char **input) {
-        *input += static_strlen("//");
-
-        for (; (*input)[0] != '\0'; ++(*input)) {
-                if ((*input)[0] == '\n') {
-                        *input += static_strlen("\n");
-                        return;
-                }
-        }
-}
-
-static void skip_multiline_comment(char **input) {
-        *input += static_strlen("/*");
-
-        for (; (*input)[0] != '\0'; ++(*input)) {
-                if (((*input)[0] == '*') && ((*input)[1] == '/')) {
-                        *input += static_strlen("*/");
-                        return;
-                }
-        }
-}
-
-static void minify_string(char **input, char **output) {
-        (*output)[0] = (*input)[0];
-        *input += static_strlen("\"");
-        *output += static_strlen("\"");
-
-
-        for (; (*input)[0] != '\0'; (void)++(*input), ++(*output)) {
-                (*output)[0] = (*input)[0];
-
-                if ((*input)[0] == '\"') {
-                        (*output)[0] = '\"';
-                        *input += static_strlen("\"");
-                        *output += static_strlen("\"");
-                        return;
-                } else if (((*input)[0] == '\\') && ((*input)[1] == '\"')) {
-                        (*output)[1] = (*input)[1];
-                        *input += static_strlen("\"");
-                        *output += static_strlen("\"");
-                }
-        }
-}
-
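cJSON_Minify, which follows, strips whitespace and comments from a JSON buffer in place, reusing the skip/minify helpers above. A short usage sketch (the input literal is illustrative; the buffer must be writable):

    #include <stdio.h>
    #include "cJSON.h"

    int main(void) {
            /* cJSON_Minify edits the buffer in place, so use an array */
            char json[] = "{ /* comment */ \"a\" : 1 }";
            cJSON_Minify(json);
            puts(json); /* prints {"a":1} */
            return 0;
    }
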
*/ - *into = '\0'; -} - -CJSON_PUBLIC(cJSON_bool) cJSON_IsInvalid(const cJSON *const item) { - if (item == NULL) { - return false; - } - - return (item->type & 0xFF) == cJSON_Invalid; -} - -CJSON_PUBLIC(cJSON_bool) cJSON_IsFalse(const cJSON *const item) { - if (item == NULL) { - return false; - } - - return (item->type & 0xFF) == cJSON_False; -} - -CJSON_PUBLIC(cJSON_bool) cJSON_IsTrue(const cJSON *const item) { - if (item == NULL) { - return false; - } - - return (item->type & 0xff) == cJSON_True; -} - - -CJSON_PUBLIC(cJSON_bool) cJSON_IsBool(const cJSON *const item) { - if (item == NULL) { - return false; - } - - return (item->type & (cJSON_True | cJSON_False)) != 0; -} -CJSON_PUBLIC(cJSON_bool) cJSON_IsNull(const cJSON *const item) { - if (item == NULL) { - return false; - } - - return (item->type & 0xFF) == cJSON_NULL; -} - -CJSON_PUBLIC(cJSON_bool) cJSON_IsNumber(const cJSON *const item) { - if (item == NULL) { - return false; - } - - return (item->type & 0xFF) == cJSON_Number; -} - -CJSON_PUBLIC(cJSON_bool) cJSON_IsString(const cJSON *const item) { - if (item == NULL) { - return false; - } - - return (item->type & 0xFF) == cJSON_String; -} - -CJSON_PUBLIC(cJSON_bool) cJSON_IsArray(const cJSON *const item) { - if (item == NULL) { - return false; - } - - return (item->type & 0xFF) == cJSON_Array; -} - -CJSON_PUBLIC(cJSON_bool) cJSON_IsObject(const cJSON *const item) { - if (item == NULL) { - return false; - } - - return (item->type & 0xFF) == cJSON_Object; -} - -CJSON_PUBLIC(cJSON_bool) cJSON_IsRaw(const cJSON *const item) { - if (item == NULL) { - return false; - } - - return (item->type & 0xFF) == cJSON_Raw; -} - -CJSON_PUBLIC(cJSON_bool) -cJSON_Compare(const cJSON *const a, - const cJSON *const b, - const cJSON_bool case_sensitive) { - if ((a == NULL) || (b == NULL) || - ((a->type & 0xFF) != (b->type & 0xFF)) || cJSON_IsInvalid(a)) { - return false; - } - - /* check if type is valid */ - switch (a->type & 0xFF) { - case cJSON_False: - case cJSON_True: - case cJSON_NULL: - case cJSON_Number: - case cJSON_String: - case cJSON_Raw: - case cJSON_Array: - case cJSON_Object: - break; - - default: - return false; - } - - /* identical objects are equal */ - if (a == b) { - return true; - } - - switch (a->type & 0xFF) { - /* in these cases and equal type is enough */ - case cJSON_False: - case cJSON_True: - case cJSON_NULL: - return true; - - case cJSON_Number: - if (compare_double(a->valuedouble, b->valuedouble)) { - return true; - } - return false; - - case cJSON_String: - case cJSON_Raw: - if ((a->valuestring == NULL) || (b->valuestring == NULL)) { - return false; - } - if (strcmp(a->valuestring, b->valuestring) == 0) { - return true; - } - - return false; - - case cJSON_Array: { - cJSON *a_element = a->child; - cJSON *b_element = b->child; - - for (; (a_element != NULL) && (b_element != NULL);) { - if (!cJSON_Compare(a_element, b_element, - case_sensitive)) { - return false; - } - - a_element = a_element->next; - b_element = b_element->next; - } - - /* one of the arrays is longer than the other */ - if (a_element != b_element) { - return false; - } - - return true; - } - - case cJSON_Object: { - cJSON *a_element = NULL; - cJSON *b_element = NULL; - cJSON_ArrayForEach(a_element, a) { - /* TODO This has O(n^2) runtime, which is horrible! 
*/ - b_element = get_object_item(b, a_element->string, - case_sensitive); - if (b_element == NULL) { - return false; - } - - if (!cJSON_Compare(a_element, b_element, - case_sensitive)) { - return false; - } - } - - /* doing this twice, once on a and b to prevent true comparison - * if a subset of b - * TODO: Do this the proper way, this is just a fix for now */ - cJSON_ArrayForEach(b_element, b) { - a_element = get_object_item(a, b_element->string, - case_sensitive); - if (a_element == NULL) { - return false; - } - - if (!cJSON_Compare(b_element, a_element, - case_sensitive)) { - return false; - } - } - - return true; - } - - default: - return false; - } -} - -CJSON_PUBLIC(void *) cJSON_malloc(size_t size) { - return global_hooks.allocate(size); -} - -CJSON_PUBLIC(void) cJSON_free(void *object) { - global_hooks.deallocate(object); -} diff --git a/src/third_party/librdkafka/dist/src/cJSON.h b/src/third_party/librdkafka/dist/src/cJSON.h deleted file mode 100644 index 1b5655c7b64..00000000000 --- a/src/third_party/librdkafka/dist/src/cJSON.h +++ /dev/null @@ -1,398 +0,0 @@ -/* - Copyright (c) 2009-2017 Dave Gamble and cJSON contributors - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#ifndef cJSON__h -#define cJSON__h - -#ifdef __cplusplus -extern "C" { -#endif - -#if !defined(__WINDOWS__) && \ - (defined(WIN32) || defined(WIN64) || defined(_MSC_VER) || defined(_WIN32)) -#define __WINDOWS__ -#endif - -#ifdef __WINDOWS__ - -/* When compiling for windows, we specify a specific calling convention to avoid -issues where we are being called from a project with a different default calling -convention. 
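(For instance, a host application built with MSVC's /Gz switch defaults to __stdcall; without these explicit annotations its calls into a __cdecl-built cJSON DLL could corrupt the stack.)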
For windows you have 3 define options: - -CJSON_HIDE_SYMBOLS - Define this in the case where you don't want to ever -dllexport symbols CJSON_EXPORT_SYMBOLS - Define this on library build when you -want to dllexport symbols (default) CJSON_IMPORT_SYMBOLS - Define this if you -want to dllimport symbols - -For *nix builds that support visibility attribute, you can define similar -behavior by - -setting default visibility to hidden by adding --fvisibility=hidden (for gcc) -or --xldscope=hidden (for sun cc) -to CFLAGS - -then using the CJSON_API_VISIBILITY flag to "export" the same symbols the way -CJSON_EXPORT_SYMBOLS does - -*/ - -#define CJSON_CDECL __cdecl -#define CJSON_STDCALL __stdcall - -/* export symbols by default, this is necessary for copy-pasting the C and - * header file */ -#if !defined(CJSON_HIDE_SYMBOLS) && !defined(CJSON_IMPORT_SYMBOLS) && \ - !defined(CJSON_EXPORT_SYMBOLS) -#define CJSON_EXPORT_SYMBOLS -#endif - -#if defined(CJSON_HIDE_SYMBOLS) -#define CJSON_PUBLIC(type) type CJSON_STDCALL -#elif defined(CJSON_EXPORT_SYMBOLS) -#define CJSON_PUBLIC(type) __declspec(dllexport) type CJSON_STDCALL -#elif defined(CJSON_IMPORT_SYMBOLS) -#define CJSON_PUBLIC(type) __declspec(dllimport) type CJSON_STDCALL -#endif -#else /* !__WINDOWS__ */ -#define CJSON_CDECL -#define CJSON_STDCALL - -#if (defined(__GNUC__) || defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && \ - defined(CJSON_API_VISIBILITY) -#define CJSON_PUBLIC(type) __attribute__((visibility("default"))) type -#else -#define CJSON_PUBLIC(type) type -#endif -#endif - -/* project version */ -#define CJSON_VERSION_MAJOR 1 -#define CJSON_VERSION_MINOR 7 -#define CJSON_VERSION_PATCH 14 - -#include <stddef.h> - -/* cJSON Types: */ -#define cJSON_Invalid (0) -#define cJSON_False (1 << 0) -#define cJSON_True (1 << 1) -#define cJSON_NULL (1 << 2) -#define cJSON_Number (1 << 3) -#define cJSON_String (1 << 4) -#define cJSON_Array (1 << 5) -#define cJSON_Object (1 << 6) -#define cJSON_Raw (1 << 7) /* raw json */ - -#define cJSON_IsReference 256 -#define cJSON_StringIsConst 512 - -/* The cJSON structure: */ -typedef struct cJSON { - /* next/prev allow you to walk array/object chains. Alternatively, use - * GetArraySize/GetArrayItem/GetObjectItem */ - struct cJSON *next; - struct cJSON *prev; - /* An array or object item will have a child pointer pointing to a chain - * of the items in the array/object. */ - struct cJSON *child; - - /* The type of the item, as above. */ - int type; - - /* The item's string, if type==cJSON_String or type == cJSON_Raw */ - char *valuestring; - /* writing to valueint is DEPRECATED, use cJSON_SetNumberValue instead - */ - int valueint; - /* The item's number, if type==cJSON_Number */ - double valuedouble; - - /* The item's name string, if this item is the child of, or is in the - * list of subitems of an object. */ - char *string; -} cJSON; - -typedef struct cJSON_Hooks { - /* malloc/free are CDECL on Windows regardless of the default calling - * convention of the compiler, so ensure the hooks allow passing those - * functions directly. */ - void *(CJSON_CDECL *malloc_fn)(size_t sz); - void(CJSON_CDECL *free_fn)(void *ptr); -} cJSON_Hooks; - -typedef int cJSON_bool; - -/* Limits how deeply nested arrays/objects can be before cJSON refuses to parse - * them. This is to prevent stack overflows. 
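Since the guard below is #ifndef, the limit can be overridden at build time, for example

    cc -DCJSON_NESTING_LIMIT=200 -c cJSON.c

Every nesting level adds a stack frame to the recursive parser, so on small-stack targets the limit should be lowered rather than raised.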
*/ -#ifndef CJSON_NESTING_LIMIT -#define CJSON_NESTING_LIMIT 1000 -#endif - -/* returns the version of cJSON as a string */ -CJSON_PUBLIC(const char *) cJSON_Version(void); - -/* Supply malloc, realloc and free functions to cJSON */ -CJSON_PUBLIC(void) cJSON_InitHooks(cJSON_Hooks *hooks); - -/* Memory Management: the caller is always responsible to free the results from - * all variants of cJSON_Parse (with cJSON_Delete) and cJSON_Print (with stdlib - * free, cJSON_Hooks.free_fn, or cJSON_free as appropriate). The exception is - * cJSON_PrintPreallocated, where the caller has full responsibility of the - * buffer. */ -/* Supply a block of JSON, and this returns a cJSON object you can interrogate. - */ -CJSON_PUBLIC(cJSON *) cJSON_Parse(const char *value); -CJSON_PUBLIC(cJSON *) -cJSON_ParseWithLength(const char *value, size_t buffer_length); -/* ParseWithOpts allows you to require (and check) that the JSON is null - * terminated, and to retrieve the pointer to the final byte parsed. */ -/* If you supply a ptr in return_parse_end and parsing fails, then - * return_parse_end will contain a pointer to the error so will match - * cJSON_GetErrorPtr(). */ -CJSON_PUBLIC(cJSON *) -cJSON_ParseWithOpts(const char *value, - const char **return_parse_end, - cJSON_bool require_null_terminated); -CJSON_PUBLIC(cJSON *) -cJSON_ParseWithLengthOpts(const char *value, - size_t buffer_length, - const char **return_parse_end, - cJSON_bool require_null_terminated); - -/* Render a cJSON entity to text for transfer/storage. */ -CJSON_PUBLIC(char *) cJSON_Print(const cJSON *item); -/* Render a cJSON entity to text for transfer/storage without any formatting. */ -CJSON_PUBLIC(char *) cJSON_PrintUnformatted(const cJSON *item); -/* Render a cJSON entity to text using a buffered strategy. prebuffer is a guess - * at the final size. guessing well reduces reallocation. fmt=0 gives - * unformatted, =1 gives formatted */ -CJSON_PUBLIC(char *) -cJSON_PrintBuffered(const cJSON *item, int prebuffer, cJSON_bool fmt); -/* Render a cJSON entity to text using a buffer already allocated in memory with - * given length. Returns 1 on success and 0 on failure. */ -/* NOTE: cJSON is not always 100% accurate in estimating how much memory it will - * use, so to be safe allocate 5 bytes more than you actually need */ -CJSON_PUBLIC(cJSON_bool) -cJSON_PrintPreallocated(cJSON *item, - char *buffer, - const int length, - const cJSON_bool format); -/* Delete a cJSON entity and all subentities. */ -CJSON_PUBLIC(void) cJSON_Delete(cJSON *item); - -/* Returns the number of items in an array (or object). */ -CJSON_PUBLIC(int) cJSON_GetArraySize(const cJSON *array); -/* Retrieve item number "index" from array "array". Returns NULL if - * unsuccessful. */ -CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int index); -/* Get item "string" from object. Case insensitive. */ -CJSON_PUBLIC(cJSON *) -cJSON_GetObjectItem(const cJSON *const object, const char *const string); -CJSON_PUBLIC(cJSON *) -cJSON_GetObjectItemCaseSensitive(const cJSON *const object, - const char *const string); -CJSON_PUBLIC(cJSON_bool) -cJSON_HasObjectItem(const cJSON *object, const char *string); -/* For analysing failed parses. This returns a pointer to the parse error. - * You'll probably need to look a few chars back to make sense of it. Defined - * when cJSON_Parse() returns 0. 0 when cJSON_Parse() succeeds. 
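A typical error-handling sketch:

    cJSON *doc = cJSON_Parse(text);
    if (doc == NULL) {
        const char *bad = cJSON_GetErrorPtr();
        fprintf(stderr, "parse failed near: %.20s\n", bad ? bad : "");
    }

The error pointer is kept in a global, so it is not reliable when several threads parse concurrently.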
*/ -CJSON_PUBLIC(const char *) cJSON_GetErrorPtr(void); - -/* Check item type and return its value */ -CJSON_PUBLIC(char *) cJSON_GetStringValue(const cJSON *const item); -CJSON_PUBLIC(double) cJSON_GetNumberValue(const cJSON *const item); - -/* These functions check the type of an item */ -CJSON_PUBLIC(cJSON_bool) cJSON_IsInvalid(const cJSON *const item); -CJSON_PUBLIC(cJSON_bool) cJSON_IsFalse(const cJSON *const item); -CJSON_PUBLIC(cJSON_bool) cJSON_IsTrue(const cJSON *const item); -CJSON_PUBLIC(cJSON_bool) cJSON_IsBool(const cJSON *const item); -CJSON_PUBLIC(cJSON_bool) cJSON_IsNull(const cJSON *const item); -CJSON_PUBLIC(cJSON_bool) cJSON_IsNumber(const cJSON *const item); -CJSON_PUBLIC(cJSON_bool) cJSON_IsString(const cJSON *const item); -CJSON_PUBLIC(cJSON_bool) cJSON_IsArray(const cJSON *const item); -CJSON_PUBLIC(cJSON_bool) cJSON_IsObject(const cJSON *const item); -CJSON_PUBLIC(cJSON_bool) cJSON_IsRaw(const cJSON *const item); - -/* These calls create a cJSON item of the appropriate type. */ -CJSON_PUBLIC(cJSON *) cJSON_CreateNull(void); -CJSON_PUBLIC(cJSON *) cJSON_CreateTrue(void); -CJSON_PUBLIC(cJSON *) cJSON_CreateFalse(void); -CJSON_PUBLIC(cJSON *) cJSON_CreateBool(cJSON_bool boolean); -CJSON_PUBLIC(cJSON *) cJSON_CreateNumber(double num); -CJSON_PUBLIC(cJSON *) cJSON_CreateString(const char *string); -/* raw json */ -CJSON_PUBLIC(cJSON *) cJSON_CreateRaw(const char *raw); -CJSON_PUBLIC(cJSON *) cJSON_CreateArray(void); -CJSON_PUBLIC(cJSON *) cJSON_CreateObject(void); - -/* Create a string where valuestring references a string so - * it will not be freed by cJSON_Delete */ -CJSON_PUBLIC(cJSON *) cJSON_CreateStringReference(const char *string); -/* Create an object/array that only references its elements so - * they will not be freed by cJSON_Delete */ -CJSON_PUBLIC(cJSON *) cJSON_CreateObjectReference(const cJSON *child); -CJSON_PUBLIC(cJSON *) cJSON_CreateArrayReference(const cJSON *child); - -/* These utilities create an Array of count items. - * The parameter count cannot be greater than the number of elements in the - * number array, otherwise array access will be out of bounds.*/ -CJSON_PUBLIC(cJSON *) cJSON_CreateIntArray(const int *numbers, int count); -CJSON_PUBLIC(cJSON *) cJSON_CreateFloatArray(const float *numbers, int count); -CJSON_PUBLIC(cJSON *) cJSON_CreateDoubleArray(const double *numbers, int count); -CJSON_PUBLIC(cJSON *) -cJSON_CreateStringArray(const char *const *strings, int count); - -/* Append item to the specified array/object. */ -CJSON_PUBLIC(cJSON_bool) cJSON_AddItemToArray(cJSON *array, cJSON *item); -CJSON_PUBLIC(cJSON_bool) -cJSON_AddItemToObject(cJSON *object, const char *string, cJSON *item); -/* Use this when string is definitely const (i.e. a literal, or as good as), and - * will definitely survive the cJSON object. WARNING: When this function is - * used, make sure to always check that (item->type & cJSON_StringIsConst) is - * zero before writing to `item->string` */ -CJSON_PUBLIC(cJSON_bool) -cJSON_AddItemToObjectCS(cJSON *object, const char *string, cJSON *item); -/* Append reference to item to the specified array/object. Use this when you - * want to add an existing cJSON to a new cJSON, but don't want to corrupt your - * existing cJSON. */ -CJSON_PUBLIC(cJSON_bool) -cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item); -CJSON_PUBLIC(cJSON_bool) -cJSON_AddItemReferenceToObject(cJSON *object, const char *string, cJSON *item); - -/* Remove/Detach items from Arrays/Objects. 
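The Detach variants unlink an item and return it, transferring ownership to the caller (who must eventually cJSON_Delete it or re-attach it); the Delete variants unlink and free in one step. Moving a field between two objects, as a sketch:

    cJSON *moved = cJSON_DetachItemFromObject(src, "field");
    if (moved != NULL) {
        cJSON_AddItemToObject(dst, "field", moved);
    }
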
*/ -CJSON_PUBLIC(cJSON *) -cJSON_DetachItemViaPointer(cJSON *parent, cJSON *const item); -CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromArray(cJSON *array, int which); -CJSON_PUBLIC(void) cJSON_DeleteItemFromArray(cJSON *array, int which); -CJSON_PUBLIC(cJSON *) -cJSON_DetachItemFromObject(cJSON *object, const char *string); -CJSON_PUBLIC(cJSON *) -cJSON_DetachItemFromObjectCaseSensitive(cJSON *object, const char *string); -CJSON_PUBLIC(void) -cJSON_DeleteItemFromObject(cJSON *object, const char *string); -CJSON_PUBLIC(void) -cJSON_DeleteItemFromObjectCaseSensitive(cJSON *object, const char *string); - -/* Update array items. */ -CJSON_PUBLIC(cJSON_bool) -cJSON_InsertItemInArray( - cJSON *array, - int which, - cJSON *newitem); /* Shifts pre-existing items to the right. */ -CJSON_PUBLIC(cJSON_bool) -cJSON_ReplaceItemViaPointer(cJSON *const parent, - cJSON *const item, - cJSON *replacement); -CJSON_PUBLIC(cJSON_bool) -cJSON_ReplaceItemInArray(cJSON *array, int which, cJSON *newitem); -CJSON_PUBLIC(cJSON_bool) -cJSON_ReplaceItemInObject(cJSON *object, const char *string, cJSON *newitem); -CJSON_PUBLIC(cJSON_bool) -cJSON_ReplaceItemInObjectCaseSensitive(cJSON *object, - const char *string, - cJSON *newitem); - -/* Duplicate a cJSON item */ -CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cJSON_bool recurse); -/* Duplicate will create a new, identical cJSON item to the one you pass, in new - * memory that will need to be released. With recurse!=0, it will duplicate any - * children connected to the item. - * The item->next and ->prev pointers are always zero on return from Duplicate. - */ -/* Recursively compare two cJSON items for equality. If either a or b is NULL or - * invalid, they will be considered unequal. - * case_sensitive determines if object keys are treated case sensitive (1) or - * case insensitive (0) */ -CJSON_PUBLIC(cJSON_bool) -cJSON_Compare(const cJSON *const a, - const cJSON *const b, - const cJSON_bool case_sensitive); - -/* Minify a string: remove blank characters (such as ' ', '\t', '\r', '\n') from - * strings. The input pointer json cannot point to a read-only address area, - * such as a string constant, - * but should point to a readable and writable address area. */ -CJSON_PUBLIC(void) cJSON_Minify(char *json); - -/* Helper functions for creating and adding items to an object at the same time. - * They return the added item or NULL on failure. */ -CJSON_PUBLIC(cJSON *) -cJSON_AddNullToObject(cJSON *const object, const char *const name); -CJSON_PUBLIC(cJSON *) -cJSON_AddTrueToObject(cJSON *const object, const char *const name); -CJSON_PUBLIC(cJSON *) -cJSON_AddFalseToObject(cJSON *const object, const char *const name); -CJSON_PUBLIC(cJSON *) -cJSON_AddBoolToObject(cJSON *const object, - const char *const name, - const cJSON_bool boolean); -CJSON_PUBLIC(cJSON *) -cJSON_AddNumberToObject(cJSON *const object, - const char *const name, - const double number); -CJSON_PUBLIC(cJSON *) -cJSON_AddStringToObject(cJSON *const object, - const char *const name, - const char *const string); -CJSON_PUBLIC(cJSON *) -cJSON_AddRawToObject(cJSON *const object, - const char *const name, - const char *const raw); -CJSON_PUBLIC(cJSON *) -cJSON_AddObjectToObject(cJSON *const object, const char *const name); -CJSON_PUBLIC(cJSON *) -cJSON_AddArrayToObject(cJSON *const object, const char *const name); - -/* When assigning an integer value, it needs to be propagated to valuedouble - * too. */ -#define cJSON_SetIntValue(object, number) \ - ((object) ? 
(object)->valueint = (object)->valuedouble = (number) \ - : (number)) -/* helper for the cJSON_SetNumberValue macro */ -CJSON_PUBLIC(double) cJSON_SetNumberHelper(cJSON *object, double number); -#define cJSON_SetNumberValue(object, number) \ - ((object != NULL) ? cJSON_SetNumberHelper(object, (double)number) \ - : (number)) -/* Change the valuestring of a cJSON_String object, only takes effect when type - * of object is cJSON_String */ -CJSON_PUBLIC(char *) -cJSON_SetValuestring(cJSON *object, const char *valuestring); - -/* Macro for iterating over an array or object */ -#define cJSON_ArrayForEach(element, array) \ - for (element = (array != NULL) ? (array)->child : NULL; \ - element != NULL; element = element->next) - -/* malloc/free objects using the malloc/free functions that have been set with - * cJSON_InitHooks */ -CJSON_PUBLIC(void *) cJSON_malloc(size_t size); -CJSON_PUBLIC(void) cJSON_free(void *object); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/third_party/librdkafka/dist/src/crc32c.h b/src/third_party/librdkafka/dist/src/crc32c.h index 21c7badc7f1..d768afc6763 100644 --- a/src/third_party/librdkafka/dist/src/crc32c.h +++ b/src/third_party/librdkafka/dist/src/crc32c.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/lz4.c b/src/third_party/librdkafka/dist/src/lz4.c index 01c43c7c240..99773ea1c99 100644 --- a/src/third_party/librdkafka/dist/src/lz4.c +++ b/src/third_party/librdkafka/dist/src/lz4.c @@ -1,6 +1,6 @@ /* KLZ4 - Fast LZ compression algorithm - Copyright (C) 2011-present, Yann Collet. + Copyright (C) 2011-2020, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) @@ -124,6 +124,7 @@ #if defined(_MSC_VER) && (_MSC_VER >= 1400) /* Visual Studio 2005+ */ # include <intrin.h> /* only present in VS2005+ */ # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 6237) /* disable: C6237: conditional expression is always 0 */ #endif /* _MSC_VER */ #ifndef KLZ4_FORCE_INLINE @@ -187,7 +188,27 @@ /*-************************************ * Memory routines **************************************/ -#ifdef KLZ4_USER_MEMORY_FUNCTIONS + +/*! KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION : + * Disable relatively high-level KLZ4/HC functions that use dynamic memory + * allocation functions (malloc(), calloc(), free()). + * + * Note that this is a compile-time switch. And since it disables + * public/stable KLZ4 v1 API functions, we don't recommend using this + * symbol to generate a library for distribution. + * + * The following public functions are removed when this symbol is defined. + * - lz4 : KLZ4_createStream, KLZ4_freeStream, + * KLZ4_createStreamDecode, KLZ4_freeStreamDecode, KLZ4_create (deprecated) + * - lz4hc : KLZ4_createStreamHC, KLZ4_freeStreamHC, + * KLZ4_createHC (deprecated), KLZ4_freeHC (deprecated) + * - lz4frame, lz4file : All KLZ4F_* functions + */ +#if defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +# define ALLOC(s) lz4_error_memory_allocation_is_disabled +# define ALLOC_AND_ZERO(s) lz4_error_memory_allocation_is_disabled +# define FREEMEM(p) lz4_error_memory_allocation_is_disabled +#elif defined(KLZ4_USER_MEMORY_FUNCTIONS) /* memory management functions can be customized by user project. 
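librdkafka itself takes the #else branch below, routing allocations through rd_kafka_mem_malloc and friends; a hypothetical out-of-tree user would instead build with -DKLZ4_USER_MEMORY_FUNCTIONS and supply, say:

    void *KLZ4_malloc(size_t s)           { return my_alloc(s);     }
    void *KLZ4_calloc(size_t n, size_t s) { return my_calloc(n, s); }
    void  KLZ4_free(void *p)              { my_free(p);             }

where my_alloc, my_calloc and my_free stand for the project's own allocator.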
* Below functions must exist somewhere in the Project * and be available at link time */ @@ -198,6 +219,8 @@ void KLZ4_free(void* p); # define ALLOC_AND_ZERO(s) KLZ4_calloc(1,s) # define FREEMEM(p) KLZ4_free(p) #else +/* NOTE: While upgrading the lz4 version, replace the original `#else` block + * in the code with this block, and retain this comment. */ struct rdkafka_s; extern void *rd_kafka_mem_malloc(struct rdkafka_s *rk, size_t s); extern void *rd_kafka_mem_calloc(struct rdkafka_s *rk, size_t n, size_t s); @@ -207,8 +230,13 @@ extern void rd_kafka_mem_free(struct rdkafka_s *rk, void *p); # define FREEMEM(p) rd_kafka_mem_free(NULL, p) #endif -#include <string.h> /* memset, memcpy */ -#define MEM_INIT(p,v,s) memset((p),(v),(s)) +#if ! KLZ4_FREESTANDING +# include <string.h> /* memset, memcpy */ +#endif +#if !defined(KLZ4_memset) +# define KLZ4_memset(p,v,s) memset((p),(v),(s)) +#endif +#define MEM_INIT(p,v,s) KLZ4_memset((p),(v),(s)) /*-************************************ @@ -319,10 +347,20 @@ typedef enum { * memcpy() as if it were standard compliant, so it can inline it in freestanding * environments. This is needed when decompressing the Linux Kernel, for example. */ -#if defined(__GNUC__) && (__GNUC__ >= 4) -#define KLZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) -#else -#define KLZ4_memcpy(dst, src, size) memcpy(dst, src, size) +#if !defined(KLZ4_memcpy) +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define KLZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) +# else +# define KLZ4_memcpy(dst, src, size) memcpy(dst, src, size) +# endif +#endif + +#if !defined(KLZ4_memmove) +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define KLZ4_memmove __builtin_memmove +# else +# define KLZ4_memmove memmove +# endif #endif static unsigned KLZ4_isLittleEndian(void) @@ -346,14 +384,14 @@ static void KLZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ /* currently only defined for gcc and icc */ -typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) unalign; +typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) KLZ4_unalign; -static U16 KLZ4_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } -static U32 KLZ4_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } -static reg_t KLZ4_read_ARCH(const void* ptr) { return ((const unalign*)ptr)->uArch; } +static U16 KLZ4_read16(const void* ptr) { return ((const KLZ4_unalign*)ptr)->u16; } +static U32 KLZ4_read32(const void* ptr) { return ((const KLZ4_unalign*)ptr)->u32; } +static reg_t KLZ4_read_ARCH(const void* ptr) { return ((const KLZ4_unalign*)ptr)->uArch; } -static void KLZ4_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } -static void KLZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; } +static void KLZ4_write16(void* memPtr, U16 value) { ((KLZ4_unalign*)memPtr)->u16 = value; } +static void KLZ4_write32(void* memPtr, U32 value) { ((KLZ4_unalign*)memPtr)->u32 = value; } #else /* safe and portable access using memcpy() */ @@ -424,10 +462,12 @@ static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; #ifndef KLZ4_FAST_DEC_LOOP # if defined __i386__ || defined _M_IX86 || defined __x86_64__ || defined _M_X64 # define KLZ4_FAST_DEC_LOOP 1 +# elif defined(__aarch64__) && defined(__APPLE__) +# define KLZ4_FAST_DEC_LOOP 1 # elif defined(__aarch64__) && !defined(__clang__) - /* On aarch64, we disable this optimization for clang because on 
certain - * mobile chipsets, performance is reduced with clang. For information - * refer to https://github.com/lz4/lz4/pull/707 */ + /* On non-Apple aarch64, we disable this optimization for clang because + * on certain mobile chipsets, performance is reduced with clang. For + * more information refer to https://github.com/lz4/lz4/pull/707 */ # define KLZ4_FAST_DEC_LOOP 1 # else # define KLZ4_FAST_DEC_LOOP 0 @@ -489,7 +529,14 @@ KLZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const s case 2: KLZ4_memcpy(v, srcPtr, 2); KLZ4_memcpy(&v[2], srcPtr, 2); +#if defined(_MSC_VER) && (_MSC_VER <= 1933) /* MSVC 2022 ver 17.3 or earlier */ +# pragma warning(push) +# pragma warning(disable : 6385) /* warning C6385: Reading invalid data from 'v'. */ +#endif KLZ4_memcpy(&v[4], v, 4); +#if defined(_MSC_VER) && (_MSC_VER <= 1933) /* MSVC 2022 ver 17.3 or earlier */ +# pragma warning(pop) +#endif break; case 4: KLZ4_memcpy(v, srcPtr, 4); @@ -518,9 +565,20 @@ static unsigned KLZ4_NbCommonBytes (reg_t val) assert(val != 0); if (KLZ4_isLittleEndian()) { if (sizeof(val) == 8) { -# if defined(_MSC_VER) && (_MSC_VER >= 1800) && defined(_M_AMD64) && !defined(KLZ4_FORCE_SW_BITCOUNT) +# if defined(_MSC_VER) && (_MSC_VER >= 1800) && (defined(_M_AMD64) && !defined(_M_ARM64EC)) && !defined(KLZ4_FORCE_SW_BITCOUNT) +/*-************************************************************************************************* +* ARM64EC is a Microsoft-designed ARM64 ABI compatible with AMD64 applications on ARM64 Windows 11. +* The ARM64EC ABI does not support AVX/AVX2/AVX512 instructions, nor their relevant intrinsics +* including _tzcnt_u64. Therefore, we need to neuter the _tzcnt_u64 code path for ARM64EC. +****************************************************************************************************/ +# if defined(__clang__) && (__clang_major__ < 10) + /* Avoid undefined clang-cl intrinsics issue. + * See https://github.com/lz4/lz4/pull/1017 for details. */ + return (unsigned)__builtin_ia32_tzcnt_u64(val) >> 3; +# else /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */ return (unsigned)_tzcnt_u64(val) >> 3; +# endif # elif defined(_MSC_VER) && defined(_WIN64) && !defined(KLZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanForward64(&r, (U64)val); @@ -655,10 +713,10 @@ typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere * else in memory, starting at ctx->dictionary with length * ctx->dictSize. - * - usingDictCtx : Like usingExtDict, but everything concerning the preceding - * content is in a separate context, pointed to by - * ctx->dictCtx. ctx->dictionary, ctx->dictSize, and table - * entries in the current context that refer to positions + * - usingDictCtx : Everything concerning the preceding content is + * in a separate context, pointed to by ctx->dictCtx. + * ctx->dictionary, ctx->dictSize, and table entries + * in the current context that refer to positions * preceding the beginning of the current compression are * ignored. 
Instead, ctx->dictCtx->dictionary and ctx->dictCtx * ->dictSize describe the location and size of the preceding @@ -675,12 +733,12 @@ typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; int KLZ4_versionNumber (void) { return KLZ4_VERSION_NUMBER; } const char* KLZ4_versionString(void) { return KLZ4_VERSION_STRING; } int KLZ4_compressBound(int isize) { return KLZ4_COMPRESSBOUND(isize); } -int KLZ4_sizeofState(void) { return KLZ4_STREAMSIZE; } +int KLZ4_sizeofState(void) { return sizeof(KLZ4_stream_t); } -/*-************************************ -* Internal Definitions used in Tests -**************************************/ +/*-**************************************** +* Internal Definitions, used only in Tests +*******************************************/ #if defined (__cplusplus) extern "C" { #endif @@ -690,7 +748,9 @@ int KLZ4_compress_forceExtDict (KLZ4_stream_t* KLZ4_dict, const char* source, ch int KLZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const void* dictStart, size_t dictSize); - +int KLZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest, + int compressedSize, int targetOutputSize, int dstCapacity, + const void* dictStart, size_t dictSize); #if defined (__cplusplus) } #endif @@ -830,9 +890,10 @@ KLZ4_prepareTable(KLZ4_stream_t_internal* const cctx, } } - /* Adding a gap, so all previous entries are > KLZ4_DISTANCE_MAX back, is faster - * than compressing without a gap. However, compressing with - * currentOffset == 0 is faster still, so we preserve that case. + /* Adding a gap, so all previous entries are > KLZ4_DISTANCE_MAX back, + * is faster than compressing without a gap. + * However, compressing with currentOffset == 0 is faster still, + * so we preserve that case. */ if (cctx->currentOffset != 0 && tableType == byU32) { DEBUGLOG(5, "KLZ4_prepareTable: adding 64KB to currentOffset"); @@ -856,7 +917,7 @@ KLZ4_FORCE_INLINE int KLZ4_compress_generic_validated( const char* const source, char* const dest, const int inputSize, - int *inputConsumed, /* only written when outputDirective == fillOutput */ + int* inputConsumed, /* only written when outputDirective == fillOutput */ const int maxOutputSize, const limitedOutput_directive outputDirective, const tableType_t tableType, @@ -888,7 +949,8 @@ KLZ4_FORCE_INLINE int KLZ4_compress_generic_validated( /* the dictCtx currentOffset is indexed on the start of the dictionary, * while a dictionary in the current context precedes the currentOffset */ - const BYTE* dictBase = !dictionary ? NULL : (dictDirective == usingDictCtx) ? + const BYTE* dictBase = (dictionary == NULL) ? NULL : + (dictDirective == usingDictCtx) ? 
dictionary + dictSize - dictCtx->currentOffset : dictionary + dictSize - startIndex; @@ -984,10 +1046,11 @@ KLZ4_FORCE_INLINE int KLZ4_compress_generic_validated( match = base + matchIndex; lowLimit = (const BYTE*)source; } - } else if (dictDirective==usingExtDict) { + } else if (dictDirective == usingExtDict) { if (matchIndex < startIndex) { DEBUGLOG(7, "extDict candidate: matchIndex=%5u < startIndex=%5u", matchIndex, startIndex); assert(startIndex - matchIndex >= MINMATCH); + assert(dictBase); match = dictBase + matchIndex; lowLimit = dictionary; } else { @@ -1051,7 +1114,7 @@ KLZ4_FORCE_INLINE int KLZ4_compress_generic_validated( _next_match: /* at this stage, the following variables must be correctly set : * - ip : at start of LZ operation - * - match : at start of previous pattern occurence; can be within current prefix, or within extDict + * - match : at start of previous pattern occurrence; can be within current prefix, or within extDict * - offset : if maybe_ext_memSegment==1 (constant) * - lowLimit : must be == dictionary to mean "match is within extDict"; must be == source otherwise * - token and *token : position to write 4-bits for match length; higher 4-bits for literal length supposed already written @@ -1176,6 +1239,7 @@ _next_match: } } else if (dictDirective==usingExtDict) { if (matchIndex < startIndex) { + assert(dictBase); match = dictBase + matchIndex; lowLimit = dictionary; /* required for match length counter */ } else { @@ -1358,7 +1422,7 @@ int KLZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOut { int result; #if (KLZ4_HEAPMODE) - KLZ4_stream_t* ctxPtr = ALLOC(sizeof(KLZ4_stream_t)); /* malloc-calloc always properly aligned */ + KLZ4_stream_t* ctxPtr = (KLZ4_stream_t*)ALLOC(sizeof(KLZ4_stream_t)); /* malloc-calloc always properly aligned */ if (ctxPtr == NULL) return 0; #else KLZ4_stream_t ctx; @@ -1423,15 +1487,17 @@ int KLZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targ * Streaming functions ********************************/ +#if !defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) KLZ4_stream_t* KLZ4_createStream(void) { KLZ4_stream_t* const lz4s = (KLZ4_stream_t*)ALLOC(sizeof(KLZ4_stream_t)); - KLZ4_STATIC_ASSERT(KLZ4_STREAMSIZE >= sizeof(KLZ4_stream_t_internal)); /* A compilation error here means KLZ4_STREAMSIZE is not large enough */ + KLZ4_STATIC_ASSERT(sizeof(KLZ4_stream_t) >= sizeof(KLZ4_stream_t_internal)); DEBUGLOG(4, "KLZ4_createStream %p", lz4s); if (lz4s == NULL) return NULL; KLZ4_initStream(lz4s, sizeof(*lz4s)); return lz4s; } +#endif static size_t KLZ4_stream_t_alignment(void) { @@ -1465,6 +1531,7 @@ void KLZ4_resetStream_fast(KLZ4_stream_t* ctx) { KLZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); } +#if !defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) int KLZ4_freeStream (KLZ4_stream_t* KLZ4_stream) { if (!KLZ4_stream) return 0; /* support free on NULL */ @@ -1472,6 +1539,7 @@ int KLZ4_freeStream (KLZ4_stream_t* KLZ4_stream) FREEMEM(KLZ4_stream); return (0); } +#endif #define HASH_UNIT sizeof(reg_t) @@ -1517,8 +1585,9 @@ int KLZ4_loadDict (KLZ4_stream_t* KLZ4_dict, const char* dictionary, int dictSiz return (int)dict->dictSize; } -void KLZ4_attach_dictionary(KLZ4_stream_t* workingStream, const KLZ4_stream_t* dictionaryStream) { - const KLZ4_stream_t_internal* dictCtx = dictionaryStream == NULL ? NULL : +void KLZ4_attach_dictionary(KLZ4_stream_t* workingStream, const KLZ4_stream_t* dictionaryStream) +{ + const KLZ4_stream_t_internal* dictCtx = (dictionaryStream == NULL) ? 
NULL : &(dictionaryStream->internal_donotuse); DEBUGLOG(4, "KLZ4_attach_dictionary (%p, %p, size %u)", @@ -1571,36 +1640,40 @@ int KLZ4_compress_fast_continue (KLZ4_stream_t* KLZ4_stream, int acceleration) { const tableType_t tableType = byU32; - KLZ4_stream_t_internal* streamPtr = &KLZ4_stream->internal_donotuse; - const BYTE* dictEnd = streamPtr->dictionary + streamPtr->dictSize; + KLZ4_stream_t_internal* const streamPtr = &KLZ4_stream->internal_donotuse; + const char* dictEnd = streamPtr->dictSize ? (const char*)streamPtr->dictionary + streamPtr->dictSize : NULL; - DEBUGLOG(5, "KLZ4_compress_fast_continue (inputSize=%i)", inputSize); + DEBUGLOG(5, "KLZ4_compress_fast_continue (inputSize=%i, dictSize=%u)", inputSize, streamPtr->dictSize); - KLZ4_renormDictT(streamPtr, inputSize); /* avoid index overflow */ + KLZ4_renormDictT(streamPtr, inputSize); /* fix index overflow */ if (acceleration < 1) acceleration = KLZ4_ACCELERATION_DEFAULT; if (acceleration > KLZ4_ACCELERATION_MAX) acceleration = KLZ4_ACCELERATION_MAX; /* invalidate tiny dictionaries */ - if ( (streamPtr->dictSize-1 < 4-1) /* intentional underflow */ - && (dictEnd != (const BYTE*)source) ) { + if ( (streamPtr->dictSize < 4) /* tiny dictionary : not enough for a hash */ + && (dictEnd != source) /* prefix mode */ + && (inputSize > 0) /* tolerance : don't lose history, in case next invocation would use prefix mode */ + && (streamPtr->dictCtx == NULL) /* usingDictCtx */ + ) { DEBUGLOG(5, "KLZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", streamPtr->dictSize, streamPtr->dictionary); + /* remove dictionary existence from history, to employ faster prefix mode */ streamPtr->dictSize = 0; streamPtr->dictionary = (const BYTE*)source; - dictEnd = (const BYTE*)source; + dictEnd = source; } /* Check overlapping input/dictionary space */ - { const BYTE* sourceEnd = (const BYTE*) source + inputSize; - if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) { + { const char* const sourceEnd = source + inputSize; + if ((sourceEnd > (const char*)streamPtr->dictionary) && (sourceEnd < dictEnd)) { streamPtr->dictSize = (U32)(dictEnd - sourceEnd); if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; - streamPtr->dictionary = dictEnd - streamPtr->dictSize; + streamPtr->dictionary = (const BYTE*)dictEnd - streamPtr->dictSize; } } /* prefix mode : source data follows dictionary */ - if (dictEnd == (const BYTE*)source) { + if (dictEnd == source) { if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) return KLZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, dictSmall, acceleration); else @@ -1626,7 +1699,7 @@ int KLZ4_compress_fast_continue (KLZ4_stream_t* KLZ4_stream, } else { result = KLZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration); } - } else { + } else { /* small data <= 4 KB */ if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { result = KLZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, dictSmall, acceleration); } else { @@ -1664,21 +1737,25 @@ int KLZ4_compress_forceExtDict (KLZ4_stream_t* KLZ4_dict, const char* source, ch /*! 
KLZ4_saveDict() : * If previously compressed data block is not guaranteed to remain available at its memory location, * save it into a safer place (char* safeBuffer). - * Note : you don't need to call KLZ4_loadDict() afterwards, - * dictionary is immediately usable, you can therefore call KLZ4_compress_fast_continue(). - * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. + * Note : no need to call KLZ4_loadDict() afterwards, dictionary is immediately usable, + * one can therefore call KLZ4_compress_fast_continue() right after. + * @return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. */ int KLZ4_saveDict (KLZ4_stream_t* KLZ4_dict, char* safeBuffer, int dictSize) { KLZ4_stream_t_internal* const dict = &KLZ4_dict->internal_donotuse; - const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + + DEBUGLOG(5, "KLZ4_saveDict : dictSize=%i, safeBuffer=%p", dictSize, safeBuffer); if ((U32)dictSize > 64 KB) { dictSize = 64 KB; } /* useless to define a dictionary > 64 KB */ if ((U32)dictSize > dict->dictSize) { dictSize = (int)dict->dictSize; } if (safeBuffer == NULL) assert(dictSize == 0); - if (dictSize > 0) - memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + if (dictSize > 0) { + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + assert(dict->dictionary); + KLZ4_memmove(safeBuffer, previousDictEnd - dictSize, (size_t)dictSize); + } dict->dictionary = (const BYTE*)safeBuffer; dict->dictSize = (U32)dictSize; @@ -1692,39 +1769,163 @@ int KLZ4_saveDict (KLZ4_stream_t* KLZ4_dict, char* safeBuffer, int dictSize) * Decompression functions ********************************/ -typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; #undef MIN #define MIN(a,b) ( (a) < (b) ? (a) : (b) ) + +/* variant for decompress_unsafe() + * does not know end of input + * presumes input is well formed + * note : will consume at least one byte */ +size_t kread_long_length_no_check(const BYTE** pp) +{ + size_t b, l = 0; + do { b = **pp; (*pp)++; l += b; } while (b==255); + DEBUGLOG(6, "kread_long_length_no_check: +length=%zu using %zu input bytes", l, l/255 + 1) + return l; +} + +/* core decoder variant for KLZ4_decompress_fast*() + * for legacy support only : these entry points are deprecated. + * - Presumes input is correctly formed (no defense vs malformed inputs) + * - Does not know input size (presume input buffer is "large enough") + * - Decompress a full block (only) + * @return : nb of bytes read from input. + * Note : this variant is not optimized for speed, just for maintenance. 
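+ *        (the rewrite below still bounds-checks the output buffer, but it
+ *        trusts the input stream, so the KLZ4_decompress_safe*() entry
+ *        points remain the preferred API);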
+ * the goal is to remove support of decompress_fast*() variants by v2.0 +**/ +KLZ4_FORCE_INLINE int +KLZ4_decompress_unsafe_generic( + const BYTE* const istart, + BYTE* const ostart, + int decompressedSize, + + size_t prefixSize, + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note: =0 if dictStart==NULL */ + ) +{ + const BYTE* ip = istart; + BYTE* op = (BYTE*)ostart; + BYTE* const oend = ostart + decompressedSize; + const BYTE* const prefixStart = ostart - prefixSize; + + DEBUGLOG(5, "KLZ4_decompress_unsafe_generic"); + if (dictStart == NULL) assert(dictSize == 0); + + while (1) { + /* start new sequence */ + unsigned token = *ip++; + + /* literals */ + { size_t ll = token >> ML_BITS; + if (ll==15) { + /* long literal length */ + ll += kread_long_length_no_check(&ip); + } + if ((size_t)(oend-op) < ll) return -1; /* output buffer overflow */ + KLZ4_memmove(op, ip, ll); /* support in-place decompression */ + op += ll; + ip += ll; + if ((size_t)(oend-op) < MFLIMIT) { + if (op==oend) break; /* end of block */ + DEBUGLOG(5, "invalid: literals end at distance %zi from end of block", oend-op); + /* incorrect end of block : + * last match must start at least MFLIMIT==12 bytes before end of output block */ + return -1; + } } + + /* match */ + { size_t ml = token & 15; + size_t const offset = KLZ4_readLE16(ip); + ip+=2; + + if (ml==15) { + /* long literal length */ + ml += kread_long_length_no_check(&ip); + } + ml += MINMATCH; + + if ((size_t)(oend-op) < ml) return -1; /* output buffer overflow */ + + { const BYTE* match = op - offset; + + /* out of range */ + if (offset > (size_t)(op - prefixStart) + dictSize) { + DEBUGLOG(6, "offset out of range"); + return -1; + } + + /* check special case : extDict */ + if (offset > (size_t)(op - prefixStart)) { + /* extDict scenario */ + const BYTE* const dictEnd = dictStart + dictSize; + const BYTE* extMatch = dictEnd - (offset - (size_t)(op-prefixStart)); + size_t const extml = (size_t)(dictEnd - extMatch); + if (extml > ml) { + /* match entirely within extDict */ + KLZ4_memmove(op, extMatch, ml); + op += ml; + ml = 0; + } else { + /* match split between extDict & prefix */ + KLZ4_memmove(op, extMatch, extml); + op += extml; + ml -= extml; + } + match = prefixStart; + } + + /* match copy - slow variant, supporting overlap copy */ + { size_t u; + for (u=0; u= lencheck. - * loop_check - check ip >= lencheck in body of loop. Returns loop_error if so. - * initial_check - check ip >= lencheck before start of loop. Returns initial_error if so. - * error (output) - error code. Should be set to 0 before call. - */ -typedef enum { loop_error = -2, initial_error = -1, ok = 0 } variable_length_error; -KLZ4_FORCE_INLINE unsigned -read_variable_length(const BYTE**ip, const BYTE* lencheck, - int loop_check, int initial_check, - variable_length_error* error) + * @ip : input pointer + * @ilimit : position after which if length is not decoded, the input is necessarily corrupted. + * @initial_check - check ip >= ipmax before start of loop. Returns initial_error if so. + * @error (output) - error code. Must be set to 0 before call. 
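+ *          (with this rewrite the error is signaled in-band instead: the
+ *          function returns rvl_error, i.e. (Rvl_t)(-1), when the read limit
+ *          is reached or the 32-bit length accumulator would overflow)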
+**/ +typedef size_t Rvl_t; +static const Rvl_t rvl_error = (Rvl_t)(-1); +KLZ4_FORCE_INLINE Rvl_t +read_variable_length(const BYTE** ip, const BYTE* ilimit, + int initial_check) { - U32 length = 0; - U32 s; - if (initial_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ - *error = initial_error; - return length; + Rvl_t s, length = 0; + assert(ip != NULL); + assert(*ip != NULL); + assert(ilimit != NULL); + if (initial_check && unlikely((*ip) >= ilimit)) { /* read limit reached */ + return rvl_error; } do { s = **ip; (*ip)++; length += s; - if (loop_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ - *error = loop_error; - return length; + if (unlikely((*ip) > ilimit)) { /* read limit reached */ + return rvl_error; + } + /* accumulator overflow detection (32-bit mode only) */ + if ((sizeof(length)<8) && unlikely(length > ((Rvl_t)(-1)/2)) ) { + return rvl_error; } } while (s==255); @@ -1744,7 +1945,6 @@ KLZ4_decompress_generic( int srcSize, int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */ - endCondition_directive endOnInput, /* endOnOutputSize, endOnInputSize */ earlyEnd_directive partialDecoding, /* full, partial */ dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ @@ -1752,7 +1952,7 @@ KLZ4_decompress_generic( const size_t dictSize /* note : = 0 if noDict */ ) { - if (src == NULL) { return -1; } + if ((src == NULL) || (outputSize < 0)) { return -1; } { const BYTE* ip = (const BYTE*) src; const BYTE* const iend = ip + srcSize; @@ -1763,13 +1963,12 @@ KLZ4_decompress_generic( const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize; - const int safeDecode = (endOnInput==endOnInputSize); - const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + const int checkOffset = (dictSize < (int)(64 KB)); /* Set up the "end" pointers for the shortcut. */ - const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/; - const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/; + const BYTE* const shortiend = iend - 14 /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - 14 /*maxLL*/ - 18 /*maxML*/; const BYTE* match; size_t offset; @@ -1781,83 +1980,70 @@ KLZ4_decompress_generic( /* Special cases */ assert(lowPrefix <= op); - if ((endOnInput) && (unlikely(outputSize==0))) { + if (unlikely(outputSize==0)) { /* Empty output buffer */ if (partialDecoding) return 0; return ((srcSize==1) && (*ip==0)) ? 0 : -1; } - if ((!endOnInput) && (unlikely(outputSize==0))) { return (*ip==0 ? 1 : -1); } - if ((endOnInput) && unlikely(srcSize==0)) { return -1; } + if (unlikely(srcSize==0)) { return -1; } - /* Currently the fast loop shows a regression on qualcomm arm chips. */ + /* KLZ4_FAST_DEC_LOOP: + * designed for modern OoO performance cpus, + * where copying reliably 32-bytes is preferable to an unpredictable branch. + * note : fast loop may show a regression for some client arm chips. 
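+ * (the fast path below only runs while at least FASTLOOP_SAFE_DISTANCE
+ * output bytes remain, so its 32-byte wildcopies cannot run past oend;
+ * the byte-accurate safe loop afterwards handles the tail)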
*/ #if KLZ4_FAST_DEC_LOOP if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { DEBUGLOG(6, "skip fast decode loop"); goto safe_decode; } - /* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */ + /* Fast loop : decode sequences as long as output < oend-FASTLOOP_SAFE_DISTANCE */ while (1) { /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */ assert(oend - op >= FASTLOOP_SAFE_DISTANCE); - if (endOnInput) { assert(ip < iend); } + assert(ip < iend); token = *ip++; length = token >> ML_BITS; /* literal length */ - assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ - /* decode literal length */ if (length == RUN_MASK) { - variable_length_error error = ok; - length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error); - if (error == initial_error) { goto _output_error; } - if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ - if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1); + if (addl == rvl_error) { goto _output_error; } + length += addl; + if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ /* copy literals */ cpy = op+length; KLZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); - if (endOnInput) { /* KLZ4_decompress_safe() */ - if ((cpy>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; } - KLZ4_wildCopy32(op, ip, cpy); - } else { /* KLZ4_decompress_fast() */ - if (cpy>oend-8) { goto safe_literal_copy; } - KLZ4_wildCopy8(op, ip, cpy); /* KLZ4_decompress_fast() cannot copy more than 8 bytes at a time : - * it doesn't know input length, and only relies on end-of-block properties */ - } + if ((cpy>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; } + KLZ4_wildCopy32(op, ip, cpy); ip += length; op = cpy; } else { cpy = op+length; - if (endOnInput) { /* KLZ4_decompress_safe() */ - DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); - /* We don't need to check oend, since we check it once for each loop below */ - if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; } - /* Literals can only be 14, but hope compilers optimize if we copy by a register size */ - KLZ4_memcpy(op, ip, 16); - } else { /* KLZ4_decompress_fast() */ - /* KLZ4_decompress_fast() cannot copy more than 8 bytes at a time : - * it doesn't know input length, and relies on end-of-block properties */ - KLZ4_memcpy(op, ip, 8); - if (length > 8) { KLZ4_memcpy(op+8, ip+8, 8); } - } + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* We don't need to check oend, since we check it once for each loop below */ + if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; } + /* Literals can only be <= 14, but hope compilers optimize better when copy by a register size */ + KLZ4_memcpy(op, ip, 16); ip += length; op = cpy; } /* get offset */ offset = KLZ4_readLE16(ip); ip+=2; match = op - offset; - assert(match <= op); + assert(match <= op); /* overflow check */ /* get matchlength */ length = token & ML_MASK; if (length == ML_MASK) { - variable_length_error error = ok; - if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */ - length += 
read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error); - if (error != ok) { goto _output_error; } - if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */ + size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { goto _output_error; } + length += addl; length += MINMATCH; + if (unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */ + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */ if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { goto safe_match_copy; } @@ -1867,7 +2053,7 @@ KLZ4_decompress_generic( goto safe_match_copy; } - /* Fastpath check: Avoids a branch in KLZ4_wildCopy32 if true */ + /* Fastpath check: skip KLZ4_wildCopy32 when true */ if ((dict == withPrefix64k) || (match >= lowPrefix)) { if (offset >= 8) { assert(match >= lowPrefix); @@ -1884,6 +2070,7 @@ KLZ4_decompress_generic( if (checkOffset && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */ /* match starting within external dictionary */ if ((dict==usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); if (unlikely(op+length > oend-LASTLITERALS)) { if (partialDecoding) { DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd"); @@ -1894,7 +2081,7 @@ KLZ4_decompress_generic( if (length <= (size_t)(lowPrefix-match)) { /* match fits entirely within external dictionary : just copy */ - memmove(op, dictEnd - (lowPrefix-match), length); + KLZ4_memmove(op, dictEnd - (lowPrefix-match), length); op += length; } else { /* match stretches into both external dictionary and current block */ @@ -1930,11 +2117,10 @@ KLZ4_decompress_generic( /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */ while (1) { + assert(ip < iend); token = *ip++; length = token >> ML_BITS; /* literal length */ - assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ - /* A two-stage shortcut for the most common case: * 1) If the literal length is 0..14, and there is enough space, * enter the shortcut and copy 16 bytes on behalf of the literals @@ -1944,11 +2130,11 @@ KLZ4_decompress_generic( * those 18 bytes earlier, upon entering the shortcut (in other words, * there is a combined check for both stages). */ - if ( (endOnInput ? length != RUN_MASK : length <= 8) + if ( (length != RUN_MASK) /* strictly "less than" on input, to re-enter the loop with at least one byte */ - && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) { + && likely((ip < shortiend) & (op <= shortoend)) ) { /* Copy the literals */ - KLZ4_memcpy(op, ip, endOnInput ? 16 : 8); + KLZ4_memcpy(op, ip, 16); op += length; ip += length; /* The second stage: prepare for match copying, decode full info. 
@@ -1978,11 +2164,11 @@ KLZ4_decompress_generic( /* decode literal length */ if (length == RUN_MASK) { - variable_length_error error = ok; - length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error); - if (error == initial_error) { goto _output_error; } - if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ - if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1); + if (addl == rvl_error) { goto _output_error; } + length += addl; + if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ } /* copy literals */ @@ -1991,9 +2177,7 @@ KLZ4_decompress_generic( safe_literal_copy: #endif KLZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); - if ( ((endOnInput) && ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) ) - || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) ) - { + if ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) { /* We've either hit the input parsing restriction or the output parsing restriction. * In the normal scenario, decoding a full block, it must be the last sequence, * otherwise it's an error (invalid input or dimensions). @@ -2003,7 +2187,6 @@ KLZ4_decompress_generic( /* Since we are partial decoding we may be in this block because of the output parsing * restriction, which is not valid since the output buffer is allowed to be undersized. */ - assert(endOnInput); DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end") DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length); DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op)); @@ -2024,21 +2207,17 @@ KLZ4_decompress_generic( length = (size_t)(oend-op); } } else { - /* We must be on the last sequence because of the parsing limitations so check - * that we exactly regenerate the original size (must be exact when !endOnInput). - */ - if ((!endOnInput) && (cpy != oend)) { goto _output_error; } /* We must be on the last sequence (or invalid) because of the parsing limitations * so check that we exactly consume the input and don't overrun the output buffer. */ - if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) { + if ((ip+length != iend) || (cpy > oend)) { DEBUGLOG(6, "should have been last run of literals") DEBUGLOG(6, "ip(%p) + length(%i) = %p != iend (%p)", ip, (int)length, ip+length, iend); DEBUGLOG(6, "or cpy(%p) > oend(%p)", cpy, oend); goto _output_error; } } - memmove(op, ip, length); /* supports overlapping memory regions; only matters for in-place decompression scenarios */ + KLZ4_memmove(op, ip, length); /* supports overlapping memory regions, for in-place decompression scenarios */ ip += length; op += length; /* Necessarily EOF when !partialDecoding. 
@@ -2050,7 +2229,7 @@ KLZ4_decompress_generic( break; } } else { - KLZ4_wildCopy8(op, ip, cpy); /* may overwrite up to WILDCOPYLENGTH beyond cpy */ + KLZ4_wildCopy8(op, ip, cpy); /* can overwrite up to 8 bytes beyond cpy */ ip += length; op = cpy; } @@ -2063,10 +2242,10 @@ KLZ4_decompress_generic( _copy_match: if (length == ML_MASK) { - variable_length_error error = ok; - length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error); - if (error != ok) goto _output_error; - if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { goto _output_error; } + length += addl; + if (unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ } length += MINMATCH; @@ -2076,6 +2255,7 @@ KLZ4_decompress_generic( if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ /* match starting within external dictionary */ if ((dict==usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); if (unlikely(op+length > oend-LASTLITERALS)) { if (partialDecoding) length = MIN(length, (size_t)(oend-op)); else goto _output_error; /* doesn't respect parsing restriction */ @@ -2083,7 +2263,7 @@ KLZ4_decompress_generic( if (length <= (size_t)(lowPrefix-match)) { /* match fits entirely within external dictionary : just copy */ - memmove(op, dictEnd - (lowPrefix-match), length); + KLZ4_memmove(op, dictEnd - (lowPrefix-match), length); op += length; } else { /* match stretches into both external dictionary and current block */ @@ -2154,12 +2334,8 @@ KLZ4_decompress_generic( } /* end of decoding */ - if (endOnInput) { - DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst)); - return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ - } else { - return (int) (((const char*)ip)-src); /* Nb of input bytes read */ - } + DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst)); + return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ /* Overflow error detected */ _output_error: @@ -2174,7 +2350,7 @@ KLZ4_FORCE_O2 int KLZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize) { return KLZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, - endOnInputSize, decode_full_block, noDict, + decode_full_block, noDict, (BYTE*)dest, NULL, 0); } @@ -2183,16 +2359,17 @@ int KLZ4_decompress_safe_partial(const char* src, char* dst, int compressedSize, { dstCapacity = MIN(targetOutputSize, dstCapacity); return KLZ4_decompress_generic(src, dst, compressedSize, dstCapacity, - endOnInputSize, partial_decode, + partial_decode, noDict, (BYTE*)dst, NULL, 0); } KLZ4_FORCE_O2 int KLZ4_decompress_fast(const char* source, char* dest, int originalSize) { - return KLZ4_decompress_generic(source, dest, 0, originalSize, - endOnOutputSize, decode_full_block, withPrefix64k, - (BYTE*)dest - 64 KB, NULL, 0); + DEBUGLOG(5, "KLZ4_decompress_fast"); + return KLZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + 0, NULL, 0); } /*===== Instantiate a few more decoding cases, used more than once. =====*/ @@ -2201,16 +2378,25 @@ KLZ4_FORCE_O2 /* Exported, an obsolete API function. 
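It is kept for binary compatibility: the function assumes the 64 KB immediately preceding `dest` still holds the previously decoded stream and uses it as the prefix dictionary (note the (BYTE*)dest - 64 KB argument below).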
*/ int KLZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) { return KLZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - endOnInputSize, decode_full_block, withPrefix64k, + decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +KLZ4_FORCE_O2 +static int KLZ4_decompress_safe_partial_withPrefix64k(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return KLZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 0); } /* Another obsolete API function, paired with the previous one. */ int KLZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) { - /* KLZ4_decompress_fast doesn't validate match offsets, - * and thus serves well with any prefixed dictionary. */ - return KLZ4_decompress_fast(source, dest, originalSize); + return KLZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + 64 KB, NULL, 0); } KLZ4_FORCE_O2 @@ -2218,7 +2404,17 @@ static int KLZ4_decompress_safe_withSmallPrefix(const char* source, char* dest, size_t prefixSize) { return KLZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - endOnInputSize, decode_full_block, noDict, + decode_full_block, noDict, + (BYTE*)dest-prefixSize, NULL, 0); +} + +KLZ4_FORCE_O2 +static int KLZ4_decompress_safe_partial_withSmallPrefix(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, + size_t prefixSize) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return KLZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, noDict, (BYTE*)dest-prefixSize, NULL, 0); } @@ -2228,7 +2424,18 @@ int KLZ4_decompress_safe_forceExtDict(const char* source, char* dest, const void* dictStart, size_t dictSize) { return KLZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - endOnInputSize, decode_full_block, usingExtDict, + decode_full_block, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +KLZ4_FORCE_O2 +int KLZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest, + int compressedSize, int targetOutputSize, int dstCapacity, + const void* dictStart, size_t dictSize) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return KLZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); } @@ -2236,9 +2443,9 @@ KLZ4_FORCE_O2 static int KLZ4_decompress_fast_extDict(const char* source, char* dest, int originalSize, const void* dictStart, size_t dictSize) { - return KLZ4_decompress_generic(source, dest, 0, originalSize, - endOnOutputSize, decode_full_block, usingExtDict, - (BYTE*)dest, (const BYTE*)dictStart, dictSize); + return KLZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + 0, (const BYTE*)dictStart, dictSize); } /* The "double dictionary" mode, for use with e.g. 
ring buffers: the first part @@ -2250,26 +2457,17 @@ int KLZ4_decompress_safe_doubleDict(const char* source, char* dest, int compress size_t prefixSize, const void* dictStart, size_t dictSize) { return KLZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - endOnInputSize, decode_full_block, usingExtDict, - (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); -} - -KLZ4_FORCE_INLINE -int KLZ4_decompress_fast_doubleDict(const char* source, char* dest, int originalSize, - size_t prefixSize, const void* dictStart, size_t dictSize) -{ - return KLZ4_decompress_generic(source, dest, 0, originalSize, - endOnOutputSize, decode_full_block, usingExtDict, + decode_full_block, usingExtDict, (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); } /*===== streaming decompression functions =====*/ +#if !defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) KLZ4_streamDecode_t* KLZ4_createStreamDecode(void) { - KLZ4_streamDecode_t* lz4s = (KLZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(KLZ4_streamDecode_t)); - KLZ4_STATIC_ASSERT(KLZ4_STREAMDECODESIZE >= sizeof(KLZ4_streamDecode_t_internal)); /* A compilation error here means KLZ4_STREAMDECODESIZE is not large enough */ - return lz4s; + KLZ4_STATIC_ASSERT(sizeof(KLZ4_streamDecode_t) >= sizeof(KLZ4_streamDecode_t_internal)); + return (KLZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(KLZ4_streamDecode_t)); } int KLZ4_freeStreamDecode (KLZ4_streamDecode_t* KLZ4_stream) @@ -2278,6 +2476,7 @@ int KLZ4_freeStreamDecode (KLZ4_streamDecode_t* KLZ4_stream) FREEMEM(KLZ4_stream); return 0; } +#endif /*! KLZ4_setStreamDecode() : * Use this function to instruct where to find the dictionary. @@ -2288,8 +2487,13 @@ int KLZ4_freeStreamDecode (KLZ4_streamDecode_t* KLZ4_stream) int KLZ4_setStreamDecode (KLZ4_streamDecode_t* KLZ4_streamDecode, const char* dictionary, int dictSize) { KLZ4_streamDecode_t_internal* lz4sd = &KLZ4_streamDecode->internal_donotuse; - lz4sd->prefixSize = (size_t) dictSize; - lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + lz4sd->prefixSize = (size_t)dictSize; + if (dictSize) { + assert(dictionary != NULL); + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + } else { + lz4sd->prefixEnd = (const BYTE*) dictionary; + } lz4sd->externalDict = NULL; lz4sd->extDictSize = 0; return 1; @@ -2361,29 +2565,35 @@ int KLZ4_decompress_safe_continue (KLZ4_streamDecode_t* KLZ4_streamDecode, const return result; } -KLZ4_FORCE_O2 -int KLZ4_decompress_fast_continue (KLZ4_streamDecode_t* KLZ4_streamDecode, const char* source, char* dest, int originalSize) +KLZ4_FORCE_O2 int +KLZ4_decompress_fast_continue (KLZ4_streamDecode_t* KLZ4_streamDecode, + const char* source, char* dest, int originalSize) { - KLZ4_streamDecode_t_internal* lz4sd = &KLZ4_streamDecode->internal_donotuse; + KLZ4_streamDecode_t_internal* const lz4sd = + (assert(KLZ4_streamDecode!=NULL), &KLZ4_streamDecode->internal_donotuse); int result; + + DEBUGLOG(5, "KLZ4_decompress_fast_continue (toDecodeSize=%i)", originalSize); assert(originalSize >= 0); if (lz4sd->prefixSize == 0) { + DEBUGLOG(5, "first invocation : no prefix nor extDict"); assert(lz4sd->extDictSize == 0); result = KLZ4_decompress_fast(source, dest, originalSize); if (result <= 0) return result; lz4sd->prefixSize = (size_t)originalSize; lz4sd->prefixEnd = (BYTE*)dest + originalSize; } else if (lz4sd->prefixEnd == (BYTE*)dest) { - if (lz4sd->prefixSize >= 64 KB - 1 || lz4sd->extDictSize == 0) - result = KLZ4_decompress_fast(source, dest, originalSize); - else - result = 
KLZ4_decompress_fast_doubleDict(source, dest, originalSize, - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + DEBUGLOG(5, "continue using existing prefix"); + result = KLZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; lz4sd->prefixSize += (size_t)originalSize; lz4sd->prefixEnd += originalSize; } else { + DEBUGLOG(5, "prefix becomes extDict"); lz4sd->extDictSize = lz4sd->prefixSize; lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; result = KLZ4_decompress_fast_extDict(source, dest, originalSize, @@ -2419,10 +2629,27 @@ int KLZ4_decompress_safe_usingDict(const char* source, char* dest, int compresse return KLZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, dictStart, (size_t)dictSize); } +int KLZ4_decompress_safe_partial_usingDict(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return KLZ4_decompress_safe_partial(source, dest, compressedSize, targetOutputSize, dstCapacity); + if (dictStart+dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return KLZ4_decompress_safe_partial_withPrefix64k(source, dest, compressedSize, targetOutputSize, dstCapacity); + } + assert(dictSize >= 0); + return KLZ4_decompress_safe_partial_withSmallPrefix(source, dest, compressedSize, targetOutputSize, dstCapacity, (size_t)dictSize); + } + assert(dictSize >= 0); + return KLZ4_decompress_safe_partial_forceExtDict(source, dest, compressedSize, targetOutputSize, dstCapacity, dictStart, (size_t)dictSize); +} + int KLZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) { if (dictSize==0 || dictStart+dictSize == dest) - return KLZ4_decompress_fast(source, dest, originalSize); + return KLZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, + (size_t)dictSize, NULL, 0); assert(dictSize >= 0); return KLZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, (size_t)dictSize); } @@ -2474,7 +2701,7 @@ int KLZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize /* Obsolete Streaming functions */ -int KLZ4_sizeofStreamState(void) { return KLZ4_STREAMSIZE; } +int KLZ4_sizeofStreamState(void) { return sizeof(KLZ4_stream_t); } int KLZ4_resetStreamState(void* state, char* inputBuffer) { @@ -2483,11 +2710,13 @@ int KLZ4_resetStreamState(void* state, char* inputBuffer) return 0; } +#if !defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) void* KLZ4_create (char* inputBuffer) { (void)inputBuffer; return KLZ4_createStream(); } +#endif char* KLZ4_slideInputBuffer (void* state) { diff --git a/src/third_party/librdkafka/dist/src/lz4.h b/src/third_party/librdkafka/dist/src/lz4.h index f937625c7eb..b70f241c8c4 100644 --- a/src/third_party/librdkafka/dist/src/lz4.h +++ b/src/third_party/librdkafka/dist/src/lz4.h @@ -1,7 +1,7 @@ /* * KLZ4 - Fast LZ compression algorithm * Header File - * Copyright (C) 2011-present, Yann Collet. + * Copyright (C) 2011-2020, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) @@ -97,36 +97,77 @@ extern "C" { # define KLZ4LIB_API KLZ4LIB_VISIBILITY #endif +/*! KLZ4_FREESTANDING : + * When this macro is set to 1, it enables "freestanding mode" that is + * suitable for typical freestanding environment which doesn't support + * standard C library. 
+ * + * - KLZ4_FREESTANDING is a compile-time switch. + * - It requires the following macros to be defined: + * KLZ4_memcpy, KLZ4_memmove, KLZ4_memset. + * - It only enables KLZ4/HC functions which don't use heap. + * All KLZ4F_* functions are not supported. + * - See tests/freestanding.c to check its basic setup. + */ +#if defined(KLZ4_FREESTANDING) && (KLZ4_FREESTANDING == 1) +# define KLZ4_HEAPMODE 0 +# define KLZ4HC_HEAPMODE 0 +# define KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION 1 +# if !defined(KLZ4_memcpy) +# error "KLZ4_FREESTANDING requires macro 'KLZ4_memcpy'." +# endif +# if !defined(KLZ4_memset) +# error "KLZ4_FREESTANDING requires macro 'KLZ4_memset'." +# endif +# if !defined(KLZ4_memmove) +# error "KLZ4_FREESTANDING requires macro 'KLZ4_memmove'." +# endif +#elif ! defined(KLZ4_FREESTANDING) +# define KLZ4_FREESTANDING 0 +#endif + + /*------ Version ------*/ #define KLZ4_VERSION_MAJOR 1 /* for breaking interface changes */ #define KLZ4_VERSION_MINOR 9 /* for new (non-breaking) interface capabilities */ -#define KLZ4_VERSION_RELEASE 3 /* for tweaks, bug-fixes, or development */ +#define KLZ4_VERSION_RELEASE 4 /* for tweaks, bug-fixes, or development */ #define KLZ4_VERSION_NUMBER (KLZ4_VERSION_MAJOR *100*100 + KLZ4_VERSION_MINOR *100 + KLZ4_VERSION_RELEASE) #define KLZ4_LIB_VERSION KLZ4_VERSION_MAJOR.KLZ4_VERSION_MINOR.KLZ4_VERSION_RELEASE #define KLZ4_QUOTE(str) #str #define KLZ4_EXPAND_AND_QUOTE(str) KLZ4_QUOTE(str) -#define KLZ4_VERSION_STRING KLZ4_EXPAND_AND_QUOTE(KLZ4_LIB_VERSION) +#define KLZ4_VERSION_STRING KLZ4_EXPAND_AND_QUOTE(KLZ4_LIB_VERSION) /* requires v1.7.3+ */ -KLZ4LIB_API int KLZ4_versionNumber (void); /**< library version number; useful to check dll version */ -KLZ4LIB_API const char* KLZ4_versionString (void); /**< library version string; useful to check dll version */ +KLZ4LIB_API int KLZ4_versionNumber (void); /**< library version number; useful to check dll version; requires v1.3.0+ */ +KLZ4LIB_API const char* KLZ4_versionString (void); /**< library version string; useful to check dll version; requires v1.7.5+ */ /*-************************************ * Tuning parameter **************************************/ +#define KLZ4_MEMORY_USAGE_MIN 10 +#define KLZ4_MEMORY_USAGE_DEFAULT 14 +#define KLZ4_MEMORY_USAGE_MAX 20 + /*! * KLZ4_MEMORY_USAGE : - * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) - * Increasing memory usage improves compression ratio. - * Reduced memory usage may improve speed, thanks to better cache locality. + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; ) + * Increasing memory usage improves compression ratio, at the cost of speed. + * Reduced memory usage may improve speed at the cost of ratio, thanks to better cache locality. * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */ #ifndef KLZ4_MEMORY_USAGE -# define KLZ4_MEMORY_USAGE 14 +# define KLZ4_MEMORY_USAGE KLZ4_MEMORY_USAGE_DEFAULT #endif +#if (KLZ4_MEMORY_USAGE < KLZ4_MEMORY_USAGE_MIN) +# error "KLZ4_MEMORY_USAGE is too small !" +#endif + +#if (KLZ4_MEMORY_USAGE > KLZ4_MEMORY_USAGE_MAX) +# error "KLZ4_MEMORY_USAGE is too large !" 
+#endif /*-************************************ * Simple Functions @@ -270,8 +311,25 @@ KLZ4LIB_API int KLZ4_decompress_safe_partial (const char* src, char* dst, int sr ***********************************************/ typedef union KLZ4_stream_u KLZ4_stream_t; /* incomplete type (defined later) */ +/** + Note about RC_INVOKED + + - RC_INVOKED is predefined symbol of rc.exe (the resource compiler which is part of MSVC/Visual Studio). + https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros + + - Since rc.exe is a legacy compiler, it truncates long symbol (> 30 chars) + and reports warning "RC4011: identifier truncated". + + - To eliminate the warning, we surround long preprocessor symbol with + "#if !defined(RC_INVOKED) ... #endif" block that means + "skip this block when rc.exe is trying to read it". +*/ +#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */ +#if !defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) KLZ4LIB_API KLZ4_stream_t* KLZ4_createStream(void); KLZ4LIB_API int KLZ4_freeStream (KLZ4_stream_t* streamPtr); +#endif /* !defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ +#endif /*! KLZ4_resetStream_fast() : v1.9.0+ * Use this to prepare an KLZ4_stream_t for a new chain of dependent blocks @@ -355,8 +413,12 @@ typedef union KLZ4_streamDecode_u KLZ4_streamDecode_t; /* tracking context */ * creation / destruction of streaming decompression tracking context. * A tracking context can be re-used multiple times. */ +#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */ +#if !defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) KLZ4LIB_API KLZ4_streamDecode_t* KLZ4_createStreamDecode(void); KLZ4LIB_API int KLZ4_freeStreamDecode (KLZ4_streamDecode_t* KLZ4_stream); +#endif /* !defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ +#endif /*! KLZ4_setStreamDecode() : * An KLZ4_streamDecode_t context can be allocated once and re-used multiple times. @@ -406,7 +468,10 @@ KLZ4LIB_API int KLZ4_decoderRingBufferSize(int maxBlockSize); * save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression, * then indicate where this data is saved using KLZ4_setStreamDecode(), before decompressing next block. */ -KLZ4LIB_API int KLZ4_decompress_safe_continue (KLZ4_streamDecode_t* KLZ4_streamDecode, const char* src, char* dst, int srcSize, int dstCapacity); +KLZ4LIB_API int +KLZ4_decompress_safe_continue (KLZ4_streamDecode_t* KLZ4_streamDecode, + const char* src, char* dst, + int srcSize, int dstCapacity); /*! KLZ4_decompress_*_usingDict() : @@ -417,7 +482,16 @@ KLZ4LIB_API int KLZ4_decompress_safe_continue (KLZ4_streamDecode_t* KLZ4_streamD * Performance tip : Decompression speed can be substantially increased * when dst == dictStart + dictSize. 
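To make the performance tip above concrete, here is a hedged sketch (buffer layout and helper name are illustrative, not from the patch): copying the dictionary directly in front of the destination buffer satisfies `dst == dictStart + dictSize`, so the library can take the faster prefix path instead of the external-dictionary path.

```c
#include <string.h>
#include "lz4.h"

/* workspace must hold dictSize bytes of dictionary plus the decompressed
 * output; callers are assumed to have checked workCapacity > dictSize. */
int decode_with_prefix_dict(const char* src, int srcSize,
                            const char* dict, int dictSize,
                            char* workspace, int workCapacity)
{
    char* const dst = workspace + dictSize;     /* dst == dictStart + dictSize */
    memcpy(workspace, dict, (size_t)dictSize);  /* dictionary laid out as prefix */
    return KLZ4_decompress_safe_usingDict(src, dst, srcSize,
                                          workCapacity - dictSize,
                                          workspace, dictSize);
}
```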
*/ -KLZ4LIB_API int KLZ4_decompress_safe_usingDict (const char* src, char* dst, int srcSize, int dstCapcity, const char* dictStart, int dictSize); +KLZ4LIB_API int +KLZ4_decompress_safe_usingDict(const char* src, char* dst, + int srcSize, int dstCapacity, + const char* dictStart, int dictSize); + +KLZ4LIB_API int +KLZ4_decompress_safe_partial_usingDict(const char* src, char* dst, + int compressedSize, + int targetOutputSize, int maxOutputSize, + const char* dictStart, int dictSize); #endif /* KLZ4_H_2983827168210 */ @@ -496,13 +570,15 @@ KLZ4LIB_STATIC_API int KLZ4_compress_fast_extState_fastReset (void* state, const * stream (and source buffer) must remain in-place / accessible / unchanged * through the completion of the first compression call on the stream. */ -KLZ4LIB_STATIC_API void KLZ4_attach_dictionary(KLZ4_stream_t* workingStream, const KLZ4_stream_t* dictionaryStream); +KLZ4LIB_STATIC_API void +KLZ4_attach_dictionary(KLZ4_stream_t* workingStream, + const KLZ4_stream_t* dictionaryStream); /*! In-place compression and decompression * * It's possible to have input and output sharing the same buffer, - * for highly contrained memory environments. + * for highly constrained memory environments. * In both cases, it requires input to lay at the end of the buffer, * and decompression to start at beginning of the buffer. * Buffer size must feature some margin, hence be larger than final size. @@ -592,38 +668,26 @@ KLZ4LIB_STATIC_API void KLZ4_attach_dictionary(KLZ4_stream_t* workingStream, con typedef unsigned int KLZ4_u32; #endif +/*! KLZ4_stream_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an KLZ4_stream_t object. +**/ + typedef struct KLZ4_stream_t_internal KLZ4_stream_t_internal; struct KLZ4_stream_t_internal { KLZ4_u32 hashTable[KLZ4_HASH_SIZE_U32]; - KLZ4_u32 currentOffset; - KLZ4_u32 tableType; const KLZ4_byte* dictionary; const KLZ4_stream_t_internal* dictCtx; + KLZ4_u32 currentOffset; + KLZ4_u32 tableType; KLZ4_u32 dictSize; + /* Implicit padding to ensure structure is aligned */ }; -typedef struct { - const KLZ4_byte* externalDict; - size_t extDictSize; - const KLZ4_byte* prefixEnd; - size_t prefixSize; -} KLZ4_streamDecode_t_internal; - - -/*! KLZ4_stream_t : - * Do not use below internal definitions directly ! - * Declare or allocate an KLZ4_stream_t instead. - * KLZ4_stream_t can also be created using KLZ4_createStream(), which is recommended. - * The structure definition can be convenient for static allocation - * (on stack, or as part of larger structure). - * Init this structure with KLZ4_initStream() before first use. - * note : only use this definition in association with static linking ! - * this definition is not API/ABI safe, and may change in future versions. - */ -#define KLZ4_STREAMSIZE 16416 /* static size, for inter-version compatibility */ -#define KLZ4_STREAMSIZE_VOIDP (KLZ4_STREAMSIZE / sizeof(void*)) +#define KLZ4_STREAM_MINSIZE ((1UL << KLZ4_MEMORY_USAGE) + 32) /* static size, for inter-version compatibility */ union KLZ4_stream_u { - void* table[KLZ4_STREAMSIZE_VOIDP]; + char minStateSize[KLZ4_STREAM_MINSIZE]; KLZ4_stream_t_internal internal_donotuse; }; /* previously typedef'd to KLZ4_stream_t */ @@ -641,21 +705,25 @@ union KLZ4_stream_u { * In which case, the function will @return NULL. * Note2: An KLZ4_stream_t structure guarantees correct alignment and size. 
* Note3: Before v1.9.0, use KLZ4_resetStream() instead - */ +**/ KLZ4LIB_API KLZ4_stream_t* KLZ4_initStream (void* buffer, size_t size); /*! KLZ4_streamDecode_t : - * information structure to track an KLZ4 stream during decompression. - * init this structure using KLZ4_setStreamDecode() before first use. - * note : only use in association with static linking ! - * this definition is not API/ABI safe, - * and may change in a future version ! - */ -#define KLZ4_STREAMDECODESIZE_U64 (4 + ((sizeof(void*)==16) ? 2 : 0) /*AS-400*/ ) -#define KLZ4_STREAMDECODESIZE (KLZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an KLZ4_streamDecode_t object. +**/ +typedef struct { + const KLZ4_byte* externalDict; + const KLZ4_byte* prefixEnd; + size_t extDictSize; + size_t prefixSize; +} KLZ4_streamDecode_t_internal; + +#define KLZ4_STREAMDECODE_MINSIZE 32 union KLZ4_streamDecode_u { - unsigned long long table[KLZ4_STREAMDECODESIZE_U64]; + char minStateSize[KLZ4_STREAMDECODE_MINSIZE]; KLZ4_streamDecode_t_internal internal_donotuse; } ; /* previously typedef'd to KLZ4_streamDecode_t */ diff --git a/src/third_party/librdkafka/dist/src/lz4frame.c b/src/third_party/librdkafka/dist/src/lz4frame.c index 499962740c6..1353a052a41 100644 --- a/src/third_party/librdkafka/dist/src/lz4frame.c +++ b/src/third_party/librdkafka/dist/src/lz4frame.c @@ -45,7 +45,7 @@ * Compiler Options **************************************/ #ifdef _MSC_VER /* Visual Studio */ -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ #endif @@ -62,28 +62,6 @@ #endif -/*-************************************ -* Memory routines -**************************************/ -/* - * User may redirect invocations of - * malloc(), calloc() and free() - * towards another library or solution of their choice - * by modifying below section. - */ -#ifndef KLZ4_SRC_INCLUDED /* avoid redefinition when sources are coalesced */ -#include "rd.h" /* rd_malloc, rd_calloc, rd_free */ -# define ALLOC(s) rd_malloc(s) -# define ALLOC_AND_ZERO(s) rd_calloc(1,(s)) -# define FREEMEM(p) rd_free(p) -#endif - -#include <string.h> /* memset, memcpy, memmove */ -#ifndef KLZ4_SRC_INCLUDED /* avoid redefinition when sources are coalesced */ -# define MEM_INIT(p,v,s) memset((p),(v),(s)) -#endif - - /*-************************************ * Library declarations **************************************/ @@ -97,6 +75,66 @@ #include "rdxxhash.h" +/*-************************************ +* Memory routines +**************************************/ +/* + * User may redirect invocations of + * malloc(), calloc() and free() + * towards another library or solution of their choice + * by modifying below section.
+**/ + +#include <string.h> /* memset, memcpy, memmove */ +#ifndef KLZ4_SRC_INCLUDED /* avoid redefinition when sources are coalesced */ +# define MEM_INIT(p,v,s) memset((p),(v),(s)) +#endif + +#ifndef KLZ4_SRC_INCLUDED /* avoid redefinition when sources are coalesced */ +# include <stdlib.h> /* malloc, calloc, free */ +# define ALLOC(s) malloc(s) +# define ALLOC_AND_ZERO(s) calloc(1,(s)) +# define FREEMEM(p) free(p) +#endif + +static void* KLZ4F_calloc(size_t s, KLZ4F_CustomMem cmem) +{ + /* custom calloc defined : use it */ + if (cmem.customCalloc != NULL) { + return cmem.customCalloc(cmem.opaqueState, s); + } + /* nothing defined : use default <stdlib.h>'s calloc() */ + if (cmem.customAlloc == NULL) { + return ALLOC_AND_ZERO(s); + } + /* only custom alloc defined : use it, and combine it with memset() */ + { void* const p = cmem.customAlloc(cmem.opaqueState, s); + if (p != NULL) MEM_INIT(p, 0, s); + return p; +} } + +static void* KLZ4F_malloc(size_t s, KLZ4F_CustomMem cmem) +{ + /* custom malloc defined : use it */ + if (cmem.customAlloc != NULL) { + return cmem.customAlloc(cmem.opaqueState, s); + } + /* nothing defined : use default <stdlib.h>'s malloc() */ + return ALLOC(s); +} + +static void KLZ4F_free(void* p, KLZ4F_CustomMem cmem) +{ + /* custom malloc defined : use it */ + if (cmem.customFree != NULL) { + cmem.customFree(cmem.opaqueState, p); + return; + } + /* nothing defined : use default <stdlib.h>'s free() */ + FREEMEM(p); +} + + /*-************************************ * Debug **************************************/ @@ -143,7 +181,7 @@ static int g_debuglog_enable = 1; #endif -/* unoptimized version; solves endianess & alignment issues */ +/* unoptimized version; solves endianness & alignment issues */ static U32 KLZ4F_readLE32 (const void* src) { const BYTE* const srcPtr = (const BYTE*)src; @@ -206,8 +244,6 @@ static void KLZ4F_writeLE64 (void* dst, U64 value64) #define _4BITS 0x0F #define _8BITS 0xFF -#define KLZ4F_MAGIC_SKIPPABLE_START 0x184D2A50U -#define KLZ4F_MAGICNUMBER 0x184D2204U #define KLZ4F_BLOCKUNCOMPRESSED_FLAG 0x80000000U #define KLZ4F_BLOCKSIZEID_DEFAULT KLZ4F_max64KB @@ -220,22 +256,27 @@ static const size_t BFSize = KLZ4F_BLOCK_CHECKSUM_SIZE; /* block footer : check /*-************************************ * Structures and local types **************************************/ + +typedef enum { KLZ4B_COMPRESSED, KLZ4B_UNCOMPRESSED} KLZ4F_blockCompression_t; + typedef struct KLZ4F_cctx_s { + KLZ4F_CustomMem cmem; KLZ4F_preferences_t prefs; U32 version; U32 cStage; const KLZ4F_CDict* cdict; size_t maxBlockSize; size_t maxBufferSize; - BYTE* tmpBuff; - BYTE* tmpIn; - size_t tmpInSize; + BYTE* tmpBuff; /* internal buffer, for streaming */ + BYTE* tmpIn; /* starting position of data compress within internal buffer (>= tmpBuff) */ + size_t tmpInSize; /* amount of data to compress after tmpIn */ U64 totalInSize; KXXH32_state_t xxh; void* lz4CtxPtr; U16 lz4CtxAlloc; /* sized for: 0 = none, 1 = lz4 ctx, 2 = lz4hc ctx */ U16 lz4CtxState; /* in use as: 0 = none, 1 = lz4 ctx, 2 = lz4hc ctx */ + KLZ4F_blockCompression_t blockCompression; } KLZ4F_cctx_t; @@ -264,27 +305,33 @@ KLZ4F_errorCodes KLZ4F_getErrorCode(size_t functionResult) return (KLZ4F_errorCodes)(-(ptrdiff_t)functionResult); } -static KLZ4F_errorCode_t err0r(KLZ4F_errorCodes code) +static KLZ4F_errorCode_t KLZ4F_returnErrorCode(KLZ4F_errorCodes code) { /* A compilation error here means sizeof(ptrdiff_t) is not large enough */ KLZ4F_STATIC_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t)); return (KLZ4F_errorCode_t)-(ptrdiff_t)code; } +#define RETURN_ERROR(e) return
KLZ4F_returnErrorCode(KLZ4F_ERROR_ ## e) + +#define RETURN_ERROR_IF(c,e) if (c) RETURN_ERROR(e) + +#define FORWARD_IF_ERROR(r) if (KLZ4F_isError(r)) return (r) + unsigned KLZ4F_getVersion(void) { return KLZ4F_VERSION; } int KLZ4F_compressionLevel_max(void) { return KLZ4HC_CLEVEL_MAX; } -size_t KLZ4F_getBlockSize(unsigned blockSizeID) +size_t KLZ4F_getBlockSize(KLZ4F_blockSizeID_t blockSizeID) { static const size_t blockSizes[4] = { 64 KB, 256 KB, 1 MB, 4 MB }; if (blockSizeID == 0) blockSizeID = KLZ4F_BLOCKSIZEID_DEFAULT; if (blockSizeID < KLZ4F_max64KB || blockSizeID > KLZ4F_max4MB) - return err0r(KLZ4F_ERROR_maxBlockSize_invalid); - blockSizeID -= KLZ4F_max64KB; - return blockSizes[blockSizeID]; -} + RETURN_ERROR(maxBlockSize_invalid); + { int const blockSizeIdx = (int)blockSizeID - (int)KLZ4F_max64KB; + return blockSizes[blockSizeIdx]; +} } /*-************************************ * Private functions @@ -397,21 +444,20 @@ size_t KLZ4F_compressFrame_usingCDict(KLZ4F_cctx* cctx, MEM_INIT(&options, 0, sizeof(options)); options.stableSrc = 1; - if (dstCapacity < KLZ4F_compressFrameBound(srcSize, &prefs)) /* condition to guarantee success */ - return err0r(KLZ4F_ERROR_dstMaxSize_tooSmall); + RETURN_ERROR_IF(dstCapacity < KLZ4F_compressFrameBound(srcSize, &prefs), dstMaxSize_tooSmall); { size_t const headerSize = KLZ4F_compressBegin_usingCDict(cctx, dstBuffer, dstCapacity, cdict, &prefs); /* write header */ - if (KLZ4F_isError(headerSize)) return headerSize; + FORWARD_IF_ERROR(headerSize); dstPtr += headerSize; /* header size */ } assert(dstEnd >= dstPtr); { size_t const cSize = KLZ4F_compressUpdate(cctx, dstPtr, (size_t)(dstEnd-dstPtr), srcBuffer, srcSize, &options); - if (KLZ4F_isError(cSize)) return cSize; + FORWARD_IF_ERROR(cSize); dstPtr += cSize; } assert(dstEnd >= dstPtr); { size_t const tailSize = KLZ4F_compressEnd(cctx, dstPtr, (size_t)(dstEnd-dstPtr), &options); /* flush last block, and generate suffix */ - if (KLZ4F_isError(tailSize)) return tailSize; + FORWARD_IF_ERROR(tailSize); dstPtr += tailSize; } assert(dstEnd >= dstStart); @@ -432,27 +478,26 @@ size_t KLZ4F_compressFrame(void* dstBuffer, size_t dstCapacity, { size_t result; #if (KLZ4F_HEAPMODE) - KLZ4F_cctx_t *cctxPtr; + KLZ4F_cctx_t* cctxPtr; result = KLZ4F_createCompressionContext(&cctxPtr, KLZ4F_VERSION); - if (KLZ4F_isError(result)) return result; + FORWARD_IF_ERROR(result); #else KLZ4F_cctx_t cctx; KLZ4_stream_t lz4ctx; - KLZ4F_cctx_t *cctxPtr = &cctx; + KLZ4F_cctx_t* const cctxPtr = &cctx; - DEBUGLOG(4, "KLZ4F_compressFrame"); MEM_INIT(&cctx, 0, sizeof(cctx)); cctx.version = KLZ4F_VERSION; cctx.maxBufferSize = 5 MB; /* mess with real buffer size to prevent dynamic allocation; works only because autoflush==1 & stableSrc==1 */ - if (preferencesPtr == NULL || - preferencesPtr->compressionLevel < KLZ4HC_CLEVEL_MIN) - { + if ( preferencesPtr == NULL + || preferencesPtr->compressionLevel < KLZ4HC_CLEVEL_MIN ) { KLZ4_initStream(&lz4ctx, sizeof(lz4ctx)); cctxPtr->lz4CtxPtr = &lz4ctx; cctxPtr->lz4CtxAlloc = 1; cctxPtr->lz4CtxState = 1; } #endif + DEBUGLOG(4, "KLZ4F_compressFrame"); result = KLZ4F_compressFrame_usingCDict(cctxPtr, dstBuffer, dstCapacity, srcBuffer, srcSize, @@ -461,10 +506,9 @@ size_t KLZ4F_compressFrame(void* dstBuffer, size_t dstCapacity, #if (KLZ4F_HEAPMODE) KLZ4F_freeCompressionContext(cctxPtr); #else - if (preferencesPtr != NULL && - preferencesPtr->compressionLevel >= KLZ4HC_CLEVEL_MIN) - { - FREEMEM(cctxPtr->lz4CtxPtr); + if ( preferencesPtr != NULL + && preferencesPtr->compressionLevel >= 
KLZ4HC_CLEVEL_MIN ) { + KLZ4F_free(cctxPtr->lz4CtxPtr, cctxPtr->cmem); } #endif return result; @@ -476,30 +520,31 @@ size_t KLZ4F_compressFrame(void* dstBuffer, size_t dstCapacity, *****************************************************/ struct KLZ4F_CDict_s { + KLZ4F_CustomMem cmem; void* dictContent; KLZ4_stream_t* fastCtx; KLZ4_streamHC_t* HCCtx; }; /* typedef'd to KLZ4F_CDict within lz4frame_static.h */ -/*! KLZ4F_createCDict() : - * When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once. - * KLZ4F_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay. - * KLZ4F_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. - * `dictBuffer` can be released after KLZ4F_CDict creation, since its content is copied within CDict - * @return : digested dictionary for compression, or NULL if failed */ -KLZ4F_CDict* KLZ4F_createCDict(const void* dictBuffer, size_t dictSize) +KLZ4F_CDict* +KLZ4F_createCDict_advanced(KLZ4F_CustomMem cmem, const void* dictBuffer, size_t dictSize) { const char* dictStart = (const char*)dictBuffer; - KLZ4F_CDict* cdict = (KLZ4F_CDict*) ALLOC(sizeof(*cdict)); - DEBUGLOG(4, "KLZ4F_createCDict"); + KLZ4F_CDict* const cdict = (KLZ4F_CDict*)KLZ4F_malloc(sizeof(*cdict), cmem); + DEBUGLOG(4, "KLZ4F_createCDict_advanced"); if (!cdict) return NULL; + cdict->cmem = cmem; if (dictSize > 64 KB) { dictStart += dictSize - 64 KB; dictSize = 64 KB; } - cdict->dictContent = ALLOC(dictSize); - cdict->fastCtx = KLZ4_createStream(); - cdict->HCCtx = KLZ4_createStreamHC(); + cdict->dictContent = KLZ4F_malloc(dictSize, cmem); + cdict->fastCtx = (KLZ4_stream_t*)KLZ4F_malloc(sizeof(KLZ4_stream_t), cmem); + if (cdict->fastCtx) + KLZ4_initStream(cdict->fastCtx, sizeof(KLZ4_stream_t)); + cdict->HCCtx = (KLZ4_streamHC_t*)KLZ4F_malloc(sizeof(KLZ4_streamHC_t), cmem); + if (cdict->HCCtx) + KLZ4_initStream(cdict->HCCtx, sizeof(KLZ4_streamHC_t)); if (!cdict->dictContent || !cdict->fastCtx || !cdict->HCCtx) { KLZ4F_freeCDict(cdict); return NULL; @@ -511,13 +556,25 @@ KLZ4F_CDict* KLZ4F_createCDict(const void* dictBuffer, size_t dictSize) return cdict; } +/*! KLZ4F_createCDict() : + * When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once. + * KLZ4F_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay. + * KLZ4F_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. 
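Worth illustrating, since every allocation in this file now flows through KLZ4F_malloc/KLZ4F_calloc/KLZ4F_free: callers can route memory through their own allocator by passing a KLZ4F_CustomMem to the new `*_advanced` constructors. A hedged sketch based only on the call shapes visible in this patch — customAlloc(opaque, size), customCalloc(opaque, size), customFree(opaque, ptr); the adapter names are made up, and designated initializers avoid assuming field order:

```c
#include <stdlib.h>
#include "lz4frame_static.h"  /* assumed location of KLZ4F_CustomMem and
                               * the *_advanced constructors in this tree */

static void* my_alloc (void* opaque, size_t size) { (void)opaque; return malloc(size); }
static void* my_calloc(void* opaque, size_t size) { (void)opaque; return calloc(1, size); }
static void  my_free  (void* opaque, void* ptr)   { (void)opaque; free(ptr); }

static const KLZ4F_CustomMem my_cmem = {
    .customAlloc  = my_alloc,
    .customCalloc = my_calloc,  /* optional: KLZ4F_calloc falls back to alloc+memset */
    .customFree   = my_free,
    .opaqueState  = NULL,
};

/* e.g. KLZ4F_cctx* cctx =
 *          KLZ4F_createCompressionContext_advanced(my_cmem, KLZ4F_VERSION); */
```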
+ * @dictBuffer can be released after KLZ4F_CDict creation, since its content is copied within CDict + * @return : digested dictionary for compression, or NULL if failed */ +KLZ4F_CDict* KLZ4F_createCDict(const void* dictBuffer, size_t dictSize) +{ + DEBUGLOG(4, "KLZ4F_createCDict"); + return KLZ4F_createCDict_advanced(KLZ4F_defaultCMem, dictBuffer, dictSize); +} + void KLZ4F_freeCDict(KLZ4F_CDict* cdict) { if (cdict==NULL) return; /* support free on NULL */ - FREEMEM(cdict->dictContent); - KLZ4_freeStream(cdict->fastCtx); - KLZ4_freeStreamHC(cdict->HCCtx); - FREEMEM(cdict); + KLZ4F_free(cdict->dictContent, cdict->cmem); + KLZ4F_free(cdict->fastCtx, cdict->cmem); + KLZ4F_free(cdict->HCCtx, cdict->cmem); + KLZ4F_free(cdict, cdict->cmem); } @@ -525,6 +582,20 @@ void KLZ4F_freeCDict(KLZ4F_CDict* cdict) * Advanced compression functions ***********************************/ +KLZ4F_cctx* +KLZ4F_createCompressionContext_advanced(KLZ4F_CustomMem customMem, unsigned version) +{ + KLZ4F_cctx* const cctxPtr = + (KLZ4F_cctx*)KLZ4F_calloc(sizeof(KLZ4F_cctx), customMem); + if (cctxPtr==NULL) return NULL; + + cctxPtr->cmem = customMem; + cctxPtr->version = version; + cctxPtr->cStage = 0; /* Uninitialized. Next stage : init cctx */ + + return cctxPtr; +} + /*! KLZ4F_createCompressionContext() : * The first thing to do is to create a compressionContext object, which will be used in all compression operations. * This is achieved using KLZ4F_createCompressionContext(), which takes as argument a version and an KLZ4F_preferences_t structure. @@ -532,17 +603,16 @@ void KLZ4F_freeCDict(KLZ4F_CDict* cdict) * The function will provide a pointer to an allocated KLZ4F_compressionContext_t object. * If the result KLZ4F_errorCode_t is not OK_NoError, there was an error during context creation. 
* Object can release its memory using KLZ4F_freeCompressionContext(); - */ -KLZ4F_errorCode_t KLZ4F_createCompressionContext(KLZ4F_cctx** KLZ4F_compressionContextPtr, unsigned version) +**/ +KLZ4F_errorCode_t +KLZ4F_createCompressionContext(KLZ4F_cctx** KLZ4F_compressionContextPtr, unsigned version) { - KLZ4F_cctx_t* const cctxPtr = (KLZ4F_cctx_t*)ALLOC_AND_ZERO(sizeof(KLZ4F_cctx_t)); - if (cctxPtr==NULL) return err0r(KLZ4F_ERROR_allocation_failed); - - cctxPtr->version = version; - cctxPtr->cStage = 0; /* Next stage : init stream */ - - *KLZ4F_compressionContextPtr = cctxPtr; + assert(KLZ4F_compressionContextPtr != NULL); /* considered a violation of narrow contract */ + /* in case it nonetheless happen in production */ + RETURN_ERROR_IF(KLZ4F_compressionContextPtr == NULL, parameter_null); + *KLZ4F_compressionContextPtr = KLZ4F_createCompressionContext_advanced(KLZ4F_defaultCMem, version); + RETURN_ERROR_IF(*KLZ4F_compressionContextPtr==NULL, allocation_failed); return KLZ4F_OK_NoError; } @@ -550,11 +620,10 @@ KLZ4F_errorCode_t KLZ4F_createCompressionContext(KLZ4F_cctx** KLZ4F_compressionC KLZ4F_errorCode_t KLZ4F_freeCompressionContext(KLZ4F_cctx* cctxPtr) { if (cctxPtr != NULL) { /* support free on NULL */ - FREEMEM(cctxPtr->lz4CtxPtr); /* note: KLZ4_streamHC_t and KLZ4_stream_t are simple POD types */ - FREEMEM(cctxPtr->tmpBuff); - FREEMEM(cctxPtr); + KLZ4F_free(cctxPtr->lz4CtxPtr, cctxPtr->cmem); /* note: KLZ4_streamHC_t and KLZ4_stream_t are simple POD types */ + KLZ4F_free(cctxPtr->tmpBuff, cctxPtr->cmem); + KLZ4F_free(cctxPtr, cctxPtr->cmem); } - return KLZ4F_OK_NoError; } @@ -588,11 +657,21 @@ static void KLZ4F_initStream(void* ctx, } } +static int ctxTypeID_to_size(int ctxTypeID) { + switch(ctxTypeID) { + case 1: + return KLZ4_sizeofState(); + case 2: + return KLZ4_sizeofStateHC(); + default: + return 0; + } +} /*! KLZ4F_compressBegin_usingCDict() : - * init streaming compression and writes frame header into dstBuffer. - * dstBuffer must be >= KLZ4F_HEADER_SIZE_MAX bytes. - * @return : number of bytes written into dstBuffer for the header + * init streaming compression AND writes frame header into @dstBuffer. + * @dstCapacity must be >= KLZ4F_HEADER_SIZE_MAX bytes. + * @return : number of bytes written into @dstBuffer for the header * or an error code (can be tested using KLZ4F_isError()) */ size_t KLZ4F_compressBegin_usingCDict(KLZ4F_cctx* cctxPtr, @@ -600,41 +679,46 @@ size_t KLZ4F_compressBegin_usingCDict(KLZ4F_cctx* cctxPtr, const KLZ4F_CDict* cdict, const KLZ4F_preferences_t* preferencesPtr) { - KLZ4F_preferences_t prefNull; + KLZ4F_preferences_t const prefNull = KLZ4F_INIT_PREFERENCES; BYTE* const dstStart = (BYTE*)dstBuffer; BYTE* dstPtr = dstStart; - BYTE* headerStart; - if (dstCapacity < maxFHSize) return err0r(KLZ4F_ERROR_dstMaxSize_tooSmall); - MEM_INIT(&prefNull, 0, sizeof(prefNull)); + RETURN_ERROR_IF(dstCapacity < maxFHSize, dstMaxSize_tooSmall); if (preferencesPtr == NULL) preferencesPtr = &prefNull; cctxPtr->prefs = *preferencesPtr; - /* Ctx Management */ + /* cctx Management */ { U16 const ctxTypeID = (cctxPtr->prefs.compressionLevel < KLZ4HC_CLEVEL_MIN) ? 
1 : 2; - if (cctxPtr->lz4CtxAlloc < ctxTypeID) { - FREEMEM(cctxPtr->lz4CtxPtr); + int requiredSize = ctxTypeID_to_size(ctxTypeID); + int allocatedSize = ctxTypeID_to_size(cctxPtr->lz4CtxAlloc); + if (allocatedSize < requiredSize) { + /* not enough space allocated */ + KLZ4F_free(cctxPtr->lz4CtxPtr, cctxPtr->cmem); if (cctxPtr->prefs.compressionLevel < KLZ4HC_CLEVEL_MIN) { - cctxPtr->lz4CtxPtr = KLZ4_createStream(); + /* must take ownership of memory allocation, + * in order to respect custom allocator contract */ + cctxPtr->lz4CtxPtr = KLZ4F_malloc(sizeof(KLZ4_stream_t), cctxPtr->cmem); + if (cctxPtr->lz4CtxPtr) + KLZ4_initStream(cctxPtr->lz4CtxPtr, sizeof(KLZ4_stream_t)); } else { - cctxPtr->lz4CtxPtr = KLZ4_createStreamHC(); + cctxPtr->lz4CtxPtr = KLZ4F_malloc(sizeof(KLZ4_streamHC_t), cctxPtr->cmem); + if (cctxPtr->lz4CtxPtr) + KLZ4_initStreamHC(cctxPtr->lz4CtxPtr, sizeof(KLZ4_streamHC_t)); } - if (cctxPtr->lz4CtxPtr == NULL) - return err0r(KLZ4F_ERROR_allocation_failed); + RETURN_ERROR_IF(cctxPtr->lz4CtxPtr == NULL, allocation_failed); cctxPtr->lz4CtxAlloc = ctxTypeID; cctxPtr->lz4CtxState = ctxTypeID; } else if (cctxPtr->lz4CtxState != ctxTypeID) { - /* otherwise, a sufficient buffer is allocated, but we need to - * reset it to the correct context type */ + /* otherwise, a sufficient buffer is already allocated, + * but we need to reset it to the correct context type */ if (cctxPtr->prefs.compressionLevel < KLZ4HC_CLEVEL_MIN) { - KLZ4_initStream((KLZ4_stream_t *) cctxPtr->lz4CtxPtr, sizeof (KLZ4_stream_t)); + KLZ4_initStream((KLZ4_stream_t*)cctxPtr->lz4CtxPtr, sizeof(KLZ4_stream_t)); } else { - KLZ4_initStreamHC((KLZ4_streamHC_t *) cctxPtr->lz4CtxPtr, sizeof(KLZ4_streamHC_t)); - KLZ4_setCompressionLevel((KLZ4_streamHC_t *) cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel); + KLZ4_initStreamHC((KLZ4_streamHC_t*)cctxPtr->lz4CtxPtr, sizeof(KLZ4_streamHC_t)); + KLZ4_setCompressionLevel((KLZ4_streamHC_t*)cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel); } cctxPtr->lz4CtxState = ctxTypeID; - } - } + } } /* Buffer Management */ if (cctxPtr->prefs.frameInfo.blockSizeID == 0) @@ -647,9 +731,9 @@ size_t KLZ4F_compressBegin_usingCDict(KLZ4F_cctx* cctxPtr, if (cctxPtr->maxBufferSize < requiredBuffSize) { cctxPtr->maxBufferSize = 0; - FREEMEM(cctxPtr->tmpBuff); - cctxPtr->tmpBuff = (BYTE*)ALLOC_AND_ZERO(requiredBuffSize); - if (cctxPtr->tmpBuff == NULL) return err0r(KLZ4F_ERROR_allocation_failed); + KLZ4F_free(cctxPtr->tmpBuff, cctxPtr->cmem); + cctxPtr->tmpBuff = (BYTE*)KLZ4F_calloc(requiredBuffSize, cctxPtr->cmem); + RETURN_ERROR_IF(cctxPtr->tmpBuff == NULL, allocation_failed); cctxPtr->maxBufferSize = requiredBuffSize; } } cctxPtr->tmpIn = cctxPtr->tmpBuff; @@ -669,31 +753,32 @@ size_t KLZ4F_compressBegin_usingCDict(KLZ4F_cctx* cctxPtr, /* Magic Number */ KLZ4F_writeLE32(dstPtr, KLZ4F_MAGICNUMBER); dstPtr += 4; - headerStart = dstPtr; + { BYTE* const headerStart = dstPtr; - /* FLG Byte */ - *dstPtr++ = (BYTE)(((1 & _2BITS) << 6) /* Version('01') */ - + ((cctxPtr->prefs.frameInfo.blockMode & _1BIT ) << 5) - + ((cctxPtr->prefs.frameInfo.blockChecksumFlag & _1BIT ) << 4) - + ((unsigned)(cctxPtr->prefs.frameInfo.contentSize > 0) << 3) - + ((cctxPtr->prefs.frameInfo.contentChecksumFlag & _1BIT ) << 2) - + (cctxPtr->prefs.frameInfo.dictID > 0) ); - /* BD Byte */ - *dstPtr++ = (BYTE)((cctxPtr->prefs.frameInfo.blockSizeID & _3BITS) << 4); - /* Optional Frame content size field */ - if (cctxPtr->prefs.frameInfo.contentSize) { - KLZ4F_writeLE64(dstPtr, cctxPtr->prefs.frameInfo.contentSize); - 
dstPtr += 8; - cctxPtr->totalInSize = 0; + /* FLG Byte */ + *dstPtr++ = (BYTE)(((1 & _2BITS) << 6) /* Version('01') */ + + ((cctxPtr->prefs.frameInfo.blockMode & _1BIT ) << 5) + + ((cctxPtr->prefs.frameInfo.blockChecksumFlag & _1BIT ) << 4) + + ((unsigned)(cctxPtr->prefs.frameInfo.contentSize > 0) << 3) + + ((cctxPtr->prefs.frameInfo.contentChecksumFlag & _1BIT ) << 2) + + (cctxPtr->prefs.frameInfo.dictID > 0) ); + /* BD Byte */ + *dstPtr++ = (BYTE)((cctxPtr->prefs.frameInfo.blockSizeID & _3BITS) << 4); + /* Optional Frame content size field */ + if (cctxPtr->prefs.frameInfo.contentSize) { + KLZ4F_writeLE64(dstPtr, cctxPtr->prefs.frameInfo.contentSize); + dstPtr += 8; + cctxPtr->totalInSize = 0; + } + /* Optional dictionary ID field */ + if (cctxPtr->prefs.frameInfo.dictID) { + KLZ4F_writeLE32(dstPtr, cctxPtr->prefs.frameInfo.dictID); + dstPtr += 4; + } + /* Header CRC Byte */ + *dstPtr = KLZ4F_headerChecksum(headerStart, (size_t)(dstPtr - headerStart)); + dstPtr++; } - /* Optional dictionary ID field */ - if (cctxPtr->prefs.frameInfo.dictID) { - KLZ4F_writeLE32(dstPtr, cctxPtr->prefs.frameInfo.dictID); - dstPtr += 4; - } - /* Header CRC Byte */ - *dstPtr = KLZ4F_headerChecksum(headerStart, (size_t)(dstPtr - headerStart)); - dstPtr++; cctxPtr->cStage = 1; /* header written, now request input data block */ return (size_t)(dstPtr - dstStart); @@ -701,9 +786,9 @@ size_t KLZ4F_compressBegin_usingCDict(KLZ4F_cctx* cctxPtr, /*! KLZ4F_compressBegin() : - * init streaming compression and writes frame header into dstBuffer. - * dstBuffer must be >= KLZ4F_HEADER_SIZE_MAX bytes. - * preferencesPtr can be NULL, in which case default parameters are selected. + * init streaming compression AND writes frame header into @dstBuffer. + * @dstCapacity must be >= KLZ4F_HEADER_SIZE_MAX bytes. + * @preferencesPtr can be NULL, in which case default parameters are selected. * @return : number of bytes written into dstBuffer for the header * or an error code (can be tested using KLZ4F_isError()) */ @@ -744,11 +829,13 @@ static size_t KLZ4F_makeBlock(void* dst, KLZ4F_blockChecksum_t crcFlag) { BYTE* const cSizePtr = (BYTE*)dst; - U32 cSize = (U32)compress(lz4ctx, (const char*)src, (char*)(cSizePtr+BHSize), - (int)(srcSize), (int)(srcSize-1), - level, cdict); - if (cSize == 0) { /* compression failed */ - DEBUGLOG(5, "KLZ4F_makeBlock: compression failed, creating a raw block (size %u)", (U32)srcSize); + U32 cSize; + assert(compress != NULL); + cSize = (U32)compress(lz4ctx, (const char*)src, (char*)(cSizePtr+BHSize), + (int)(srcSize), (int)(srcSize-1), + level, cdict); + + if (cSize == 0 || cSize >= srcSize) { cSize = (U32)srcSize; KLZ4F_writeLE32(cSizePtr, cSize | KLZ4F_BLOCKUNCOMPRESSED_FLAG); memcpy(cSizePtr+BHSize, src, srcSize); @@ -766,6 +853,7 @@ static size_t KLZ4F_makeBlock(void* dst, static int KLZ4F_compressBlock(void* ctx, const char* src, char* dst, int srcSize, int dstCapacity, int level, const KLZ4F_CDict* cdict) { int const acceleration = (level < 0) ? -level + 1 : 1; + DEBUGLOG(5, "KLZ4F_compressBlock (srcSize=%i)", srcSize); KLZ4F_initStream(ctx, cdict, level, KLZ4F_blockIndependent); if (cdict) { return KLZ4_compress_fast_continue((KLZ4_stream_t*)ctx, src, dst, srcSize, dstCapacity, acceleration); @@ -778,6 +866,7 @@ static int KLZ4F_compressBlock_continue(void* ctx, const char* src, char* dst, i { int const acceleration = (level < 0) ? 
-level + 1 : 1; (void)cdict; /* init once at beginning of frame */ + DEBUGLOG(5, "KLZ4F_compressBlock_continue (srcSize=%i)", srcSize); return KLZ4_compress_fast_continue((KLZ4_stream_t*)ctx, src, dst, srcSize, dstCapacity, acceleration); } @@ -796,8 +885,15 @@ static int KLZ4F_compressBlockHC_continue(void* ctx, const char* src, char* dst, return KLZ4_compress_HC_continue((KLZ4_streamHC_t*)ctx, src, dst, srcSize, dstCapacity); } -static compressFunc_t KLZ4F_selectCompression(KLZ4F_blockMode_t blockMode, int level) +static int KLZ4F_doNotCompressBlock(void* ctx, const char* src, char* dst, int srcSize, int dstCapacity, int level, const KLZ4F_CDict* cdict) { + (void)ctx; (void)src; (void)dst; (void)srcSize; (void)dstCapacity; (void)level; (void)cdict; + return 0; +} + +static compressFunc_t KLZ4F_selectCompression(KLZ4F_blockMode_t blockMode, int level, KLZ4F_blockCompression_t compressMode) +{ + if (compressMode == KLZ4B_UNCOMPRESSED) return KLZ4F_doNotCompressBlock; if (level < KLZ4HC_CLEVEL_MIN) { if (blockMode == KLZ4F_blockIndependent) return KLZ4F_compressBlock; return KLZ4F_compressBlock_continue; @@ -806,6 +902,7 @@ static compressFunc_t KLZ4F_selectCompression(KLZ4F_blockMode_t blockMode, int l return KLZ4F_compressBlockHC_continue; } +/* Save history (up to 64KB) into @tmpBuff */ static int KLZ4F_localSaveDict(KLZ4F_cctx_t* cctxPtr) { if (cctxPtr->prefs.compressionLevel < KLZ4HC_CLEVEL_MIN) @@ -815,38 +912,57 @@ static int KLZ4F_localSaveDict(KLZ4F_cctx_t* cctxPtr) typedef enum { notDone, fromTmpBuffer, fromSrcBuffer } KLZ4F_lastBlockStatus; -/*! KLZ4F_compressUpdate() : +static const KLZ4F_compressOptions_t k_cOptionsNull = { 0, { 0, 0, 0 } }; + + + /*! KLZ4F_compressUpdateImpl() : * KLZ4F_compressUpdate() can be called repetitively to compress as much data as necessary. - * dstBuffer MUST be >= KLZ4F_compressBound(srcSize, preferencesPtr). - * KLZ4F_compressOptions_t structure is optional : you can provide NULL as argument. + * When successful, the function always entirely consumes @srcBuffer. + * src data is either buffered or compressed into @dstBuffer. + * If the block compression does not match the compression of the previous block, the old data is flushed + * and operations continue with the new compression mode. + * @dstCapacity MUST be >= KLZ4F_compressBound(srcSize, preferencesPtr) when block compression is turned on. + * @compressOptionsPtr is optional : provide NULL to mean "default". * @return : the number of bytes written into dstBuffer. It can be zero, meaning input data was just buffered. * or an error code if it fails (which can be tested using KLZ4F_isError()) + * After an error, the state is left in a UB state, and must be re-initialized. 
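A hedged sketch of the calling contract restated in the comment above (the helper name is illustrative, not from the patch): each update call must be given at least KLZ4F_compressBound() worth of destination space, and on success the input chunk is always fully consumed — either buffered internally or written out.

```c
#include <assert.h>
#include "lz4frame.h"

/* Compress one chunk into dst; returns bytes written (can be 0 when the
 * input was only buffered) or an error code per KLZ4F_isError(). */
size_t stream_one_chunk(KLZ4F_cctx* cctx,
                        void* dst, size_t dstCapacity,
                        const void* chunk, size_t chunkSize)
{
    /* NULL means default preferences; pass the same preferences that were
     * given to KLZ4F_compressBegin() so the bound is computed consistently. */
    assert(dstCapacity >= KLZ4F_compressBound(chunkSize, NULL));
    return KLZ4F_compressUpdate(cctx, dst, dstCapacity, chunk, chunkSize, NULL);
}
```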
*/ -size_t KLZ4F_compressUpdate(KLZ4F_cctx* cctxPtr, - void* dstBuffer, size_t dstCapacity, +static size_t KLZ4F_compressUpdateImpl(KLZ4F_cctx* cctxPtr, + void* dstBuffer, size_t dstCapacity, const void* srcBuffer, size_t srcSize, - const KLZ4F_compressOptions_t* compressOptionsPtr) -{ - KLZ4F_compressOptions_t cOptionsNull; + const KLZ4F_compressOptions_t* compressOptionsPtr, + KLZ4F_blockCompression_t blockCompression) + { size_t const blockSize = cctxPtr->maxBlockSize; const BYTE* srcPtr = (const BYTE*)srcBuffer; const BYTE* const srcEnd = srcPtr + srcSize; BYTE* const dstStart = (BYTE*)dstBuffer; BYTE* dstPtr = dstStart; KLZ4F_lastBlockStatus lastBlockCompressed = notDone; - compressFunc_t const compress = KLZ4F_selectCompression(cctxPtr->prefs.frameInfo.blockMode, cctxPtr->prefs.compressionLevel); - + compressFunc_t const compress = KLZ4F_selectCompression(cctxPtr->prefs.frameInfo.blockMode, cctxPtr->prefs.compressionLevel, blockCompression); + size_t bytesWritten; DEBUGLOG(4, "KLZ4F_compressUpdate (srcSize=%zu)", srcSize); - if (cctxPtr->cStage != 1) return err0r(KLZ4F_ERROR_GENERIC); + RETURN_ERROR_IF(cctxPtr->cStage != 1, compressionState_uninitialized); /* state must be initialized and waiting for next block */ if (dstCapacity < KLZ4F_compressBound_internal(srcSize, &(cctxPtr->prefs), cctxPtr->tmpInSize)) - return err0r(KLZ4F_ERROR_dstMaxSize_tooSmall); - MEM_INIT(&cOptionsNull, 0, sizeof(cOptionsNull)); - if (compressOptionsPtr == NULL) compressOptionsPtr = &cOptionsNull; + RETURN_ERROR(dstMaxSize_tooSmall); + + if (blockCompression == KLZ4B_UNCOMPRESSED && dstCapacity < srcSize) + RETURN_ERROR(dstMaxSize_tooSmall); + + /* flush currently written block, to continue with new block compression */ + if (cctxPtr->blockCompression != blockCompression) { + bytesWritten = KLZ4F_flush(cctxPtr, dstBuffer, dstCapacity, compressOptionsPtr); + dstPtr += bytesWritten; + cctxPtr->blockCompression = blockCompression; + } + + if (compressOptionsPtr == NULL) compressOptionsPtr = &k_cOptionsNull; /* complete tmp buffer */ if (cctxPtr->tmpInSize > 0) { /* some data already within tmp buffer */ size_t const sizeToCopy = blockSize - cctxPtr->tmpInSize; + assert(blockSize > cctxPtr->tmpInSize); if (sizeToCopy > srcSize) { /* add src to tmpIn buffer */ memcpy(cctxPtr->tmpIn + cctxPtr->tmpInSize, srcBuffer, srcSize); @@ -864,11 +980,9 @@ size_t KLZ4F_compressUpdate(KLZ4F_cctx* cctxPtr, compress, cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel, cctxPtr->cdict, cctxPtr->prefs.frameInfo.blockChecksumFlag); - if (cctxPtr->prefs.frameInfo.blockMode==KLZ4F_blockLinked) cctxPtr->tmpIn += blockSize; cctxPtr->tmpInSize = 0; - } - } + } } while ((size_t)(srcEnd - srcPtr) >= blockSize) { /* compress full blocks */ @@ -882,33 +996,38 @@ size_t KLZ4F_compressUpdate(KLZ4F_cctx* cctxPtr, } if ((cctxPtr->prefs.autoFlush) && (srcPtr < srcEnd)) { - /* compress remaining input < blockSize */ + /* autoFlush : remaining input (< blockSize) is compressed */ lastBlockCompressed = fromSrcBuffer; dstPtr += KLZ4F_makeBlock(dstPtr, srcPtr, (size_t)(srcEnd - srcPtr), compress, cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel, cctxPtr->cdict, cctxPtr->prefs.frameInfo.blockChecksumFlag); - srcPtr = srcEnd; + srcPtr = srcEnd; } - /* preserve dictionary if necessary */ + /* preserve dictionary within @tmpBuff whenever necessary */ if ((cctxPtr->prefs.frameInfo.blockMode==KLZ4F_blockLinked) && (lastBlockCompressed==fromSrcBuffer)) { + /* linked blocks are only supported in compressed mode, see KLZ4F_uncompressedUpdate */ + 
assert(blockCompression == KLZ4B_COMPRESSED); if (compressOptionsPtr->stableSrc) { - cctxPtr->tmpIn = cctxPtr->tmpBuff; + cctxPtr->tmpIn = cctxPtr->tmpBuff; /* src is stable : dictionary remains in src across invocations */ } else { int const realDictSize = KLZ4F_localSaveDict(cctxPtr); - if (realDictSize==0) return err0r(KLZ4F_ERROR_GENERIC); + assert(0 <= realDictSize && realDictSize <= 64 KB); cctxPtr->tmpIn = cctxPtr->tmpBuff + realDictSize; } } /* keep tmpIn within limits */ - if ((cctxPtr->tmpIn + blockSize) > (cctxPtr->tmpBuff + cctxPtr->maxBufferSize) /* necessarily KLZ4F_blockLinked && lastBlockCompressed==fromTmpBuffer */ - && !(cctxPtr->prefs.autoFlush)) + if (!(cctxPtr->prefs.autoFlush) /* no autoflush : there may be some data left within internal buffer */ + && (cctxPtr->tmpIn + blockSize) > (cctxPtr->tmpBuff + cctxPtr->maxBufferSize) ) /* not enough room to store next block */ { + /* only preserve 64KB within internal buffer. Ensures there is enough room for next block. + * note: this situation necessarily implies lastBlockCompressed==fromTmpBuffer */ int const realDictSize = KLZ4F_localSaveDict(cctxPtr); cctxPtr->tmpIn = cctxPtr->tmpBuff + realDictSize; + assert((cctxPtr->tmpIn + blockSize) <= (cctxPtr->tmpBuff + cctxPtr->maxBufferSize)); } /* some input data left, necessarily < blockSize */ @@ -926,6 +1045,53 @@ size_t KLZ4F_compressUpdate(KLZ4F_cctx* cctxPtr, return (size_t)(dstPtr - dstStart); } +/*! KLZ4F_compressUpdate() : + * KLZ4F_compressUpdate() can be called repetitively to compress as much data as necessary. + * When successful, the function always entirely consumes @srcBuffer. + * src data is either buffered or compressed into @dstBuffer. + * If previously an uncompressed block was written, buffered data is flushed + * before appending compressed data is continued. + * @dstCapacity MUST be >= KLZ4F_compressBound(srcSize, preferencesPtr). + * @compressOptionsPtr is optional : provide NULL to mean "default". + * @return : the number of bytes written into dstBuffer. It can be zero, meaning input data was just buffered. + * or an error code if it fails (which can be tested using KLZ4F_isError()) + * After an error, the state is left in a UB state, and must be re-initialized. + */ +size_t KLZ4F_compressUpdate(KLZ4F_cctx* cctxPtr, + void* dstBuffer, size_t dstCapacity, + const void* srcBuffer, size_t srcSize, + const KLZ4F_compressOptions_t* compressOptionsPtr) +{ + return KLZ4F_compressUpdateImpl(cctxPtr, + dstBuffer, dstCapacity, + srcBuffer, srcSize, + compressOptionsPtr, KLZ4B_COMPRESSED); +} + +/*! KLZ4F_compressUpdate() : + * KLZ4F_compressUpdate() can be called repetitively to compress as much data as necessary. + * When successful, the function always entirely consumes @srcBuffer. + * src data is either buffered or compressed into @dstBuffer. + * If previously an uncompressed block was written, buffered data is flushed + * before appending compressed data is continued. + * This is only supported when KLZ4F_blockIndependent is used + * @dstCapacity MUST be >= KLZ4F_compressBound(srcSize, preferencesPtr). + * @compressOptionsPtr is optional : provide NULL to mean "default". + * @return : the number of bytes written into dstBuffer. It can be zero, meaning input data was just buffered. + * or an error code if it fails (which can be tested using KLZ4F_isError()) + * After an error, the state is left in a UB state, and must be re-initialized. 
+ */ +size_t KLZ4F_uncompressedUpdate(KLZ4F_cctx* cctxPtr, + void* dstBuffer, size_t dstCapacity, + const void* srcBuffer, size_t srcSize, + const KLZ4F_compressOptions_t* compressOptionsPtr) { + RETURN_ERROR_IF(cctxPtr->prefs.frameInfo.blockMode != KLZ4F_blockIndependent, blockMode_invalid); + return KLZ4F_compressUpdateImpl(cctxPtr, + dstBuffer, dstCapacity, + srcBuffer, srcSize, + compressOptionsPtr, KLZ4B_UNCOMPRESSED); +} + /*! KLZ4F_flush() : * When compressed data must be sent immediately, without waiting for a block to be filled, @@ -944,13 +1110,12 @@ size_t KLZ4F_flush(KLZ4F_cctx* cctxPtr, compressFunc_t compress; if (cctxPtr->tmpInSize == 0) return 0; /* nothing to flush */ - if (cctxPtr->cStage != 1) return err0r(KLZ4F_ERROR_GENERIC); - if (dstCapacity < (cctxPtr->tmpInSize + BHSize + BFSize)) - return err0r(KLZ4F_ERROR_dstMaxSize_tooSmall); - (void)compressOptionsPtr; /* not yet useful */ + RETURN_ERROR_IF(cctxPtr->cStage != 1, compressionState_uninitialized); + RETURN_ERROR_IF(dstCapacity < (cctxPtr->tmpInSize + BHSize + BFSize), dstMaxSize_tooSmall); + (void)compressOptionsPtr; /* not useful (yet) */ /* select compression function */ - compress = KLZ4F_selectCompression(cctxPtr->prefs.frameInfo.blockMode, cctxPtr->prefs.compressionLevel); + compress = KLZ4F_selectCompression(cctxPtr->prefs.frameInfo.blockMode, cctxPtr->prefs.compressionLevel, cctxPtr->blockCompression); /* compress tmp buffer */ dstPtr += KLZ4F_makeBlock(dstPtr, @@ -992,19 +1157,19 @@ size_t KLZ4F_compressEnd(KLZ4F_cctx* cctxPtr, size_t const flushSize = KLZ4F_flush(cctxPtr, dstBuffer, dstCapacity, compressOptionsPtr); DEBUGLOG(5,"KLZ4F_compressEnd: dstCapacity=%u", (unsigned)dstCapacity); - if (KLZ4F_isError(flushSize)) return flushSize; + FORWARD_IF_ERROR(flushSize); dstPtr += flushSize; assert(flushSize <= dstCapacity); dstCapacity -= flushSize; - if (dstCapacity < 4) return err0r(KLZ4F_ERROR_dstMaxSize_tooSmall); + RETURN_ERROR_IF(dstCapacity < 4, dstMaxSize_tooSmall); KLZ4F_writeLE32(dstPtr, 0); dstPtr += 4; /* endMark */ if (cctxPtr->prefs.frameInfo.contentChecksumFlag == KLZ4F_contentChecksumEnabled) { U32 const xxh = KXXH32_digest(&(cctxPtr->xxh)); - if (dstCapacity < 8) return err0r(KLZ4F_ERROR_dstMaxSize_tooSmall); + RETURN_ERROR_IF(dstCapacity < 8, dstMaxSize_tooSmall); DEBUGLOG(5,"Writing 32-bit content checksum"); KLZ4F_writeLE32(dstPtr, xxh); dstPtr+=4; /* content Checksum */ @@ -1015,7 +1180,7 @@ size_t KLZ4F_compressEnd(KLZ4F_cctx* cctxPtr, if (cctxPtr->prefs.frameInfo.contentSize) { if (cctxPtr->prefs.frameInfo.contentSize != cctxPtr->totalInSize) - return err0r(KLZ4F_ERROR_frameSize_wrong); + RETURN_ERROR(frameSize_wrong); } return (size_t)(dstPtr - dstStart); @@ -1039,6 +1204,7 @@ typedef enum { } dStage_t; struct KLZ4F_dctx_s { + KLZ4F_CustomMem cmem; KLZ4F_frameInfo_t frameInfo; U32 version; dStage_t dStage; @@ -1056,26 +1222,37 @@ struct KLZ4F_dctx_s { size_t tmpOutStart; KXXH32_state_t xxh; KXXH32_state_t blockChecksum; + int skipChecksum; BYTE header[KLZ4F_HEADER_SIZE_MAX]; }; /* typedef'd to KLZ4F_dctx in lz4frame.h */ +KLZ4F_dctx* KLZ4F_createDecompressionContext_advanced(KLZ4F_CustomMem customMem, unsigned version) +{ + KLZ4F_dctx* const dctx = (KLZ4F_dctx*)KLZ4F_calloc(sizeof(KLZ4F_dctx), customMem); + if (dctx == NULL) return NULL; + + dctx->cmem = customMem; + dctx->version = version; + return dctx; +} + /*! KLZ4F_createDecompressionContext() : * Create a decompressionContext object, which will track all decompression operations. 
* Provides a pointer to a fully allocated and initialized KLZ4F_decompressionContext object. * Object can later be released using KLZ4F_freeDecompressionContext(). * @return : if != 0, there was an error during context creation. */ -KLZ4F_errorCode_t KLZ4F_createDecompressionContext(KLZ4F_dctx** KLZ4F_decompressionContextPtr, unsigned versionNumber) +KLZ4F_errorCode_t +KLZ4F_createDecompressionContext(KLZ4F_dctx** KLZ4F_decompressionContextPtr, unsigned versionNumber) { - KLZ4F_dctx* const dctx = (KLZ4F_dctx*)ALLOC_AND_ZERO(sizeof(KLZ4F_dctx)); - if (dctx == NULL) { /* failed allocation */ - *KLZ4F_decompressionContextPtr = NULL; - return err0r(KLZ4F_ERROR_allocation_failed); - } + assert(KLZ4F_decompressionContextPtr != NULL); /* violation of narrow contract */ + RETURN_ERROR_IF(KLZ4F_decompressionContextPtr == NULL, parameter_null); /* in case it nonetheless happens in production */ - dctx->version = versionNumber; - *KLZ4F_decompressionContextPtr = dctx; + *KLZ4F_decompressionContextPtr = KLZ4F_createDecompressionContext_advanced(KLZ4F_defaultCMem, versionNumber); + if (*KLZ4F_decompressionContextPtr == NULL) { /* failed allocation */ + RETURN_ERROR(allocation_failed); + } return KLZ4F_OK_NoError; } @@ -1084,9 +1261,9 @@ KLZ4F_errorCode_t KLZ4F_freeDecompressionContext(KLZ4F_dctx* dctx) KLZ4F_errorCode_t result = KLZ4F_OK_NoError; if (dctx != NULL) { /* can accept NULL input, like free() */ result = (KLZ4F_errorCode_t)dctx->dStage; - FREEMEM(dctx->tmpIn); - FREEMEM(dctx->tmpOutBuffer); - FREEMEM(dctx); + KLZ4F_free(dctx->tmpIn, dctx->cmem); + KLZ4F_free(dctx->tmpOutBuffer, dctx->cmem); + KLZ4F_free(dctx, dctx->cmem); } return result; } @@ -1099,6 +1276,7 @@ void KLZ4F_resetDecompressionContext(KLZ4F_dctx* dctx) dctx->dStage = dstage_getFrameHeader; dctx->dict = NULL; dctx->dictSize = 0; + dctx->skipChecksum = 0; } @@ -1118,7 +1296,7 @@ static size_t KLZ4F_decodeHeader(KLZ4F_dctx* dctx, const void* src, size_t srcSi DEBUGLOG(5, "KLZ4F_decodeHeader"); /* need to decode header to get frameInfo */ - if (srcSize < minFHSize) return err0r(KLZ4F_ERROR_frameHeader_incomplete); /* minimal frame header size */ + RETURN_ERROR_IF(srcSize < minFHSize, frameHeader_incomplete); /* minimal frame header size */ MEM_INIT(&(dctx->frameInfo), 0, sizeof(dctx->frameInfo)); /* special case : skippable frames */ @@ -1132,14 +1310,13 @@ static size_t KLZ4F_decodeHeader(KLZ4F_dctx* dctx, const void* src, size_t srcSi } else { dctx->dStage = dstage_getSFrameSize; return 4; - } - } + } } /* control magic number */ #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION if (KLZ4F_readLE32(srcPtr) != KLZ4F_MAGICNUMBER) { DEBUGLOG(4, "frame header error : unknown magic number"); - return err0r(KLZ4F_ERROR_frameType_unknown); + RETURN_ERROR(frameType_unknown); } #endif dctx->frameInfo.frameType = KLZ4F_frame; @@ -1153,8 +1330,8 @@ static size_t KLZ4F_decodeHeader(KLZ4F_dctx* dctx, const void* src, size_t srcSi contentChecksumFlag = (FLG>>2) & _1BIT; dictIDFlag = FLG & _1BIT; /* validate */ - if (((FLG>>1)&_1BIT) != 0) return err0r(KLZ4F_ERROR_reservedFlag_set); /* Reserved bit */ - if (version != 1) return err0r(KLZ4F_ERROR_headerVersion_wrong); /* Version Number, only supported value */ + if (((FLG>>1)&_1BIT) != 0) RETURN_ERROR(reservedFlag_set); /* Reserved bit */ + if (version != 1) RETURN_ERROR(headerVersion_wrong); /* Version Number, only supported value */ } /* Frame Header Size */ @@ -1173,17 +1350,16 @@ static size_t KLZ4F_decodeHeader(KLZ4F_dctx* dctx, const void* src, size_t srcSi { U32 const BD = srcPtr[5];
blockSizeID = (BD>>4) & _3BITS; /* validate */ - if (((BD>>7)&_1BIT) != 0) return err0r(KLZ4F_ERROR_reservedFlag_set); /* Reserved bit */ - if (blockSizeID < 4) return err0r(KLZ4F_ERROR_maxBlockSize_invalid); /* 4-7 only supported values for the time being */ - if (((BD>>0)&_4BITS) != 0) return err0r(KLZ4F_ERROR_reservedFlag_set); /* Reserved bits */ + if (((BD>>7)&_1BIT) != 0) RETURN_ERROR(reservedFlag_set); /* Reserved bit */ + if (blockSizeID < 4) RETURN_ERROR(maxBlockSize_invalid); /* 4-7 only supported values for the time being */ + if (((BD>>0)&_4BITS) != 0) RETURN_ERROR(reservedFlag_set); /* Reserved bits */ } /* check header */ assert(frameHeaderSize > 5); #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION { BYTE const HC = KLZ4F_headerChecksum(srcPtr+4, frameHeaderSize-5); - if (HC != srcPtr[frameHeaderSize-1]) - return err0r(KLZ4F_ERROR_headerChecksum_invalid); + RETURN_ERROR_IF(HC != srcPtr[frameHeaderSize-1], headerChecksum_invalid); } #endif @@ -1192,10 +1368,9 @@ static size_t KLZ4F_decodeHeader(KLZ4F_dctx* dctx, const void* src, size_t srcSi dctx->frameInfo.blockChecksumFlag = (KLZ4F_blockChecksum_t)blockChecksumFlag; dctx->frameInfo.contentChecksumFlag = (KLZ4F_contentChecksum_t)contentChecksumFlag; dctx->frameInfo.blockSizeID = (KLZ4F_blockSizeID_t)blockSizeID; - dctx->maxBlockSize = KLZ4F_getBlockSize(blockSizeID); + dctx->maxBlockSize = KLZ4F_getBlockSize((KLZ4F_blockSizeID_t)blockSizeID); if (contentSizeFlag) - dctx->frameRemainingSize = - dctx->frameInfo.contentSize = KLZ4F_readLE64(srcPtr+6); + dctx->frameRemainingSize = dctx->frameInfo.contentSize = KLZ4F_readLE64(srcPtr+6); if (dictIDFlag) dctx->frameInfo.dictID = KLZ4F_readLE32(srcPtr + frameHeaderSize - 5); @@ -1211,11 +1386,11 @@ static size_t KLZ4F_decodeHeader(KLZ4F_dctx* dctx, const void* src, size_t srcSi */ size_t KLZ4F_headerSize(const void* src, size_t srcSize) { - if (src == NULL) return err0r(KLZ4F_ERROR_srcPtr_wrong); + RETURN_ERROR_IF(src == NULL, srcPtr_wrong); /* minimal srcSize to determine header size */ if (srcSize < KLZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH) - return err0r(KLZ4F_ERROR_frameHeader_incomplete); + RETURN_ERROR(frameHeader_incomplete); /* special case : skippable frames */ if ((KLZ4F_readLE32(src) & 0xFFFFFFF0U) == KLZ4F_MAGIC_SKIPPABLE_START) @@ -1224,7 +1399,7 @@ size_t KLZ4F_headerSize(const void* src, size_t srcSize) /* control magic number */ #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION if (KLZ4F_readLE32(src) != KLZ4F_MAGICNUMBER) - return err0r(KLZ4F_ERROR_frameType_unknown); + RETURN_ERROR(frameType_unknown); #endif /* Frame Header Size */ @@ -1266,13 +1441,13 @@ KLZ4F_errorCode_t KLZ4F_getFrameInfo(KLZ4F_dctx* dctx, if (dctx->dStage == dstage_storeFrameHeader) { /* frame decoding already started, in the middle of header => automatic fail */ *srcSizePtr = 0; - return err0r(KLZ4F_ERROR_frameDecoding_alreadyStarted); + RETURN_ERROR(frameDecoding_alreadyStarted); } else { size_t const hSize = KLZ4F_headerSize(srcBuffer, *srcSizePtr); if (KLZ4F_isError(hSize)) { *srcSizePtr=0; return hSize; } if (*srcSizePtr < hSize) { *srcSizePtr=0; - return err0r(KLZ4F_ERROR_frameHeader_incomplete); + RETURN_ERROR(frameHeader_incomplete); } { size_t decodeResult = KLZ4F_decodeHeader(dctx, srcBuffer, hSize); @@ -1290,16 +1465,14 @@ KLZ4F_errorCode_t KLZ4F_getFrameInfo(KLZ4F_dctx* dctx, /* KLZ4F_updateDict() : * only used for KLZ4F_blockLinked mode - * Condition : dstPtr != NULL + * Condition : @dstPtr != NULL */ static void KLZ4F_updateDict(KLZ4F_dctx* dctx, const BYTE* dstPtr, size_t dstSize, 
const BYTE* dstBufferStart, unsigned withinTmp) { assert(dstPtr != NULL); - if (dctx->dictSize==0) { - dctx->dict = (const BYTE*)dstPtr; /* priority to prefix mode */ - } + if (dctx->dictSize==0) dctx->dict = (const BYTE*)dstPtr; /* will lead to prefix mode */ assert(dctx->dict != NULL); if (dctx->dict + dctx->dictSize == dstPtr) { /* prefix mode, everything within dstBuffer */ @@ -1362,7 +1535,6 @@ static void KLZ4F_updateDict(KLZ4F_dctx* dctx, } - /*! KLZ4F_decompress() : * Call this function repetitively to regenerate compressed data in srcBuffer. * The function will attempt to decode up to *srcSizePtr bytes from srcBuffer @@ -1406,6 +1578,7 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, *srcSizePtr = 0; *dstSizePtr = 0; assert(dctx != NULL); + dctx->skipChecksum |= (decompressOptionsPtr->skipChecksums != 0); /* once set, disable for the remainder of the frame */ /* behaves as a state machine */ @@ -1418,7 +1591,7 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, DEBUGLOG(6, "dstage_getFrameHeader"); if ((size_t)(srcEnd-srcPtr) >= maxFHSize) { /* enough to decode - shortcut */ size_t const hSize = KLZ4F_decodeHeader(dctx, srcPtr, (size_t)(srcEnd-srcPtr)); /* will update dStage appropriately */ - if (KLZ4F_isError(hSize)) return hSize; + FORWARD_IF_ERROR(hSize); srcPtr += hSize; break; } @@ -1440,9 +1613,7 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, doAnotherStage = 0; /* not enough src data, ask for some more */ break; } - { size_t const hSize = KLZ4F_decodeHeader(dctx, dctx->header, dctx->tmpInTarget); /* will update dStage appropriately */ - if (KLZ4F_isError(hSize)) return hSize; - } + FORWARD_IF_ERROR( KLZ4F_decodeHeader(dctx, dctx->header, dctx->tmpInTarget) ); /* will update dStage appropriately */ break; case dstage_init: @@ -1453,14 +1624,12 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, + ((dctx->frameInfo.blockMode==KLZ4F_blockLinked) ? 
128 KB : 0); if (bufferNeeded > dctx->maxBufferSize) { /* tmp buffers too small */ dctx->maxBufferSize = 0; /* ensure allocation will be re-attempted on next entry*/ - FREEMEM(dctx->tmpIn); - dctx->tmpIn = (BYTE*)ALLOC(dctx->maxBlockSize + BFSize /* block checksum */); - if (dctx->tmpIn == NULL) - return err0r(KLZ4F_ERROR_allocation_failed); - FREEMEM(dctx->tmpOutBuffer); - dctx->tmpOutBuffer= (BYTE*)ALLOC(bufferNeeded); - if (dctx->tmpOutBuffer== NULL) - return err0r(KLZ4F_ERROR_allocation_failed); + KLZ4F_free(dctx->tmpIn, dctx->cmem); + dctx->tmpIn = (BYTE*)KLZ4F_malloc(dctx->maxBlockSize + BFSize /* block checksum */, dctx->cmem); + RETURN_ERROR_IF(dctx->tmpIn == NULL, allocation_failed); + KLZ4F_free(dctx->tmpOutBuffer, dctx->cmem); + dctx->tmpOutBuffer= (BYTE*)KLZ4F_malloc(bufferNeeded, dctx->cmem); + RETURN_ERROR_IF(dctx->tmpOutBuffer== NULL, allocation_failed); dctx->maxBufferSize = bufferNeeded; } } dctx->tmpInSize = 0; @@ -1509,7 +1678,7 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, break; } if (nextCBlockSize > dctx->maxBlockSize) { - return err0r(KLZ4F_ERROR_maxBlockSize_invalid); + RETURN_ERROR(maxBlockSize_invalid); } if (blockHeader & KLZ4F_BLOCKUNCOMPRESSED_FLAG) { /* next block is uncompressed */ @@ -1540,11 +1709,13 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, size_t const minBuffSize = MIN((size_t)(srcEnd-srcPtr), (size_t)(dstEnd-dstPtr)); sizeToCopy = MIN(dctx->tmpInTarget, minBuffSize); memcpy(dstPtr, srcPtr, sizeToCopy); - if (dctx->frameInfo.blockChecksumFlag) { - (void)KXXH32_update(&dctx->blockChecksum, srcPtr, sizeToCopy); + if (!dctx->skipChecksum) { + if (dctx->frameInfo.blockChecksumFlag) { + (void)KXXH32_update(&dctx->blockChecksum, srcPtr, sizeToCopy); + } + if (dctx->frameInfo.contentChecksumFlag) + (void)KXXH32_update(&dctx->xxh, srcPtr, sizeToCopy); } - if (dctx->frameInfo.contentChecksumFlag) - (void)KXXH32_update(&dctx->xxh, srcPtr, sizeToCopy); if (dctx->frameInfo.contentSize) dctx->frameRemainingSize -= sizeToCopy; @@ -1590,14 +1761,15 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, } crcSrc = dctx->header; } - { U32 const readCRC = KLZ4F_readLE32(crcSrc); + if (!dctx->skipChecksum) { + U32 const readCRC = KLZ4F_readLE32(crcSrc); U32 const calcCRC = KXXH32_digest(&dctx->blockChecksum); #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION DEBUGLOG(6, "compare block checksum"); if (readCRC != calcCRC) { DEBUGLOG(4, "incorrect block checksum: %08X != %08X", readCRC, calcCRC); - return err0r(KLZ4F_ERROR_blockChecksum_invalid); + RETURN_ERROR(blockChecksum_invalid); } #else (void)readCRC; @@ -1637,37 +1809,44 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, } /* At this stage, input is large enough to decode a block */ + + /* First, decode and control block checksum if it exists */ if (dctx->frameInfo.blockChecksumFlag) { + assert(dctx->tmpInTarget >= 4); dctx->tmpInTarget -= 4; assert(selectedIn != NULL); /* selectedIn is defined at this stage (either srcPtr, or dctx->tmpIn) */ { U32 const readBlockCrc = KLZ4F_readLE32(selectedIn + dctx->tmpInTarget); U32 const calcBlockCrc = KXXH32(selectedIn, dctx->tmpInTarget, 0); #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - if (readBlockCrc != calcBlockCrc) - return err0r(KLZ4F_ERROR_blockChecksum_invalid); + RETURN_ERROR_IF(readBlockCrc != calcBlockCrc, blockChecksum_invalid); #else (void)readBlockCrc; (void)calcBlockCrc; #endif } } - if ((size_t)(dstEnd-dstPtr) >= dctx->maxBlockSize) { + /* decode directly into destination buffer if there is enough room */ + if ( ((size_t)(dstEnd-dstPtr) >= dctx->maxBlockSize) + /* unless 
the dictionary is stored in tmpOut: + * in which case it's faster to decode within tmpOut + * to benefit from prefix speedup */ + && !(dctx->dict!= NULL && (const BYTE*)dctx->dict + dctx->dictSize == dctx->tmpOut) ) + { const char* dict = (const char*)dctx->dict; size_t dictSize = dctx->dictSize; int decodedSize; assert(dstPtr != NULL); if (dict && dictSize > 1 GB) { - /* the dictSize param is an int, avoid truncation / sign issues */ + /* overflow control : dctx->dictSize is an int, avoid truncation / sign issues */ dict += dictSize - 64 KB; dictSize = 64 KB; } - /* enough capacity in `dst` to decompress directly there */ decodedSize = KLZ4_decompress_safe_usingDict( (const char*)selectedIn, (char*)dstPtr, (int)dctx->tmpInTarget, (int)dctx->maxBlockSize, dict, (int)dictSize); - if (decodedSize < 0) return err0r(KLZ4F_ERROR_GENERIC); /* decompression failed */ - if (dctx->frameInfo.contentChecksumFlag) + RETURN_ERROR_IF(decodedSize < 0, decompressionFailed); + if ((dctx->frameInfo.contentChecksumFlag) && (!dctx->skipChecksum)) KXXH32_update(&(dctx->xxh), dstPtr, (size_t)decodedSize); if (dctx->frameInfo.contentSize) dctx->frameRemainingSize -= (size_t)decodedSize; @@ -1678,25 +1857,27 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, } dstPtr += decodedSize; - dctx->dStage = dstage_getBlockHeader; + dctx->dStage = dstage_getBlockHeader; /* end of block, let's get another one */ break; } /* not enough place into dst : decode into tmpOut */ - /* ensure enough place for tmpOut */ + + /* manage dictionary */ if (dctx->frameInfo.blockMode == KLZ4F_blockLinked) { if (dctx->dict == dctx->tmpOutBuffer) { + /* truncate dictionary to 64 KB if too big */ if (dctx->dictSize > 128 KB) { memcpy(dctx->tmpOutBuffer, dctx->dict + dctx->dictSize - 64 KB, 64 KB); dctx->dictSize = 64 KB; } dctx->tmpOut = dctx->tmpOutBuffer + dctx->dictSize; - } else { /* dict not within tmp */ + } else { /* dict not within tmpOut */ size_t const reservedDictSpace = MIN(dctx->dictSize, 64 KB); dctx->tmpOut = dctx->tmpOutBuffer + reservedDictSpace; } } - /* Decode block */ + /* Decode block into tmpOut */ { const char* dict = (const char*)dctx->dict; size_t dictSize = dctx->dictSize; int decodedSize; @@ -1709,9 +1890,8 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, (const char*)selectedIn, (char*)dctx->tmpOut, (int)dctx->tmpInTarget, (int)dctx->maxBlockSize, dict, (int)dictSize); - if (decodedSize < 0) /* decompression failed */ - return err0r(KLZ4F_ERROR_decompressionFailed); - if (dctx->frameInfo.contentChecksumFlag) + RETURN_ERROR_IF(decodedSize < 0, decompressionFailed); + if (dctx->frameInfo.contentChecksumFlag && !dctx->skipChecksum) KXXH32_update(&(dctx->xxh), dctx->tmpOut, (size_t)decodedSize); if (dctx->frameInfo.contentSize) dctx->frameRemainingSize -= (size_t)decodedSize; @@ -1744,8 +1924,7 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, break; case dstage_getSuffix: - if (dctx->frameRemainingSize) - return err0r(KLZ4F_ERROR_frameSize_wrong); /* incorrect frame size decoded */ + RETURN_ERROR_IF(dctx->frameRemainingSize, frameSize_wrong); /* incorrect frame size decoded */ if (!dctx->frameInfo.contentChecksumFlag) { /* no checksum, frame is completed */ nextSrcSizeHint = 0; KLZ4F_resetDecompressionContext(dctx); @@ -1777,20 +1956,20 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, } /* if (dctx->dStage == dstage_storeSuffix) */ /* case dstage_checkSuffix: */ /* no direct entry, avoid initialization risks */ - { U32 const readCRC = KLZ4F_readLE32(selectedIn); + if (!dctx->skipChecksum) { + U32 const readCRC = 
KLZ4F_readLE32(selectedIn); U32 const resultCRC = KXXH32_digest(&(dctx->xxh)); #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - if (readCRC != resultCRC) - return err0r(KLZ4F_ERROR_contentChecksum_invalid); + RETURN_ERROR_IF(readCRC != resultCRC, contentChecksum_invalid); #else (void)readCRC; (void)resultCRC; #endif - nextSrcSizeHint = 0; - KLZ4F_resetDecompressionContext(dctx); - doAnotherStage = 0; - break; } + nextSrcSizeHint = 0; + KLZ4F_resetDecompressionContext(dctx); + doAnotherStage = 0; + break; case dstage_getSFrameSize: if ((srcEnd - srcPtr) >= 4) { @@ -1841,7 +2020,7 @@ size_t KLZ4F_decompress(KLZ4F_dctx* dctx, } /* switch (dctx->dStage) */ } /* while (doAnotherStage) */ - /* preserve history within tmp whenever necessary */ + /* preserve history within tmpOut whenever necessary */ KLZ4F_STATIC_ASSERT((unsigned)dstage_init == 2); if ( (dctx->frameInfo.blockMode==KLZ4F_blockLinked) /* next block will use up to 64KB from previous ones */ && (dctx->dict != dctx->tmpOutBuffer) /* dictionary is not already within tmp */ diff --git a/src/third_party/librdkafka/dist/src/lz4frame.h b/src/third_party/librdkafka/dist/src/lz4frame.h index 0a43fed5753..1f6c9554145 100644 --- a/src/third_party/librdkafka/dist/src/lz4frame.h +++ b/src/third_party/librdkafka/dist/src/lz4frame.h @@ -1,7 +1,7 @@ /* - KLZ4 auto-framing library + KLZ4F - KLZ4-Frame library Header File - Copyright (C) 2011-2017, Yann Collet. + Copyright (C) 2011-2020, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without @@ -39,7 +39,7 @@ * KLZ4F also offers streaming capabilities. * * lz4.h is not required when using lz4frame.h, - * except to extract common constant such as KLZ4_VERSION_NUMBER. + * except to extract common constants such as KLZ4_VERSION_NUMBER. * */ #ifndef KLZ4F_H_09782039843 @@ -54,12 +54,12 @@ extern "C" { /** - Introduction - - lz4frame.h implements KLZ4 frame specification (doc/lz4_Frame_format.md). - lz4frame.h provides frame compression functions that take care - of encoding standard metadata alongside KLZ4-compressed blocks. -*/ + * Introduction + * + * lz4frame.h implements KLZ4 frame specification: see doc/lz4_Frame_format.md . + * KLZ4 Frames are compatible with `lz4` CLI, + * and designed to be interoperable with any system. +**/ /*-*************************************************************** * Compiler specifics @@ -210,7 +210,7 @@ KLZ4FLIB_API int KLZ4F_compressionLevel_max(void); /* v1.8.0+ */ * Returns the maximum possible compressed size with KLZ4F_compressFrame() given srcSize and preferences. * `preferencesPtr` is optional. It can be replaced by NULL, in which case, the function will assume default preferences. * Note : this result is only usable with KLZ4F_compressFrame(). - * It may also be used with KLZ4F_compressUpdate() _if no flush() operation_ is performed. + * It may also be relevant to KLZ4F_compressUpdate() _only if_ no flush() operation is ever performed. 
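+ *
+ *  Editor's note : minimal sizing sketch, illustrative only (src and srcSize are hypothetical) :
+ *      size_t const dstCapacity = KLZ4F_compressFrameBound(srcSize, NULL);  => NULL selects default preferences
+ *      void*  const dst = malloc(dstCapacity);
+ *      size_t const cSize = KLZ4F_compressFrame(dst, dstCapacity, src, srcSize, NULL);
+ *      if (KLZ4F_isError(cSize)) ...  => always test the result before using cSize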
*/ KLZ4FLIB_API size_t KLZ4F_compressFrameBound(size_t srcSize, const KLZ4F_preferences_t* preferencesPtr); @@ -230,7 +230,7 @@ KLZ4FLIB_API size_t KLZ4F_compressFrame(void* dstBuffer, size_t dstCapacity, * Advanced compression functions *************************************/ typedef struct KLZ4F_cctx_s KLZ4F_cctx; /* incomplete type */ -typedef KLZ4F_cctx* KLZ4F_compressionContext_t; /* for compatibility with previous API version */ +typedef KLZ4F_cctx* KLZ4F_compressionContext_t; /* for compatibility with older APIs, prefer using KLZ4F_cctx */ typedef struct { unsigned stableSrc; /* 1 == src content will remain present on future calls to KLZ4F_compress(); skip copying src content within tmp buffer */ @@ -243,20 +243,27 @@ typedef struct { KLZ4FLIB_API unsigned KLZ4F_getVersion(void); /*! KLZ4F_createCompressionContext() : - * The first thing to do is to create a compressionContext object, which will be used in all compression operations. - * This is achieved using KLZ4F_createCompressionContext(), which takes as argument a version. - * The version provided MUST be KLZ4F_VERSION. It is intended to track potential version mismatch, notably when using DLL. - * The function will provide a pointer to a fully allocated KLZ4F_cctx object. - * If @return != zero, there was an error during context creation. - * Object can release its memory using KLZ4F_freeCompressionContext(); - */ + * The first thing to do is to create a compressionContext object, + * which will keep track of operation state during streaming compression. + * This is achieved using KLZ4F_createCompressionContext(), which takes as argument a version, + * and a pointer to KLZ4F_cctx*, to write the resulting pointer into. + * @version provided MUST be KLZ4F_VERSION. It is intended to track potential version mismatch, notably when using DLL. + * The function provides a pointer to a fully allocated KLZ4F_cctx object. + * @cctxPtr MUST be != NULL. + * If @return != zero, context creation failed. + * A created compression context can be employed multiple times for consecutive streaming operations. + * Once all streaming compression jobs are completed, + * the state object can be released using KLZ4F_freeCompressionContext(). + * Note1 : KLZ4F_freeCompressionContext() is always successful. Its return value can be ignored. + * Note2 : KLZ4F_freeCompressionContext() works fine with NULL input pointers (do nothing). +**/ KLZ4FLIB_API KLZ4F_errorCode_t KLZ4F_createCompressionContext(KLZ4F_cctx** cctxPtr, unsigned version); KLZ4FLIB_API KLZ4F_errorCode_t KLZ4F_freeCompressionContext(KLZ4F_cctx* cctx); /*---- Compression ----*/ -#define KLZ4F_HEADER_SIZE_MIN 7 /* KLZ4 Frame header size can vary, depending on selected paramaters */ +#define KLZ4F_HEADER_SIZE_MIN 7 /* KLZ4 Frame header size can vary, depending on selected parameters */ #define KLZ4F_HEADER_SIZE_MAX 19 /* Size in bytes of a block header in little-endian format. Highest bit indicates if block data is uncompressed */ @@ -301,8 +308,9 @@ KLZ4FLIB_API size_t KLZ4F_compressBound(size_t srcSize, const KLZ4F_preferences_ * Important rule: dstCapacity MUST be large enough to ensure operation success even in worst case situations. * This value is provided by KLZ4F_compressBound(). * If this condition is not respected, KLZ4F_compress() will fail (result is an errorCode). - * KLZ4F_compressUpdate() doesn't guarantee error recovery. - * When an error occurs, compression context must be freed or resized. 
+ * After an error, the state is left undefined, and must be re-initialized or freed. + * If an uncompressed block was previously written, buffered data is flushed + * before compressed data is appended. * `cOptPtr` is optional : NULL can be provided, in which case all options are set to default. * @return : number of bytes written into `dstBuffer` (it can be zero, meaning input data was just buffered). * or an error code if it fails (which can be tested using KLZ4F_isError()) @@ -347,8 +355,12 @@ typedef struct KLZ4F_dctx_s KLZ4F_dctx; /* incomplete type */ typedef KLZ4F_dctx* KLZ4F_decompressionContext_t; /* compatibility with previous API versions */ typedef struct { - unsigned stableDst; /* pledges that last 64KB decompressed data will remain available unmodified. This optimization skips storage operations in tmp buffers. */ - unsigned reserved[3]; /* must be set to zero for forward compatibility */ + unsigned stableDst; /* pledges that last 64KB decompressed data will remain available unmodified between invocations. + * This optimization skips storage operations in tmp buffers. */ + unsigned skipChecksums; /* disable checksum calculation and verification, even when one is present in frame, to save CPU time. + * Setting this option to 1 once disables all checksums for the rest of the frame. */ + unsigned reserved1; /* must be set to zero for forward compatibility */ + unsigned reserved0; /* idem */ } KLZ4F_decompressOptions_t; @@ -356,9 +368,10 @@ typedef struct { /*! KLZ4F_createDecompressionContext() : * Create an KLZ4F_dctx object, to track all decompression operations. - * The version provided MUST be KLZ4F_VERSION. - * The function provides a pointer to an allocated and initialized KLZ4F_dctx object. - * The result is an errorCode, which can be tested using KLZ4F_isError(). + * @version provided MUST be KLZ4F_VERSION. + * @dctxPtr MUST be valid. + * The function fills @dctxPtr with the value of a pointer to an allocated and initialized KLZ4F_dctx object. + * The @return is an errorCode, which can be tested using KLZ4F_isError(). * dctx memory can be released using KLZ4F_freeDecompressionContext(); * Result of KLZ4F_freeDecompressionContext() indicates current state of decompressionContext when being released. * That is, it should be == 0 if decompression has been completed fully and correctly. @@ -371,6 +384,8 @@ KLZ4FLIB_API KLZ4F_errorCode_t KLZ4F_freeDecompressionContext(KLZ4F_dctx* dctx); * Streaming decompression functions *************************************/ +#define KLZ4F_MAGICNUMBER 0x184D2204U +#define KLZ4F_MAGIC_SKIPPABLE_START 0x184D2A50U #define KLZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH 5 /*! KLZ4F_headerSize() : v1.9.0+ @@ -386,7 +401,7 @@ KLZ4FLIB_API size_t KLZ4F_headerSize(const void* src, size_t srcSize); /*! KLZ4F_getFrameInfo() : * This function extracts frame parameters (max blockSize, dictID, etc.). - * Its usage is optional: user can call KLZ4F_decompress() directly. + * Its usage is optional: user can also invoke KLZ4F_decompress() directly. * * Extracted information will fill an existing KLZ4F_frameInfo_t structure. * This can be useful for allocation and dictionary identification purposes. @@ -427,9 +442,10 @@ KLZ4FLIB_API size_t KLZ4F_headerSize(const void* src, size_t srcSize); * note 1 : in case of error, dctx is not modified. Decoding operation can resume from beginning safely. * note 2 : frame parameters are *copied into* an already allocated KLZ4F_frameInfo_t structure.
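 *
 *  Editor's note : typical call pattern, illustrative only (dctx, src and srcSize are hypothetical) :
 *      size_t consumed = srcSize;    => in : bytes available ; out : bytes actually read
 *      KLZ4F_frameInfo_t info;
 *      size_t const r = KLZ4F_getFrameInfo(dctx, &info, src, &consumed);
 *      if (KLZ4F_isError(r)) ...     => on success, r hints at how many bytes KLZ4F_decompress() expects next
 *      src = (const char*)src + consumed; srcSize -= consumed;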
*/ -KLZ4FLIB_API size_t KLZ4F_getFrameInfo(KLZ4F_dctx* dctx, - KLZ4F_frameInfo_t* frameInfoPtr, - const void* srcBuffer, size_t* srcSizePtr); +KLZ4FLIB_API size_t +KLZ4F_getFrameInfo(KLZ4F_dctx* dctx, + KLZ4F_frameInfo_t* frameInfoPtr, + const void* srcBuffer, size_t* srcSizePtr); /*! KLZ4F_decompress() : * Call this function repetitively to regenerate data compressed in `srcBuffer`. @@ -462,10 +478,11 @@ KLZ4FLIB_API size_t KLZ4F_getFrameInfo(KLZ4F_dctx* dctx, * * After a frame is fully decoded, dctx can be used again to decompress another frame. */ -KLZ4FLIB_API size_t KLZ4F_decompress(KLZ4F_dctx* dctx, - void* dstBuffer, size_t* dstSizePtr, - const void* srcBuffer, size_t* srcSizePtr, - const KLZ4F_decompressOptions_t* dOptPtr); +KLZ4FLIB_API size_t +KLZ4F_decompress(KLZ4F_dctx* dctx, + void* dstBuffer, size_t* dstSizePtr, + const void* srcBuffer, size_t* srcSizePtr, + const KLZ4F_decompressOptions_t* dOptPtr); /*! KLZ4F_resetDecompressionContext() : added in v1.8.0 @@ -529,6 +546,8 @@ extern "C" { ITEM(ERROR_headerChecksum_invalid) \ ITEM(ERROR_contentChecksum_invalid) \ ITEM(ERROR_frameDecoding_alreadyStarted) \ + ITEM(ERROR_compressionState_uninitialized) \ + ITEM(ERROR_parameter_null) \ ITEM(ERROR_maxCode) #define KLZ4F_GENERATE_ENUM(ENUM) KLZ4F_##ENUM, @@ -539,7 +558,31 @@ typedef enum { KLZ4F_LIST_ERRORS(KLZ4F_GENERATE_ENUM) KLZ4FLIB_STATIC_API KLZ4F_errorCodes KLZ4F_getErrorCode(size_t functionResult); -KLZ4FLIB_STATIC_API size_t KLZ4F_getBlockSize(unsigned); + +/*! KLZ4F_getBlockSize() : + * Return, in scalar format (size_t), + * the maximum block size associated with blockSizeID. +**/ +KLZ4FLIB_STATIC_API size_t KLZ4F_getBlockSize(KLZ4F_blockSizeID_t blockSizeID); + +/*! KLZ4F_uncompressedUpdate() : + * KLZ4F_uncompressedUpdate() can be called repetitively to add as much uncompressed data as necessary. + * Important rule: dstCapacity MUST be large enough to store the entire source buffer, as + * no compression is done for this operation. + * If this condition is not respected, KLZ4F_uncompressedUpdate() will fail (result is an errorCode). + * After an error, the state is left undefined, and must be re-initialized or freed. + * If a compressed block was previously written, buffered data is flushed + * before uncompressed data is appended. + * This is only supported when KLZ4F_blockIndependent is used. + * `cOptPtr` is optional : NULL can be provided, in which case all options are set to default. + * @return : number of bytes written into `dstBuffer` (it can be zero, meaning input data was just buffered). + * or an error code if it fails (which can be tested using KLZ4F_isError()) + */ +KLZ4FLIB_STATIC_API size_t +KLZ4F_uncompressedUpdate(KLZ4F_cctx* cctx, + void* dstBuffer, size_t dstCapacity, + const void* srcBuffer, size_t srcSize, + const KLZ4F_compressOptions_t* cOptPtr); /********************************** * Bulk processing dictionary API @@ -583,12 +626,12 @@ KLZ4FLIB_STATIC_API void KLZ4F_freeCDict(KLZ4F_CDict* CDict); * but it's not recommended, as it's the only way to provide dictID in the frame header. * @return : number of bytes written into dstBuffer.
* or an error code if it fails (can be tested using KLZ4F_isError()) */ -KLZ4FLIB_STATIC_API size_t KLZ4F_compressFrame_usingCDict( - KLZ4F_cctx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - const KLZ4F_CDict* cdict, - const KLZ4F_preferences_t* preferencesPtr); +KLZ4FLIB_STATIC_API size_t +KLZ4F_compressFrame_usingCDict(KLZ4F_cctx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const KLZ4F_CDict* cdict, + const KLZ4F_preferences_t* preferencesPtr); /*! KLZ4F_compressBegin_usingCDict() : @@ -598,23 +641,49 @@ KLZ4FLIB_STATIC_API size_t KLZ4F_compressFrame_usingCDict( * however, it's the only way to provide dictID in the frame header. * @return : number of bytes written into dstBuffer for the header, * or an error code (which can be tested using KLZ4F_isError()) */ -KLZ4FLIB_STATIC_API size_t KLZ4F_compressBegin_usingCDict( - KLZ4F_cctx* cctx, - void* dstBuffer, size_t dstCapacity, - const KLZ4F_CDict* cdict, - const KLZ4F_preferences_t* prefsPtr); +KLZ4FLIB_STATIC_API size_t +KLZ4F_compressBegin_usingCDict(KLZ4F_cctx* cctx, + void* dstBuffer, size_t dstCapacity, + const KLZ4F_CDict* cdict, + const KLZ4F_preferences_t* prefsPtr); /*! KLZ4F_decompress_usingDict() : * Same as KLZ4F_decompress(), using a predefined dictionary. * Dictionary is used "in place", without any preprocessing. - * It must remain accessible throughout the entire frame decoding. */ -KLZ4FLIB_STATIC_API size_t KLZ4F_decompress_usingDict( - KLZ4F_dctx* dctxPtr, - void* dstBuffer, size_t* dstSizePtr, - const void* srcBuffer, size_t* srcSizePtr, - const void* dict, size_t dictSize, - const KLZ4F_decompressOptions_t* decompressOptionsPtr); +** It must remain accessible throughout the entire frame decoding. */ +KLZ4FLIB_STATIC_API size_t +KLZ4F_decompress_usingDict(KLZ4F_dctx* dctxPtr, + void* dstBuffer, size_t* dstSizePtr, + const void* srcBuffer, size_t* srcSizePtr, + const void* dict, size_t dictSize, + const KLZ4F_decompressOptions_t* decompressOptionsPtr); + + +/*! Custom memory allocation : + * These prototypes make it possible to pass custom allocation/free functions. + * KLZ4F_customMem is provided at state creation time, using KLZ4F_create*_advanced() listed below. + * All allocation/free operations will be completed using these custom variants instead of regular ones. 
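+ *
+ *  Editor's note : hookup sketch, illustrative only (arena_t, arena_alloc, arena_free and arena are hypothetical) :
+ *      static void* myAlloc(void* opaque, size_t s) { return arena_alloc((arena_t*)opaque, s); }
+ *      static void  myFree (void* opaque, void* p)  { arena_free((arena_t*)opaque, p); }
+ *      KLZ4F_CustomMem const cmem = { myAlloc, NULL, myFree, &arena };  => NULL customCalloc falls back to customAlloc + memset
+ *      KLZ4F_dctx* const dctx = KLZ4F_createDecompressionContext_advanced(cmem, KLZ4F_VERSION);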
+ */ +typedef void* (*KLZ4F_AllocFunction) (void* opaqueState, size_t size); +typedef void* (*KLZ4F_CallocFunction) (void* opaqueState, size_t size); +typedef void (*KLZ4F_FreeFunction) (void* opaqueState, void* address); +typedef struct { + KLZ4F_AllocFunction customAlloc; + KLZ4F_CallocFunction customCalloc; /* optional; when not defined, uses customAlloc + memset */ + KLZ4F_FreeFunction customFree; + void* opaqueState; +} KLZ4F_CustomMem; +static +#ifdef __GNUC__ +__attribute__((__unused__)) +#endif +KLZ4F_CustomMem const KLZ4F_defaultCMem = { NULL, NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */ + +KLZ4FLIB_STATIC_API KLZ4F_cctx* KLZ4F_createCompressionContext_advanced(KLZ4F_CustomMem customMem, unsigned version); +KLZ4FLIB_STATIC_API KLZ4F_dctx* KLZ4F_createDecompressionContext_advanced(KLZ4F_CustomMem customMem, unsigned version); +KLZ4FLIB_STATIC_API KLZ4F_CDict* KLZ4F_createCDict_advanced(KLZ4F_CustomMem customMem, const void* dictBuffer, size_t dictSize); + #if defined (__cplusplus) } diff --git a/src/third_party/librdkafka/dist/src/lz4frame_static.h b/src/third_party/librdkafka/dist/src/lz4frame_static.h index d8ce83f16e1..0c36991877d 100644 --- a/src/third_party/librdkafka/dist/src/lz4frame_static.h +++ b/src/third_party/librdkafka/dist/src/lz4frame_static.h @@ -1,7 +1,7 @@ /* KLZ4 auto-framing library Header File for static linking only - Copyright (C) 2011-2016, Yann Collet. + Copyright (C) 2011-2020, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) diff --git a/src/third_party/librdkafka/dist/src/lz4hc.c b/src/third_party/librdkafka/dist/src/lz4hc.c index f6344a40f1b..66b87f917c0 100644 --- a/src/third_party/librdkafka/dist/src/lz4hc.c +++ b/src/third_party/librdkafka/dist/src/lz4hc.c @@ -1,6 +1,6 @@ /* KLZ4 HC - High Compression Mode of KLZ4 - Copyright (C) 2011-2017, Yann Collet. + Copyright (C) 2011-2020, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) @@ -42,7 +42,7 @@ * Select how default compression function will allocate workplace memory, * in stack (0:fastest), or in heap (1:requires malloc()). * Since workplace is rather large, heap mode is recommended. 
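 *  (editor's note : with KLZ4HC_HEAPMODE==0 the state lives on the stack instead;
 *   at roughly 256 KB per KLZ4_streamHC_t, that makes stack overflow a real concern)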
- */ +**/ #ifndef KLZ4HC_HEAPMODE # define KLZ4HC_HEAPMODE 1 #endif @@ -99,18 +99,20 @@ static void KLZ4HC_clearTables (KLZ4HC_CCtx_internal* hc4) static void KLZ4HC_init_internal (KLZ4HC_CCtx_internal* hc4, const BYTE* start) { - uptrval startingOffset = (uptrval)(hc4->end - hc4->base); - if (startingOffset > 1 GB) { + size_t const bufferSize = (size_t)(hc4->end - hc4->prefixStart); + size_t newStartingOffset = bufferSize + hc4->dictLimit; + assert(newStartingOffset >= bufferSize); /* check overflow */ + if (newStartingOffset > 1 GB) { KLZ4HC_clearTables(hc4); - startingOffset = 0; + newStartingOffset = 0; } - startingOffset += 64 KB; - hc4->nextToUpdate = (U32) startingOffset; - hc4->base = start - startingOffset; + newStartingOffset += 64 KB; + hc4->nextToUpdate = (U32)newStartingOffset; + hc4->prefixStart = start; hc4->end = start; - hc4->dictBase = start - startingOffset; - hc4->dictLimit = (U32) startingOffset; - hc4->lowLimit = (U32) startingOffset; + hc4->dictStart = start; + hc4->dictLimit = (U32)newStartingOffset; + hc4->lowLimit = (U32)newStartingOffset; } @@ -119,12 +121,15 @@ KLZ4_FORCE_INLINE void KLZ4HC_Insert (KLZ4HC_CCtx_internal* hc4, const BYTE* ip) { U16* const chainTable = hc4->chainTable; U32* const hashTable = hc4->hashTable; - const BYTE* const base = hc4->base; - U32 const target = (U32)(ip - base); + const BYTE* const prefixPtr = hc4->prefixStart; + U32 const prefixIdx = hc4->dictLimit; + U32 const target = (U32)(ip - prefixPtr) + prefixIdx; U32 idx = hc4->nextToUpdate; + assert(ip >= prefixPtr); + assert(target >= prefixIdx); while (idx < target) { - U32 const h = KLZ4HC_hashPtr(base+idx); + U32 const h = KLZ4HC_hashPtr(prefixPtr+idx-prefixIdx); size_t delta = idx - hashTable[h]; if (delta>KLZ4_DISTANCE_MAX) delta = KLZ4_DISTANCE_MAX; DELTANEXTU16(chainTable, idx) = (U16)delta; @@ -193,15 +198,14 @@ KLZ4HC_countPattern(const BYTE* ip, const BYTE* const iEnd, U32 const pattern32) BYTE const byte = (BYTE)(pattern >> bitOffset); if (*ip != byte) break; ip ++; bitOffset -= 8; - } - } + } } return (unsigned)(ip - iStart); } /* KLZ4HC_reverseCountPattern() : * pattern must be a sample of repetitive pattern of length 1, 2 or 4 (but not 3!) 
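 *  (editor's note : e.g. a 2-byte pattern tiles the 4-byte sample as ABAB ;
 *   a 3-byte pattern cannot evenly tile 4 bytes, hence its exclusion)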
- * read using natural platform endianess */ + * read using natural platform endianness */ static unsigned KLZ4HC_reverseCountPattern(const BYTE* ip, const BYTE* const iLow, U32 pattern) { @@ -211,7 +215,7 @@ KLZ4HC_reverseCountPattern(const BYTE* ip, const BYTE* const iLow, U32 pattern) if (KLZ4_read32(ip-4) != pattern) break; ip -= 4; } - { const BYTE* bytePtr = (const BYTE*)(&pattern) + 3; /* works for any endianess */ + { const BYTE* bytePtr = (const BYTE*)(&pattern) + 3; /* works for any endianness */ while (likely(ip>iLow)) { if (ip[-1] != *bytePtr) break; ip--; bytePtr--; @@ -234,28 +238,28 @@ typedef enum { favorCompressionRatio=0, favorDecompressionSpeed } HCfavor_e; KLZ4_FORCE_INLINE int KLZ4HC_InsertAndGetWiderMatch ( - KLZ4HC_CCtx_internal* hc4, - const BYTE* const ip, - const BYTE* const iLowLimit, - const BYTE* const iHighLimit, - int longest, - const BYTE** matchpos, - const BYTE** startpos, - const int maxNbAttempts, - const int patternAnalysis, - const int chainSwap, - const dictCtx_directive dict, - const HCfavor_e favorDecSpeed) + KLZ4HC_CCtx_internal* const hc4, + const BYTE* const ip, + const BYTE* const iLowLimit, const BYTE* const iHighLimit, + int longest, + const BYTE** matchpos, + const BYTE** startpos, + const int maxNbAttempts, + const int patternAnalysis, const int chainSwap, + const dictCtx_directive dict, + const HCfavor_e favorDecSpeed) { U16* const chainTable = hc4->chainTable; U32* const HashTable = hc4->hashTable; const KLZ4HC_CCtx_internal * const dictCtx = hc4->dictCtx; - const BYTE* const base = hc4->base; - const U32 dictLimit = hc4->dictLimit; - const BYTE* const lowPrefixPtr = base + dictLimit; - const U32 ipIndex = (U32)(ip - base); - const U32 lowestMatchIndex = (hc4->lowLimit + (KLZ4_DISTANCE_MAX + 1) > ipIndex) ? hc4->lowLimit : ipIndex - KLZ4_DISTANCE_MAX; - const BYTE* const dictBase = hc4->dictBase; + const BYTE* const prefixPtr = hc4->prefixStart; + const U32 prefixIdx = hc4->dictLimit; + const U32 ipIndex = (U32)(ip - prefixPtr) + prefixIdx; + const int withinStartDistance = (hc4->lowLimit + (KLZ4_DISTANCE_MAX + 1) > ipIndex); + const U32 lowestMatchIndex = (withinStartDistance) ? hc4->lowLimit : ipIndex - KLZ4_DISTANCE_MAX; + const BYTE* const dictStart = hc4->dictStart; + const U32 dictIdx = hc4->lowLimit; + const BYTE* const dictEnd = dictStart + prefixIdx - dictIdx; int const lookBackLength = (int)(ip-iLowLimit); int nbAttempts = maxNbAttempts; U32 matchChainPos = 0; @@ -277,14 +281,13 @@ KLZ4HC_InsertAndGetWiderMatch ( assert(matchIndex < ipIndex); if (favorDecSpeed && (ipIndex - matchIndex < 8)) { /* do nothing */ - } else if (matchIndex >= dictLimit) { /* within current Prefix */ - const BYTE* const matchPtr = base + matchIndex; - assert(matchPtr >= lowPrefixPtr); + } else if (matchIndex >= prefixIdx) { /* within current Prefix */ + const BYTE* const matchPtr = prefixPtr + matchIndex - prefixIdx; assert(matchPtr < ip); assert(longest >= 1); if (KLZ4_read16(iLowLimit + longest - 1) == KLZ4_read16(matchPtr - lookBackLength + longest - 1)) { if (KLZ4_read32(matchPtr) == pattern) { - int const back = lookBackLength ? KLZ4HC_countBack(ip, matchPtr, iLowLimit, lowPrefixPtr) : 0; + int const back = lookBackLength ? 
KLZ4HC_countBack(ip, matchPtr, iLowLimit, prefixPtr) : 0; matchLength = MINMATCH + (int)KLZ4_count(ip+MINMATCH, matchPtr+MINMATCH, iHighLimit); matchLength -= back; if (matchLength > longest) { @@ -293,24 +296,25 @@ KLZ4HC_InsertAndGetWiderMatch ( *startpos = ip + back; } } } } else { /* lowestMatchIndex <= matchIndex < dictLimit */ - const BYTE* const matchPtr = dictBase + matchIndex; - if (KLZ4_read32(matchPtr) == pattern) { - const BYTE* const dictStart = dictBase + hc4->lowLimit; + const BYTE* const matchPtr = dictStart + (matchIndex - dictIdx); + assert(matchIndex >= dictIdx); + if ( likely(matchIndex <= prefixIdx - 4) + && (KLZ4_read32(matchPtr) == pattern) ) { int back = 0; - const BYTE* vLimit = ip + (dictLimit - matchIndex); + const BYTE* vLimit = ip + (prefixIdx - matchIndex); if (vLimit > iHighLimit) vLimit = iHighLimit; matchLength = (int)KLZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH; if ((ip+matchLength == vLimit) && (vLimit < iHighLimit)) - matchLength += KLZ4_count(ip+matchLength, lowPrefixPtr, iHighLimit); + matchLength += KLZ4_count(ip+matchLength, prefixPtr, iHighLimit); back = lookBackLength ? KLZ4HC_countBack(ip, matchPtr, iLowLimit, dictStart) : 0; matchLength -= back; if (matchLength > longest) { longest = matchLength; - *matchpos = base + matchIndex + back; /* virtual pos, relative to ip, to retrieve offset */ + *matchpos = prefixPtr - prefixIdx + matchIndex + back; /* virtual pos, relative to ip, to retrieve offset */ *startpos = ip + back; } } } - if (chainSwap && matchLength==longest) { /* better match => select a better chain */ + if (chainSwap && matchLength==longest) { /* better match => select a better chain */ assert(lookBackLength==0); /* search forward only */ if (matchIndex + (U32)longest <= ipIndex) { int const kTrigger = 4; @@ -326,8 +330,7 @@ KLZ4HC_InsertAndGetWiderMatch ( distanceToNextMatch = candidateDist; matchChainPos = (U32)pos; accel = 1 << kTrigger; - } - } + } } if (distanceToNextMatch > 1) { if (distanceToNextMatch > matchIndex) break; /* avoid overflow */ matchIndex -= distanceToNextMatch; @@ -347,23 +350,24 @@ KLZ4HC_InsertAndGetWiderMatch ( repeat = rep_not; } } if ( (repeat == rep_confirmed) && (matchCandidateIdx >= lowestMatchIndex) - && KLZ4HC_protectDictEnd(dictLimit, matchCandidateIdx) ) { - const int extDict = matchCandidateIdx < dictLimit; - const BYTE* const matchPtr = (extDict ? dictBase : base) + matchCandidateIdx; + && KLZ4HC_protectDictEnd(prefixIdx, matchCandidateIdx) ) { + const int extDict = matchCandidateIdx < prefixIdx; + const BYTE* const matchPtr = (extDict ? dictStart - dictIdx : prefixPtr - prefixIdx) + matchCandidateIdx; if (KLZ4_read32(matchPtr) == pattern) { /* good candidate */ - const BYTE* const dictStart = dictBase + hc4->lowLimit; - const BYTE* const iLimit = extDict ? dictBase + dictLimit : iHighLimit; + const BYTE* const iLimit = extDict ? dictEnd : iHighLimit; size_t forwardPatternLength = KLZ4HC_countPattern(matchPtr+sizeof(pattern), iLimit, pattern) + sizeof(pattern); if (extDict && matchPtr + forwardPatternLength == iLimit) { U32 const rotatedPattern = KLZ4HC_rotatePattern(forwardPatternLength, pattern); - forwardPatternLength += KLZ4HC_countPattern(lowPrefixPtr, iHighLimit, rotatedPattern); + forwardPatternLength += KLZ4HC_countPattern(prefixPtr, iHighLimit, rotatedPattern); } - { const BYTE* const lowestMatchPtr = extDict ? dictStart : lowPrefixPtr; + { const BYTE* const lowestMatchPtr = extDict ? 
dictStart : prefixPtr; size_t backLength = KLZ4HC_reverseCountPattern(matchPtr, lowestMatchPtr, pattern); size_t currentSegmentLength; - if (!extDict && matchPtr - backLength == lowPrefixPtr && hc4->lowLimit < dictLimit) { + if (!extDict + && matchPtr - backLength == prefixPtr + && dictIdx < prefixIdx) { U32 const rotatedPattern = KLZ4HC_rotatePattern((U32)(-(int)backLength), pattern); - backLength += KLZ4HC_reverseCountPattern(dictBase + dictLimit, dictStart, rotatedPattern); + backLength += KLZ4HC_reverseCountPattern(dictEnd, dictStart, rotatedPattern); } /* Limit backLength not go further than lowestMatchIndex */ backLength = matchCandidateIdx - MAX(matchCandidateIdx - (U32)backLength, lowestMatchIndex); @@ -373,28 +377,28 @@ KLZ4HC_InsertAndGetWiderMatch ( if ( (currentSegmentLength >= srcPatternLength) /* current pattern segment large enough to contain full srcPatternLength */ && (forwardPatternLength <= srcPatternLength) ) { /* haven't reached this position yet */ U32 const newMatchIndex = matchCandidateIdx + (U32)forwardPatternLength - (U32)srcPatternLength; /* best position, full pattern, might be followed by more match */ - if (KLZ4HC_protectDictEnd(dictLimit, newMatchIndex)) + if (KLZ4HC_protectDictEnd(prefixIdx, newMatchIndex)) matchIndex = newMatchIndex; else { /* Can only happen if started in the prefix */ - assert(newMatchIndex >= dictLimit - 3 && newMatchIndex < dictLimit && !extDict); - matchIndex = dictLimit; + assert(newMatchIndex >= prefixIdx - 3 && newMatchIndex < prefixIdx && !extDict); + matchIndex = prefixIdx; } } else { U32 const newMatchIndex = matchCandidateIdx - (U32)backLength; /* farthest position in current segment, will find a match of length currentSegmentLength + maybe some back */ - if (!KLZ4HC_protectDictEnd(dictLimit, newMatchIndex)) { - assert(newMatchIndex >= dictLimit - 3 && newMatchIndex < dictLimit && !extDict); - matchIndex = dictLimit; + if (!KLZ4HC_protectDictEnd(prefixIdx, newMatchIndex)) { + assert(newMatchIndex >= prefixIdx - 3 && newMatchIndex < prefixIdx && !extDict); + matchIndex = prefixIdx; } else { matchIndex = newMatchIndex; if (lookBackLength==0) { /* no back possible */ size_t const maxML = MIN(currentSegmentLength, srcPatternLength); if ((size_t)longest < maxML) { - assert(base + matchIndex != ip); - if ((size_t)(ip - base) - matchIndex > KLZ4_DISTANCE_MAX) break; + assert(prefixPtr - prefixIdx + matchIndex != ip); + if ((size_t)(ip - prefixPtr) + prefixIdx - matchIndex > KLZ4_DISTANCE_MAX) break; assert(maxML < 2 GB); longest = (int)maxML; - *matchpos = base + matchIndex; /* virtual pos, relative to ip, to retrieve offset */ + *matchpos = prefixPtr - prefixIdx + matchIndex; /* virtual pos, relative to ip, to retrieve offset */ *startpos = ip; } { U32 const distToNextPattern = DELTANEXTU16(chainTable, matchIndex); @@ -413,12 +417,12 @@ KLZ4HC_InsertAndGetWiderMatch ( if ( dict == usingDictCtxHc && nbAttempts > 0 && ipIndex - lowestMatchIndex < KLZ4_DISTANCE_MAX) { - size_t const dictEndOffset = (size_t)(dictCtx->end - dictCtx->base); + size_t const dictEndOffset = (size_t)(dictCtx->end - dictCtx->prefixStart) + dictCtx->dictLimit; U32 dictMatchIndex = dictCtx->hashTable[KLZ4HC_hashPtr(ip)]; assert(dictEndOffset <= 1 GB); matchIndex = dictMatchIndex + lowestMatchIndex - (U32)dictEndOffset; while (ipIndex - matchIndex <= KLZ4_DISTANCE_MAX && nbAttempts--) { - const BYTE* const matchPtr = dictCtx->base + dictMatchIndex; + const BYTE* const matchPtr = dictCtx->prefixStart - dictCtx->dictLimit + dictMatchIndex; if (KLZ4_read32(matchPtr) 
== pattern) { int mlt; @@ -426,11 +430,11 @@ KLZ4HC_InsertAndGetWiderMatch ( const BYTE* vLimit = ip + (dictEndOffset - dictMatchIndex); if (vLimit > iHighLimit) vLimit = iHighLimit; mlt = (int)KLZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH; - back = lookBackLength ? KLZ4HC_countBack(ip, matchPtr, iLowLimit, dictCtx->base + dictCtx->dictLimit) : 0; + back = lookBackLength ? KLZ4HC_countBack(ip, matchPtr, iLowLimit, dictCtx->prefixStart) : 0; mlt -= back; if (mlt > longest) { longest = mlt; - *matchpos = base + matchIndex + back; + *matchpos = prefixPtr - prefixIdx + matchIndex + back; *startpos = ip + back; } } @@ -442,13 +446,13 @@ KLZ4HC_InsertAndGetWiderMatch ( return longest; } -KLZ4_FORCE_INLINE -int KLZ4HC_InsertAndFindBestMatch(KLZ4HC_CCtx_internal* const hc4, /* Index table will be updated */ - const BYTE* const ip, const BYTE* const iLimit, - const BYTE** matchpos, - const int maxNbAttempts, - const int patternAnalysis, - const dictCtx_directive dict) +KLZ4_FORCE_INLINE int +KLZ4HC_InsertAndFindBestMatch(KLZ4HC_CCtx_internal* const hc4, /* Index table will be updated */ + const BYTE* const ip, const BYTE* const iLimit, + const BYTE** matchpos, + const int maxNbAttempts, + const int patternAnalysis, + const dictCtx_directive dict) { const BYTE* uselessPtr = ip; /* note : KLZ4HC_InsertAndGetWiderMatch() is able to modify the starting position of a match (*startpos), @@ -751,7 +755,7 @@ _last_literals: } else { *op++ = (BYTE)(lastRunSize << ML_BITS); } - memcpy(op, anchor, lastRunSize); + KLZ4_memcpy(op, anchor, lastRunSize); op += lastRunSize; } @@ -884,13 +888,13 @@ KLZ4HC_compress_generic_dictCtx ( limitedOutput_directive limit ) { - const size_t position = (size_t)(ctx->end - ctx->base) - ctx->lowLimit; + const size_t position = (size_t)(ctx->end - ctx->prefixStart) + (ctx->dictLimit - ctx->lowLimit); assert(ctx->dictCtx != NULL); if (position >= 64 KB) { ctx->dictCtx = NULL; return KLZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); } else if (position == 0 && *srcSizePtr > 4 KB) { - memcpy(ctx, ctx->dictCtx, sizeof(KLZ4HC_CCtx_internal)); + KLZ4_memcpy(ctx, ctx->dictCtx, sizeof(KLZ4HC_CCtx_internal)); KLZ4HC_setExternalDict(ctx, (const BYTE *)src); ctx->compressionLevel = (short)cLevel; return KLZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); @@ -953,13 +957,15 @@ int KLZ4_compress_HC_extStateHC (void* state, const char* src, char* dst, int sr int KLZ4_compress_HC(const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel) { + int cSize; #if defined(KLZ4HC_HEAPMODE) && KLZ4HC_HEAPMODE==1 KLZ4_streamHC_t* const statePtr = (KLZ4_streamHC_t*)ALLOC(sizeof(KLZ4_streamHC_t)); + if (statePtr==NULL) return 0; #else KLZ4_streamHC_t state; KLZ4_streamHC_t* const statePtr = &state; #endif - int const cSize = KLZ4_compress_HC_extStateHC(statePtr, src, dst, srcSize, dstCapacity, compressionLevel); + cSize = KLZ4_compress_HC_extStateHC(statePtr, src, dst, srcSize, dstCapacity, compressionLevel); #if defined(KLZ4HC_HEAPMODE) && KLZ4HC_HEAPMODE==1 FREEMEM(statePtr); #endif @@ -982,6 +988,7 @@ int KLZ4_compress_HC_destSize(void* state, const char* source, char* dest, int* * Streaming Functions **************************************/ /* allocation */ +#if !defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) KLZ4_streamHC_t* KLZ4_createStreamHC(void) { KLZ4_streamHC_t* const state = @@ -998,13 +1005,12 @@ int KLZ4_freeStreamHC (KLZ4_streamHC_t* KLZ4_streamHCPtr) 
FREEMEM(KLZ4_streamHCPtr); return 0; } +#endif KLZ4_streamHC_t* KLZ4_initStreamHC (void* buffer, size_t size) { KLZ4_streamHC_t* const KLZ4_streamHCPtr = (KLZ4_streamHC_t*)buffer; - /* if compilation fails here, KLZ4_STREAMHCSIZE must be increased */ - KLZ4_STATIC_ASSERT(sizeof(KLZ4HC_CCtx_internal) <= KLZ4_STREAMHCSIZE); DEBUGLOG(4, "KLZ4_initStreamHC(%p, %u)", buffer, (unsigned)size); /* check conditions */ if (buffer == NULL) return NULL; @@ -1030,9 +1036,13 @@ void KLZ4_resetStreamHC_fast (KLZ4_streamHC_t* KLZ4_streamHCPtr, int compression if (KLZ4_streamHCPtr->internal_donotuse.dirty) { KLZ4_initStreamHC(KLZ4_streamHCPtr, sizeof(*KLZ4_streamHCPtr)); } else { - /* preserve end - base : can trigger clearTable's threshold */ - KLZ4_streamHCPtr->internal_donotuse.end -= (uptrval)KLZ4_streamHCPtr->internal_donotuse.base; - KLZ4_streamHCPtr->internal_donotuse.base = NULL; + /* preserve end - prefixStart : can trigger clearTable's threshold */ + if (KLZ4_streamHCPtr->internal_donotuse.end != NULL) { + KLZ4_streamHCPtr->internal_donotuse.end -= (uptrval)KLZ4_streamHCPtr->internal_donotuse.prefixStart; + } else { + assert(KLZ4_streamHCPtr->internal_donotuse.prefixStart == NULL); + } + KLZ4_streamHCPtr->internal_donotuse.prefixStart = NULL; KLZ4_streamHCPtr->internal_donotuse.dictCtx = NULL; } KLZ4_setCompressionLevel(KLZ4_streamHCPtr, compressionLevel); @@ -1083,14 +1093,14 @@ void KLZ4_attach_HC_dictionary(KLZ4_streamHC_t *working_stream, const KLZ4_strea static void KLZ4HC_setExternalDict(KLZ4HC_CCtx_internal* ctxPtr, const BYTE* newBlock) { DEBUGLOG(4, "KLZ4HC_setExternalDict(%p, %p)", ctxPtr, newBlock); - if (ctxPtr->end >= ctxPtr->base + ctxPtr->dictLimit + 4) + if (ctxPtr->end >= ctxPtr->prefixStart + 4) KLZ4HC_Insert (ctxPtr, ctxPtr->end-3); /* Referencing remaining dictionary content */ /* Only one memory segment for extDict, so any previous extDict is lost at this stage */ ctxPtr->lowLimit = ctxPtr->dictLimit; - ctxPtr->dictLimit = (U32)(ctxPtr->end - ctxPtr->base); - ctxPtr->dictBase = ctxPtr->base; - ctxPtr->base = newBlock - ctxPtr->dictLimit; + ctxPtr->dictStart = ctxPtr->prefixStart; + ctxPtr->dictLimit += (U32)(ctxPtr->end - ctxPtr->prefixStart); + ctxPtr->prefixStart = newBlock; ctxPtr->end = newBlock; ctxPtr->nextToUpdate = ctxPtr->dictLimit; /* match referencing will resume from there */ @@ -1109,11 +1119,11 @@ KLZ4_compressHC_continue_generic (KLZ4_streamHC_t* KLZ4_streamHCPtr, KLZ4_streamHCPtr, src, *srcSizePtr, limit); assert(ctxPtr != NULL); /* auto-init if forgotten */ - if (ctxPtr->base == NULL) KLZ4HC_init_internal (ctxPtr, (const BYTE*) src); + if (ctxPtr->prefixStart == NULL) KLZ4HC_init_internal (ctxPtr, (const BYTE*) src); /* Check overflow */ - if ((size_t)(ctxPtr->end - ctxPtr->base) > 2 GB) { - size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->base) - ctxPtr->dictLimit; + if ((size_t)(ctxPtr->end - ctxPtr->prefixStart) + ctxPtr->dictLimit > 2 GB) { + size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->prefixStart); if (dictSize > 64 KB) dictSize = 64 KB; KLZ4_loadDictHC(KLZ4_streamHCPtr, (const char*)(ctxPtr->end) - dictSize, (int)dictSize); } @@ -1124,13 +1134,16 @@ KLZ4_compressHC_continue_generic (KLZ4_streamHC_t* KLZ4_streamHCPtr, /* Check overlapping input/dictionary space */ { const BYTE* sourceEnd = (const BYTE*) src + *srcSizePtr; - const BYTE* const dictBegin = ctxPtr->dictBase + ctxPtr->lowLimit; - const BYTE* const dictEnd = ctxPtr->dictBase + ctxPtr->dictLimit; + const BYTE* const dictBegin = ctxPtr->dictStart; + const BYTE* const dictEnd = 
ctxPtr->dictStart + (ctxPtr->dictLimit - ctxPtr->lowLimit); if ((sourceEnd > dictBegin) && ((const BYTE*)src < dictEnd)) { if (sourceEnd > dictEnd) sourceEnd = dictEnd; - ctxPtr->lowLimit = (U32)(sourceEnd - ctxPtr->dictBase); - if (ctxPtr->dictLimit - ctxPtr->lowLimit < 4) ctxPtr->lowLimit = ctxPtr->dictLimit; - } } + ctxPtr->lowLimit += (U32)(sourceEnd - ctxPtr->dictStart); + ctxPtr->dictStart += (U32)(sourceEnd - ctxPtr->dictStart); + if (ctxPtr->dictLimit - ctxPtr->lowLimit < 4) { + ctxPtr->lowLimit = ctxPtr->dictLimit; + ctxPtr->dictStart = ctxPtr->prefixStart; + } } } return KLZ4HC_compress_generic (ctxPtr, src, dst, srcSizePtr, dstCapacity, ctxPtr->compressionLevel, limit); } @@ -1158,7 +1171,7 @@ int KLZ4_compress_HC_continue_destSize (KLZ4_streamHC_t* KLZ4_streamHCPtr, const int KLZ4_saveDictHC (KLZ4_streamHC_t* KLZ4_streamHCPtr, char* safeBuffer, int dictSize) { KLZ4HC_CCtx_internal* const streamPtr = &KLZ4_streamHCPtr->internal_donotuse; - int const prefixSize = (int)(streamPtr->end - (streamPtr->base + streamPtr->dictLimit)); + int const prefixSize = (int)(streamPtr->end - streamPtr->prefixStart); DEBUGLOG(5, "KLZ4_saveDictHC(%p, %p, %d)", KLZ4_streamHCPtr, safeBuffer, dictSize); assert(prefixSize >= 0); if (dictSize > 64 KB) dictSize = 64 KB; @@ -1166,12 +1179,13 @@ int KLZ4_saveDictHC (KLZ4_streamHC_t* KLZ4_streamHCPtr, char* safeBuffer, int di if (dictSize > prefixSize) dictSize = prefixSize; if (safeBuffer == NULL) assert(dictSize == 0); if (dictSize > 0) - memmove(safeBuffer, streamPtr->end - dictSize, dictSize); - { U32 const endIndex = (U32)(streamPtr->end - streamPtr->base); + KLZ4_memmove(safeBuffer, streamPtr->end - dictSize, dictSize); + { U32 const endIndex = (U32)(streamPtr->end - streamPtr->prefixStart) + streamPtr->dictLimit; streamPtr->end = (const BYTE*)safeBuffer + dictSize; - streamPtr->base = streamPtr->end - endIndex; + streamPtr->prefixStart = streamPtr->end - dictSize; streamPtr->dictLimit = endIndex - (U32)dictSize; streamPtr->lowLimit = endIndex - (U32)dictSize; + streamPtr->dictStart = streamPtr->prefixStart; if (streamPtr->nextToUpdate < streamPtr->dictLimit) streamPtr->nextToUpdate = streamPtr->dictLimit; } @@ -1199,7 +1213,7 @@ int KLZ4_compressHC_limitedOutput_continue (KLZ4_streamHC_t* ctx, const char* sr /* Deprecated streaming functions */ -int KLZ4_sizeofStreamStateHC(void) { return KLZ4_STREAMHCSIZE; } +int KLZ4_sizeofStreamStateHC(void) { return sizeof(KLZ4_streamHC_t); } /* state is presumed correctly sized, aka >= sizeof(KLZ4_streamHC_t) * @return : 0 on success, !=0 if error */ @@ -1211,6 +1225,7 @@ int KLZ4_resetStreamStateHC(void* state, char* inputBuffer) return 0; } +#if !defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) void* KLZ4_createHC (const char* inputBuffer) { KLZ4_streamHC_t* const hc4 = KLZ4_createStreamHC(); @@ -1225,6 +1240,7 @@ int KLZ4_freeHC (void* KLZ4HC_Data) FREEMEM(KLZ4HC_Data); return 0; } +#endif int KLZ4_compressHC2_continue (void* KLZ4HC_Data, const char* src, char* dst, int srcSize, int cLevel) { @@ -1238,11 +1254,11 @@ int KLZ4_compressHC2_limitedOutput_continue (void* KLZ4HC_Data, const char* src, char* KLZ4_slideInputBufferHC(void* KLZ4HC_Data) { - KLZ4_streamHC_t *ctx = (KLZ4_streamHC_t*)KLZ4HC_Data; - const BYTE *bufferStart = ctx->internal_donotuse.base + ctx->internal_donotuse.lowLimit; + KLZ4_streamHC_t* const ctx = (KLZ4_streamHC_t*)KLZ4HC_Data; + const BYTE* bufferStart = ctx->internal_donotuse.prefixStart - ctx->internal_donotuse.dictLimit + ctx->internal_donotuse.lowLimit; 
KLZ4_resetStreamHC_fast(ctx, ctx->internal_donotuse.compressionLevel); /* avoid const char * -> char * conversion warning :( */ - return (char *)(uptrval)bufferStart; + return (char*)(uptrval)bufferStart; } @@ -1325,7 +1341,7 @@ static int KLZ4HC_compress_optimal ( KLZ4HC_CCtx_internal* ctx, { int retval = 0; #define TRAILING_LITERALS 3 -#ifdef KLZ4HC_HEAPMODE +#if defined(KLZ4HC_HEAPMODE) && KLZ4HC_HEAPMODE==1 KLZ4HC_optimal_t* const opt = (KLZ4HC_optimal_t*)ALLOC(sizeof(KLZ4HC_optimal_t) * (KLZ4_OPT_NUM + TRAILING_LITERALS)); #else KLZ4HC_optimal_t opt[KLZ4_OPT_NUM + TRAILING_LITERALS]; /* ~64 KB, which is a bit large for stack... */ @@ -1343,7 +1359,7 @@ static int KLZ4HC_compress_optimal ( KLZ4HC_CCtx_internal* ctx, const BYTE* ovref = NULL; /* init */ -#ifdef KLZ4HC_HEAPMODE +#if defined(KLZ4HC_HEAPMODE) && KLZ4HC_HEAPMODE==1 if (opt == NULL) goto _return_label; #endif DEBUGLOG(5, "KLZ4HC_compress_optimal(dst=%p, dstCapa=%u)", dst, (unsigned)dstCapacity); @@ -1575,7 +1591,7 @@ _last_literals: } else { *op++ = (BYTE)(lastRunSize << ML_BITS); } - memcpy(op, anchor, lastRunSize); + KLZ4_memcpy(op, anchor, lastRunSize); op += lastRunSize; } @@ -1608,7 +1624,7 @@ if (limit == fillOutput) { goto _last_literals; } _return_label: -#ifdef KLZ4HC_HEAPMODE +#if defined(KLZ4HC_HEAPMODE) && KLZ4HC_HEAPMODE==1 FREEMEM(opt); #endif return retval; diff --git a/src/third_party/librdkafka/dist/src/lz4hc.h b/src/third_party/librdkafka/dist/src/lz4hc.h index df627d0dd28..7558bf88180 100644 --- a/src/third_party/librdkafka/dist/src/lz4hc.h +++ b/src/third_party/librdkafka/dist/src/lz4hc.h @@ -1,7 +1,7 @@ /* KLZ4 HC - High Compression Mode of KLZ4 Header File - Copyright (C) 2011-2017, Yann Collet. + Copyright (C) 2011-2020, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without @@ -198,14 +198,17 @@ KLZ4LIB_API int KLZ4_saveDictHC (KLZ4_streamHC_t* streamHCPtr, char* safeBuffer, #define KLZ4HC_HASH_MASK (KLZ4HC_HASHTABLESIZE - 1) +/* Never ever use these definitions directly ! + * Declare or allocate an KLZ4_streamHC_t instead. +**/ typedef struct KLZ4HC_CCtx_internal KLZ4HC_CCtx_internal; struct KLZ4HC_CCtx_internal { KLZ4_u32 hashTable[KLZ4HC_HASHTABLESIZE]; KLZ4_u16 chainTable[KLZ4HC_MAXD]; const KLZ4_byte* end; /* next block here to continue on current prefix */ - const KLZ4_byte* base; /* All index relative to this position */ - const KLZ4_byte* dictBase; /* alternate base for extDict */ + const KLZ4_byte* prefixStart; /* Indexes relative to this position */ + const KLZ4_byte* dictStart; /* alternate reference for extDict */ KLZ4_u32 dictLimit; /* below that point, need extDict */ KLZ4_u32 lowLimit; /* below that point, no more dict */ KLZ4_u32 nextToUpdate; /* index from which to continue dictionary update */ @@ -216,20 +219,15 @@ struct KLZ4HC_CCtx_internal const KLZ4HC_CCtx_internal* dictCtx; }; - -/* Do not use these definitions directly ! - * Declare or allocate an KLZ4_streamHC_t instead. 
- */ -#define KLZ4_STREAMHCSIZE 262200 /* static size, for inter-version compatibility */ -#define KLZ4_STREAMHCSIZE_VOIDP (KLZ4_STREAMHCSIZE / sizeof(void*)) +#define KLZ4_STREAMHC_MINSIZE 262200 /* static size, for inter-version compatibility */ union KLZ4_streamHC_u { - void* table[KLZ4_STREAMHCSIZE_VOIDP]; + char minStateSize[KLZ4_STREAMHC_MINSIZE]; KLZ4HC_CCtx_internal internal_donotuse; }; /* previously typedef'd to KLZ4_streamHC_t */ /* KLZ4_streamHC_t : * This structure allows static allocation of KLZ4 HC streaming state. - * This can be used to allocate statically, on state, or as part of a larger structure. + * This can be used to allocate statically on stack, or as part of a larger structure. * * Such state **must** be initialized using KLZ4_initStreamHC() before first use. * @@ -244,7 +242,7 @@ union KLZ4_streamHC_u { * Required before first use of a statically allocated KLZ4_streamHC_t. * Before v1.9.0 : use KLZ4_resetStreamHC() instead */ -KLZ4LIB_API KLZ4_streamHC_t* KLZ4_initStreamHC (void* buffer, size_t size); +KLZ4LIB_API KLZ4_streamHC_t* KLZ4_initStreamHC(void* buffer, size_t size); /*-************************************ @@ -272,9 +270,11 @@ KLZ4_DEPRECATED("use KLZ4_compress_HC_continue() instead") KLZ4LIB_API int KLZ4_ * KLZ4_slideInputBufferHC() will truncate the history of the stream, rather * than preserve a window-sized chunk of history. */ +#if !defined(KLZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) KLZ4_DEPRECATED("use KLZ4_createStreamHC() instead") KLZ4LIB_API void* KLZ4_createHC (const char* inputBuffer); -KLZ4_DEPRECATED("use KLZ4_saveDictHC() instead") KLZ4LIB_API char* KLZ4_slideInputBufferHC (void* KLZ4HC_Data); KLZ4_DEPRECATED("use KLZ4_freeStreamHC() instead") KLZ4LIB_API int KLZ4_freeHC (void* KLZ4HC_Data); +#endif +KLZ4_DEPRECATED("use KLZ4_saveDictHC() instead") KLZ4LIB_API char* KLZ4_slideInputBufferHC (void* KLZ4HC_Data); KLZ4_DEPRECATED("use KLZ4_compress_HC_continue() instead") KLZ4LIB_API int KLZ4_compressHC2_continue (void* KLZ4HC_Data, const char* source, char* dest, int inputSize, int compressionLevel); KLZ4_DEPRECATED("use KLZ4_compress_HC_continue() instead") KLZ4LIB_API int KLZ4_compressHC2_limitedOutput_continue (void* KLZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel); KLZ4_DEPRECATED("use KLZ4_createStreamHC() instead") KLZ4LIB_API int KLZ4_sizeofStreamStateHC(void); @@ -305,7 +305,7 @@ KLZ4LIB_API void KLZ4_resetStreamHC (KLZ4_streamHC_t* streamHCPtr, int compressi * They should not be linked from DLL, * as there is no guarantee of API stability yet. * Prototypes will be promoted to "stable" status - * after successfull usage in real-life scenarios. + * after successful usage in real-life scenarios. ***************************************************/ #ifdef KLZ4_HC_STATIC_LINKING_ONLY /* protection macro */ #ifndef KLZ4_HC_SLO_098092834 diff --git a/src/third_party/librdkafka/dist/src/nanopb/pb.h b/src/third_party/librdkafka/dist/src/nanopb/pb.h new file mode 100644 index 00000000000..ef3d83e95a4 --- /dev/null +++ b/src/third_party/librdkafka/dist/src/nanopb/pb.h @@ -0,0 +1,917 @@ +/* Common parts of the nanopb library. Most of these are quite low-level + * stuff. For the high-level interface, see pb_encode.h and pb_decode.h. + */ + +#ifndef PB_H_INCLUDED +#define PB_H_INCLUDED + +/***************************************************************** + * Nanopb compilation time options. You can change these here by * + * uncommenting the lines, or on the compiler command line. 
* + *****************************************************************/ + +/* Enable support for dynamically allocated fields */ +/* #define PB_ENABLE_MALLOC 1 */ + +/* Define this if your CPU / compiler combination does not support + * unaligned memory access to packed structures. Note that packed + * structures are only used when requested in .proto options. */ +/* #define PB_NO_PACKED_STRUCTS 1 */ + +/* Increase the number of required fields that are tracked. + * A compiler warning will tell if you need this. */ +/* #define PB_MAX_REQUIRED_FIELDS 256 */ + +/* Add support for tag numbers > 65536 and fields larger than 65536 bytes. */ +/* #define PB_FIELD_32BIT 1 */ + +/* Disable support for error messages in order to save some code space. */ +/* #define PB_NO_ERRMSG 1 */ + +/* Disable support for custom streams (support only memory buffers). */ +/* #define PB_BUFFER_ONLY 1 */ + +/* Disable support for 64-bit datatypes, for compilers without int64_t + or to save some code space. */ +/* #define PB_WITHOUT_64BIT 1 */ + +/* Don't encode scalar arrays as packed. This is only to be used when + * the decoder on the receiving side cannot process packed scalar arrays. + * Such example is older protobuf.js. */ +/* #define PB_ENCODE_ARRAYS_UNPACKED 1 */ + +/* Enable conversion of doubles to floats for platforms that do not + * support 64-bit doubles. Most commonly AVR. */ +/* #define PB_CONVERT_DOUBLE_FLOAT 1 */ + +/* Check whether incoming strings are valid UTF-8 sequences. Slows down + * the string processing slightly and slightly increases code size. */ +/* #define PB_VALIDATE_UTF8 1 */ + +/* This can be defined if the platform is little-endian and has 8-bit bytes. + * Normally it is automatically detected based on __BYTE_ORDER__ macro. */ +/* #define PB_LITTLE_ENDIAN_8BIT 1 */ + +/* Configure static assert mechanism. Instead of changing these, set your + * compiler to C11 standard mode if possible. */ +/* #define PB_C99_STATIC_ASSERT 1 */ +/* #define PB_NO_STATIC_ASSERT 1 */ + +/****************************************************************** + * You usually don't need to change anything below this line. * + * Feel free to look around and use the defined macros, though. * + ******************************************************************/ + + +/* Version of the nanopb library. Just in case you want to check it in + * your own program. */ +#define NANOPB_VERSION "nanopb-0.4.8-dev" + +/* Include all the system headers needed by nanopb. You will need the + * definitions of the following: + * - strlen, memcpy, memset functions + * - [u]int_least8_t, uint_fast8_t, [u]int_least16_t, [u]int32_t, [u]int64_t + * - size_t + * - bool + * + * If you don't have the standard header files, you can instead provide + * a custom header that defines or includes all this. In that case, + * define PB_SYSTEM_HEADER to the path of this file. + */ +#ifdef PB_SYSTEM_HEADER +#include PB_SYSTEM_HEADER +#else +#include +#include +#include +#include +#include + +#ifdef PB_ENABLE_MALLOC +#include +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Macro for defining packed structures (compiler dependent). + * This just reduces memory requirements, but is not required. 
+ */ +#if defined(PB_NO_PACKED_STRUCTS) + /* Disable struct packing */ +# define PB_PACKED_STRUCT_START +# define PB_PACKED_STRUCT_END +# define pb_packed +#elif defined(__GNUC__) || defined(__clang__) + /* For GCC and clang */ +# define PB_PACKED_STRUCT_START +# define PB_PACKED_STRUCT_END +# define pb_packed __attribute__((packed)) +#elif defined(__ICCARM__) || defined(__CC_ARM) + /* For IAR ARM and Keil MDK-ARM compilers */ +# define PB_PACKED_STRUCT_START _Pragma("pack(push, 1)") +# define PB_PACKED_STRUCT_END _Pragma("pack(pop)") +# define pb_packed +#elif defined(_MSC_VER) && (_MSC_VER >= 1500) + /* For Microsoft Visual C++ */ +# define PB_PACKED_STRUCT_START __pragma(pack(push, 1)) +# define PB_PACKED_STRUCT_END __pragma(pack(pop)) +# define pb_packed +#else + /* Unknown compiler */ +# define PB_PACKED_STRUCT_START +# define PB_PACKED_STRUCT_END +# define pb_packed +#endif + +/* Detect endianness */ +#ifndef PB_LITTLE_ENDIAN_8BIT +#if ((defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || \ + defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || \ + defined(__THUMBEL__) || defined(__AARCH64EL__) || defined(_MIPSEL) || \ + defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM)) \ + && CHAR_BIT == 8 +#define PB_LITTLE_ENDIAN_8BIT 1 +#endif +#endif + +/* Handly macro for suppressing unreferenced-parameter compiler warnings. */ +#ifndef PB_UNUSED +#define PB_UNUSED(x) (void)(x) +#endif + +/* Harvard-architecture processors may need special attributes for storing + * field information in program memory. */ +#ifndef PB_PROGMEM +#ifdef __AVR__ +#include +#define PB_PROGMEM PROGMEM +#define PB_PROGMEM_READU32(x) pgm_read_dword(&x) +#else +#define PB_PROGMEM +#define PB_PROGMEM_READU32(x) (x) +#endif +#endif + +/* Compile-time assertion, used for checking compatible compilation options. + * If this does not work properly on your compiler, use + * #define PB_NO_STATIC_ASSERT to disable it. + * + * But before doing that, check carefully the error message / place where it + * comes from to see if the error has a real cause. Unfortunately the error + * message is not always very clear to read, but you can see the reason better + * in the place where the PB_STATIC_ASSERT macro was called. 
+ */ +#ifndef PB_NO_STATIC_ASSERT +# ifndef PB_STATIC_ASSERT +# if defined(__ICCARM__) + /* IAR has static_assert keyword but no _Static_assert */ +# define PB_STATIC_ASSERT(COND,MSG) static_assert(COND,#MSG); +# elif defined(_MSC_VER) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112) + /* MSVC in C89 mode supports static_assert() keyword anyway */ +# define PB_STATIC_ASSERT(COND,MSG) static_assert(COND,#MSG); +# elif defined(PB_C99_STATIC_ASSERT) + /* Classic negative-size-array static assert mechanism */ +# define PB_STATIC_ASSERT(COND,MSG) typedef char PB_STATIC_ASSERT_MSG(MSG, __LINE__, __COUNTER__)[(COND)?1:-1]; +# define PB_STATIC_ASSERT_MSG(MSG, LINE, COUNTER) PB_STATIC_ASSERT_MSG_(MSG, LINE, COUNTER) +# define PB_STATIC_ASSERT_MSG_(MSG, LINE, COUNTER) pb_static_assertion_##MSG##_##LINE##_##COUNTER +# elif defined(__cplusplus) + /* C++11 standard static_assert mechanism */ +# define PB_STATIC_ASSERT(COND,MSG) static_assert(COND,#MSG); +# else + /* C11 standard _Static_assert mechanism */ +# define PB_STATIC_ASSERT(COND,MSG) _Static_assert(COND,#MSG); +# endif +# endif +#else + /* Static asserts disabled by PB_NO_STATIC_ASSERT */ +# define PB_STATIC_ASSERT(COND,MSG) +#endif + +/* Test that PB_STATIC_ASSERT works + * If you get errors here, you may need to do one of these: + * - Enable C11 standard support in your compiler + * - Define PB_C99_STATIC_ASSERT to enable C99 standard support + * - Define PB_NO_STATIC_ASSERT to disable static asserts altogether + */ +PB_STATIC_ASSERT(1, STATIC_ASSERT_IS_NOT_WORKING) + +/* Number of required fields to keep track of. */ +#ifndef PB_MAX_REQUIRED_FIELDS +#define PB_MAX_REQUIRED_FIELDS 64 +#endif + +#if PB_MAX_REQUIRED_FIELDS < 64 +#error You should not lower PB_MAX_REQUIRED_FIELDS from the default value (64). +#endif + +#ifdef PB_WITHOUT_64BIT +#ifdef PB_CONVERT_DOUBLE_FLOAT +/* Cannot use doubles without 64-bit types */ +#undef PB_CONVERT_DOUBLE_FLOAT +#endif +#endif + +/* List of possible field types. These are used in the autogenerated code. + * Least-significant 4 bits tell the scalar type + * Most-significant 4 bits specify repeated/required/packed etc. + */ + +typedef uint_least8_t pb_type_t; + +/**** Field data types ****/ + +/* Numeric types */ +#define PB_LTYPE_BOOL 0x00U /* bool */ +#define PB_LTYPE_VARINT 0x01U /* int32, int64, enum, bool */ +#define PB_LTYPE_UVARINT 0x02U /* uint32, uint64 */ +#define PB_LTYPE_SVARINT 0x03U /* sint32, sint64 */ +#define PB_LTYPE_FIXED32 0x04U /* fixed32, sfixed32, float */ +#define PB_LTYPE_FIXED64 0x05U /* fixed64, sfixed64, double */ + +/* Marker for last packable field type. */ +#define PB_LTYPE_LAST_PACKABLE 0x05U + +/* Byte array with pre-allocated buffer. + * data_size is the length of the allocated PB_BYTES_ARRAY structure. */ +#define PB_LTYPE_BYTES 0x06U + +/* String with pre-allocated buffer. + * data_size is the maximum length. */ +#define PB_LTYPE_STRING 0x07U + +/* Submessage + * submsg_fields is pointer to field descriptions */ +#define PB_LTYPE_SUBMESSAGE 0x08U + +/* Submessage with pre-decoding callback + * The pre-decoding callback is stored as pb_callback_t right before pSize. + * submsg_fields is pointer to field descriptions */ +#define PB_LTYPE_SUBMSG_W_CB 0x09U + +/* Extension pseudo-field + * The field contains a pointer to pb_extension_t */ +#define PB_LTYPE_EXTENSION 0x0AU + +/* Byte array with inline, pre-allocated byffer. + * data_size is the length of the inline, allocated buffer. 
+ * This differs from PB_LTYPE_BYTES by defining the element as + * pb_byte_t[data_size] rather than pb_bytes_array_t. */ +#define PB_LTYPE_FIXED_LENGTH_BYTES 0x0BU + +/* Number of declared LTYPES */ +#define PB_LTYPES_COUNT 0x0CU +#define PB_LTYPE_MASK 0x0FU + +/**** Field repetition rules ****/ + +#define PB_HTYPE_REQUIRED 0x00U +#define PB_HTYPE_OPTIONAL 0x10U +#define PB_HTYPE_SINGULAR 0x10U +#define PB_HTYPE_REPEATED 0x20U +#define PB_HTYPE_FIXARRAY 0x20U +#define PB_HTYPE_ONEOF 0x30U +#define PB_HTYPE_MASK 0x30U + +/**** Field allocation types ****/ + +#define PB_ATYPE_STATIC 0x00U +#define PB_ATYPE_POINTER 0x80U +#define PB_ATYPE_CALLBACK 0x40U +#define PB_ATYPE_MASK 0xC0U + +#define PB_ATYPE(x) ((x) & PB_ATYPE_MASK) +#define PB_HTYPE(x) ((x) & PB_HTYPE_MASK) +#define PB_LTYPE(x) ((x) & PB_LTYPE_MASK) +#define PB_LTYPE_IS_SUBMSG(x) (PB_LTYPE(x) == PB_LTYPE_SUBMESSAGE || \ + PB_LTYPE(x) == PB_LTYPE_SUBMSG_W_CB) + +/* Data type used for storing sizes of struct fields + * and array counts. + */ +#if defined(PB_FIELD_32BIT) + typedef uint32_t pb_size_t; + typedef int32_t pb_ssize_t; +#else + typedef uint_least16_t pb_size_t; + typedef int_least16_t pb_ssize_t; +#endif +#define PB_SIZE_MAX ((pb_size_t)-1) + +/* Data type for storing encoded data and other byte streams. + * This typedef exists to support platforms where uint8_t does not exist. + * You can regard it as equivalent on uint8_t on other platforms. + */ +typedef uint_least8_t pb_byte_t; + +/* Forward declaration of struct types */ +typedef struct pb_istream_s pb_istream_t; +typedef struct pb_ostream_s pb_ostream_t; +typedef struct pb_field_iter_s pb_field_iter_t; + +/* This structure is used in auto-generated constants + * to specify struct fields. + */ +typedef struct pb_msgdesc_s pb_msgdesc_t; +struct pb_msgdesc_s { + const uint32_t *field_info; + const pb_msgdesc_t * const * submsg_info; + const pb_byte_t *default_value; + + bool (*field_callback)(pb_istream_t *istream, pb_ostream_t *ostream, const pb_field_iter_t *field); + + pb_size_t field_count; + pb_size_t required_field_count; + pb_size_t largest_tag; +}; + +/* Iterator for message descriptor */ +struct pb_field_iter_s { + const pb_msgdesc_t *descriptor; /* Pointer to message descriptor constant */ + void *message; /* Pointer to start of the structure */ + + pb_size_t index; /* Index of the field */ + pb_size_t field_info_index; /* Index to descriptor->field_info array */ + pb_size_t required_field_index; /* Index that counts only the required fields */ + pb_size_t submessage_index; /* Index that counts only submessages */ + + pb_size_t tag; /* Tag of current field */ + pb_size_t data_size; /* sizeof() of a single item */ + pb_size_t array_size; /* Number of array entries */ + pb_type_t type; /* Type of current field */ + + void *pField; /* Pointer to current field in struct */ + void *pData; /* Pointer to current data contents. Different than pField for arrays and pointers. */ + void *pSize; /* Pointer to count/has field */ + + const pb_msgdesc_t *submsg_desc; /* For submessage fields, pointer to field descriptor for the submessage. */ +}; + +/* For compatibility with legacy code */ +typedef pb_field_iter_t pb_field_t; + +/* Make sure that the standard integer types are of the expected sizes. + * Otherwise fixed32/fixed64 fields can break. + * + * If you get errors here, it probably means that your stdint.h is not + * correct for your platform. 
+ */ +#ifndef PB_WITHOUT_64BIT +PB_STATIC_ASSERT(sizeof(int64_t) == 2 * sizeof(int32_t), INT64_T_WRONG_SIZE) +PB_STATIC_ASSERT(sizeof(uint64_t) == 2 * sizeof(uint32_t), UINT64_T_WRONG_SIZE) +#endif + +/* This structure is used for 'bytes' arrays. + * It has the number of bytes in the beginning, and after that an array. + * Note that actual structs used will have a different length of bytes array. + */ +#define PB_BYTES_ARRAY_T(n) struct { pb_size_t size; pb_byte_t bytes[n]; } +#define PB_BYTES_ARRAY_T_ALLOCSIZE(n) ((size_t)n + offsetof(pb_bytes_array_t, bytes)) + +struct pb_bytes_array_s { + pb_size_t size; + pb_byte_t bytes[1]; +}; +typedef struct pb_bytes_array_s pb_bytes_array_t; + +/* This structure is used for giving the callback function. + * It is stored in the message structure and filled in by the method that + * calls pb_decode. + * + * The decoding callback will be given a limited-length stream + * If the wire type was string, the length is the length of the string. + * If the wire type was a varint/fixed32/fixed64, the length is the length + * of the actual value. + * The function may be called multiple times (especially for repeated types, + * but also otherwise if the message happens to contain the field multiple + * times.) + * + * The encoding callback will receive the actual output stream. + * It should write all the data in one call, including the field tag and + * wire type. It can write multiple fields. + * + * The callback can be null if you want to skip a field. + */ +typedef struct pb_callback_s pb_callback_t; +struct pb_callback_s { + /* Callback functions receive a pointer to the arg field. + * You can access the value of the field as *arg, and modify it if needed. + */ + union { + bool (*decode)(pb_istream_t *stream, const pb_field_t *field, void **arg); + bool (*encode)(pb_ostream_t *stream, const pb_field_t *field, void * const *arg); + } funcs; + + /* Free arg for use by callback */ + void *arg; +}; + +extern bool pb_default_field_callback(pb_istream_t *istream, pb_ostream_t *ostream, const pb_field_t *field); + +/* Wire types. Library user needs these only in encoder callbacks. */ +typedef enum { + PB_WT_VARINT = 0, + PB_WT_64BIT = 1, + PB_WT_STRING = 2, + PB_WT_32BIT = 5, + PB_WT_PACKED = 255 /* PB_WT_PACKED is internal marker for packed arrays. */ +} pb_wire_type_t; + +/* Structure for defining the handling of unknown/extension fields. + * Usually the pb_extension_type_t structure is automatically generated, + * while the pb_extension_t structure is created by the user. However, + * if you want to catch all unknown fields, you can also create a custom + * pb_extension_type_t with your own callback. + */ +typedef struct pb_extension_type_s pb_extension_type_t; +typedef struct pb_extension_s pb_extension_t; +struct pb_extension_type_s { + /* Called for each unknown field in the message. + * If you handle the field, read off all of its data and return true. + * If you do not handle the field, do not read anything and return true. + * If you run into an error, return false. + * Set to NULL for default handler. + */ + bool (*decode)(pb_istream_t *stream, pb_extension_t *extension, + uint32_t tag, pb_wire_type_t wire_type); + + /* Called once after all regular fields have been encoded. + * If you have something to write, do so and return true. + * If you do not have anything to write, just return true. + * If you run into an error, return false. + * Set to NULL for default handler. 
+ */ + bool (*encode)(pb_ostream_t *stream, const pb_extension_t *extension); + + /* Free field for use by the callback. */ + const void *arg; +}; + +struct pb_extension_s { + /* Type describing the extension field. Usually you'll initialize + * this to a pointer to the automatically generated structure. */ + const pb_extension_type_t *type; + + /* Destination for the decoded data. This must match the datatype + * of the extension field. */ + void *dest; + + /* Pointer to the next extension handler, or NULL. + * If this extension does not match a field, the next handler is + * automatically called. */ + pb_extension_t *next; + + /* The decoder sets this to true if the extension was found. + * Ignored for encoding. */ + bool found; +}; + +#define pb_extension_init_zero {NULL,NULL,NULL,false} + +/* Memory allocation functions to use. You can define pb_realloc and + * pb_free to custom functions if you want. */ +#ifdef PB_ENABLE_MALLOC +# ifndef pb_realloc +# define pb_realloc(ptr, size) realloc(ptr, size) +# endif +# ifndef pb_free +# define pb_free(ptr) free(ptr) +# endif +#endif + +/* This is used to inform about need to regenerate .pb.h/.pb.c files. */ +#define PB_PROTO_HEADER_VERSION 40 + +/* These macros are used to declare pb_field_t's in the constant array. */ +/* Size of a structure member, in bytes. */ +#define pb_membersize(st, m) (sizeof ((st*)0)->m) +/* Number of entries in an array. */ +#define pb_arraysize(st, m) (pb_membersize(st, m) / pb_membersize(st, m[0])) +/* Delta from start of one member to the start of another member. */ +#define pb_delta(st, m1, m2) ((int)offsetof(st, m1) - (int)offsetof(st, m2)) + +/* Force expansion of macro value */ +#define PB_EXPAND(x) x + +/* Binding of a message field set into a specific structure */ +#define PB_BIND(msgname, structname, width) \ + const uint32_t structname ## _field_info[] PB_PROGMEM = \ + { \ + msgname ## _FIELDLIST(PB_GEN_FIELD_INFO_ ## width, structname) \ + 0 \ + }; \ + const pb_msgdesc_t* const structname ## _submsg_info[] = \ + { \ + msgname ## _FIELDLIST(PB_GEN_SUBMSG_INFO, structname) \ + NULL \ + }; \ + const pb_msgdesc_t structname ## _msg = \ + { \ + structname ## _field_info, \ + structname ## _submsg_info, \ + msgname ## _DEFAULT, \ + msgname ## _CALLBACK, \ + 0 msgname ## _FIELDLIST(PB_GEN_FIELD_COUNT, structname), \ + 0 msgname ## _FIELDLIST(PB_GEN_REQ_FIELD_COUNT, structname), \ + 0 msgname ## _FIELDLIST(PB_GEN_LARGEST_TAG, structname), \ + }; \ + msgname ## _FIELDLIST(PB_GEN_FIELD_INFO_ASSERT_ ## width, structname) + +#define PB_GEN_FIELD_COUNT(structname, atype, htype, ltype, fieldname, tag) +1 +#define PB_GEN_REQ_FIELD_COUNT(structname, atype, htype, ltype, fieldname, tag) \ + + (PB_HTYPE_ ## htype == PB_HTYPE_REQUIRED) +#define PB_GEN_LARGEST_TAG(structname, atype, htype, ltype, fieldname, tag) \ + * 0 + tag + +/* X-macro for generating the entries in struct_field_info[] array. 
*/ +#define PB_GEN_FIELD_INFO_1(structname, atype, htype, ltype, fieldname, tag) \ + PB_FIELDINFO_1(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \ + PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname)) + +#define PB_GEN_FIELD_INFO_2(structname, atype, htype, ltype, fieldname, tag) \ + PB_FIELDINFO_2(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \ + PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname)) + +#define PB_GEN_FIELD_INFO_4(structname, atype, htype, ltype, fieldname, tag) \ + PB_FIELDINFO_4(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \ + PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname)) + +#define PB_GEN_FIELD_INFO_8(structname, atype, htype, ltype, fieldname, tag) \ + PB_FIELDINFO_8(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \ + PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname)) + +#define PB_GEN_FIELD_INFO_AUTO(structname, atype, htype, ltype, fieldname, tag) \ + PB_FIELDINFO_AUTO2(PB_FIELDINFO_WIDTH_AUTO(_PB_ATYPE_ ## atype, _PB_HTYPE_ ## htype, _PB_LTYPE_ ## ltype), \ + tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \ + PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname)) + +#define PB_FIELDINFO_AUTO2(width, tag, type, data_offset, data_size, size_offset, array_size) \ + PB_FIELDINFO_AUTO3(width, tag, type, data_offset, data_size, size_offset, array_size) + +#define PB_FIELDINFO_AUTO3(width, tag, type, data_offset, data_size, size_offset, array_size) \ + PB_FIELDINFO_ ## width(tag, type, data_offset, data_size, size_offset, array_size) + +/* X-macro for generating asserts that entries fit in struct_field_info[] array. + * The structure of macros here must match the structure above in PB_GEN_FIELD_INFO_x(), + * but it is not easily reused because of how macro substitutions work. 
*/ +#define PB_GEN_FIELD_INFO_ASSERT_1(structname, atype, htype, ltype, fieldname, tag) \ + PB_FIELDINFO_ASSERT_1(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \ + PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname)) + +#define PB_GEN_FIELD_INFO_ASSERT_2(structname, atype, htype, ltype, fieldname, tag) \ + PB_FIELDINFO_ASSERT_2(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \ + PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname)) + +#define PB_GEN_FIELD_INFO_ASSERT_4(structname, atype, htype, ltype, fieldname, tag) \ + PB_FIELDINFO_ASSERT_4(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \ + PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname)) + +#define PB_GEN_FIELD_INFO_ASSERT_8(structname, atype, htype, ltype, fieldname, tag) \ + PB_FIELDINFO_ASSERT_8(tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \ + PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname)) + +#define PB_GEN_FIELD_INFO_ASSERT_AUTO(structname, atype, htype, ltype, fieldname, tag) \ + PB_FIELDINFO_ASSERT_AUTO2(PB_FIELDINFO_WIDTH_AUTO(_PB_ATYPE_ ## atype, _PB_HTYPE_ ## htype, _PB_LTYPE_ ## ltype), \ + tag, PB_ATYPE_ ## atype | PB_HTYPE_ ## htype | PB_LTYPE_MAP_ ## ltype, \ + PB_DATA_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_DATA_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_SIZE_OFFSET_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname), \ + PB_ARRAY_SIZE_ ## atype(_PB_HTYPE_ ## htype, structname, fieldname)) + +#define PB_FIELDINFO_ASSERT_AUTO2(width, tag, type, data_offset, data_size, size_offset, array_size) \ + PB_FIELDINFO_ASSERT_AUTO3(width, tag, type, data_offset, data_size, size_offset, array_size) + +#define PB_FIELDINFO_ASSERT_AUTO3(width, tag, type, data_offset, data_size, size_offset, array_size) \ + PB_FIELDINFO_ASSERT_ ## width(tag, type, data_offset, data_size, size_offset, array_size) + +#define PB_DATA_OFFSET_STATIC(htype, structname, fieldname) PB_DO ## htype(structname, fieldname) +#define PB_DATA_OFFSET_POINTER(htype, structname, fieldname) PB_DO ## htype(structname, fieldname) +#define PB_DATA_OFFSET_CALLBACK(htype, structname, fieldname) PB_DO ## htype(structname, fieldname) +#define PB_DO_PB_HTYPE_REQUIRED(structname, fieldname) offsetof(structname, fieldname) +#define PB_DO_PB_HTYPE_SINGULAR(structname, fieldname) offsetof(structname, fieldname) +#define PB_DO_PB_HTYPE_ONEOF(structname, fieldname) offsetof(structname, PB_ONEOF_NAME(FULL, fieldname)) +#define PB_DO_PB_HTYPE_OPTIONAL(structname, fieldname) offsetof(structname, 
fieldname) +#define PB_DO_PB_HTYPE_REPEATED(structname, fieldname) offsetof(structname, fieldname) +#define PB_DO_PB_HTYPE_FIXARRAY(structname, fieldname) offsetof(structname, fieldname) + +#define PB_SIZE_OFFSET_STATIC(htype, structname, fieldname) PB_SO ## htype(structname, fieldname) +#define PB_SIZE_OFFSET_POINTER(htype, structname, fieldname) PB_SO_PTR ## htype(structname, fieldname) +#define PB_SIZE_OFFSET_CALLBACK(htype, structname, fieldname) PB_SO_CB ## htype(structname, fieldname) +#define PB_SO_PB_HTYPE_REQUIRED(structname, fieldname) 0 +#define PB_SO_PB_HTYPE_SINGULAR(structname, fieldname) 0 +#define PB_SO_PB_HTYPE_ONEOF(structname, fieldname) PB_SO_PB_HTYPE_ONEOF2(structname, PB_ONEOF_NAME(FULL, fieldname), PB_ONEOF_NAME(UNION, fieldname)) +#define PB_SO_PB_HTYPE_ONEOF2(structname, fullname, unionname) PB_SO_PB_HTYPE_ONEOF3(structname, fullname, unionname) +#define PB_SO_PB_HTYPE_ONEOF3(structname, fullname, unionname) pb_delta(structname, fullname, which_ ## unionname) +#define PB_SO_PB_HTYPE_OPTIONAL(structname, fieldname) pb_delta(structname, fieldname, has_ ## fieldname) +#define PB_SO_PB_HTYPE_REPEATED(structname, fieldname) pb_delta(structname, fieldname, fieldname ## _count) +#define PB_SO_PB_HTYPE_FIXARRAY(structname, fieldname) 0 +#define PB_SO_PTR_PB_HTYPE_REQUIRED(structname, fieldname) 0 +#define PB_SO_PTR_PB_HTYPE_SINGULAR(structname, fieldname) 0 +#define PB_SO_PTR_PB_HTYPE_ONEOF(structname, fieldname) PB_SO_PB_HTYPE_ONEOF(structname, fieldname) +#define PB_SO_PTR_PB_HTYPE_OPTIONAL(structname, fieldname) 0 +#define PB_SO_PTR_PB_HTYPE_REPEATED(structname, fieldname) PB_SO_PB_HTYPE_REPEATED(structname, fieldname) +#define PB_SO_PTR_PB_HTYPE_FIXARRAY(structname, fieldname) 0 +#define PB_SO_CB_PB_HTYPE_REQUIRED(structname, fieldname) 0 +#define PB_SO_CB_PB_HTYPE_SINGULAR(structname, fieldname) 0 +#define PB_SO_CB_PB_HTYPE_ONEOF(structname, fieldname) PB_SO_PB_HTYPE_ONEOF(structname, fieldname) +#define PB_SO_CB_PB_HTYPE_OPTIONAL(structname, fieldname) 0 +#define PB_SO_CB_PB_HTYPE_REPEATED(structname, fieldname) 0 +#define PB_SO_CB_PB_HTYPE_FIXARRAY(structname, fieldname) 0 + +#define PB_ARRAY_SIZE_STATIC(htype, structname, fieldname) PB_AS ## htype(structname, fieldname) +#define PB_ARRAY_SIZE_POINTER(htype, structname, fieldname) PB_AS_PTR ## htype(structname, fieldname) +#define PB_ARRAY_SIZE_CALLBACK(htype, structname, fieldname) 1 +#define PB_AS_PB_HTYPE_REQUIRED(structname, fieldname) 1 +#define PB_AS_PB_HTYPE_SINGULAR(structname, fieldname) 1 +#define PB_AS_PB_HTYPE_OPTIONAL(structname, fieldname) 1 +#define PB_AS_PB_HTYPE_ONEOF(structname, fieldname) 1 +#define PB_AS_PB_HTYPE_REPEATED(structname, fieldname) pb_arraysize(structname, fieldname) +#define PB_AS_PB_HTYPE_FIXARRAY(structname, fieldname) pb_arraysize(structname, fieldname) +#define PB_AS_PTR_PB_HTYPE_REQUIRED(structname, fieldname) 1 +#define PB_AS_PTR_PB_HTYPE_SINGULAR(structname, fieldname) 1 +#define PB_AS_PTR_PB_HTYPE_OPTIONAL(structname, fieldname) 1 +#define PB_AS_PTR_PB_HTYPE_ONEOF(structname, fieldname) 1 +#define PB_AS_PTR_PB_HTYPE_REPEATED(structname, fieldname) 1 +#define PB_AS_PTR_PB_HTYPE_FIXARRAY(structname, fieldname) pb_arraysize(structname, fieldname[0]) + +#define PB_DATA_SIZE_STATIC(htype, structname, fieldname) PB_DS ## htype(structname, fieldname) +#define PB_DATA_SIZE_POINTER(htype, structname, fieldname) PB_DS_PTR ## htype(structname, fieldname) +#define PB_DATA_SIZE_CALLBACK(htype, structname, fieldname) PB_DS_CB ## htype(structname, fieldname) +#define 
PB_DS_PB_HTYPE_REQUIRED(structname, fieldname) pb_membersize(structname, fieldname) +#define PB_DS_PB_HTYPE_SINGULAR(structname, fieldname) pb_membersize(structname, fieldname) +#define PB_DS_PB_HTYPE_OPTIONAL(structname, fieldname) pb_membersize(structname, fieldname) +#define PB_DS_PB_HTYPE_ONEOF(structname, fieldname) pb_membersize(structname, PB_ONEOF_NAME(FULL, fieldname)) +#define PB_DS_PB_HTYPE_REPEATED(structname, fieldname) pb_membersize(structname, fieldname[0]) +#define PB_DS_PB_HTYPE_FIXARRAY(structname, fieldname) pb_membersize(structname, fieldname[0]) +#define PB_DS_PTR_PB_HTYPE_REQUIRED(structname, fieldname) pb_membersize(structname, fieldname[0]) +#define PB_DS_PTR_PB_HTYPE_SINGULAR(structname, fieldname) pb_membersize(structname, fieldname[0]) +#define PB_DS_PTR_PB_HTYPE_OPTIONAL(structname, fieldname) pb_membersize(structname, fieldname[0]) +#define PB_DS_PTR_PB_HTYPE_ONEOF(structname, fieldname) pb_membersize(structname, PB_ONEOF_NAME(FULL, fieldname)[0]) +#define PB_DS_PTR_PB_HTYPE_REPEATED(structname, fieldname) pb_membersize(structname, fieldname[0]) +#define PB_DS_PTR_PB_HTYPE_FIXARRAY(structname, fieldname) pb_membersize(structname, fieldname[0][0]) +#define PB_DS_CB_PB_HTYPE_REQUIRED(structname, fieldname) pb_membersize(structname, fieldname) +#define PB_DS_CB_PB_HTYPE_SINGULAR(structname, fieldname) pb_membersize(structname, fieldname) +#define PB_DS_CB_PB_HTYPE_OPTIONAL(structname, fieldname) pb_membersize(structname, fieldname) +#define PB_DS_CB_PB_HTYPE_ONEOF(structname, fieldname) pb_membersize(structname, PB_ONEOF_NAME(FULL, fieldname)) +#define PB_DS_CB_PB_HTYPE_REPEATED(structname, fieldname) pb_membersize(structname, fieldname) +#define PB_DS_CB_PB_HTYPE_FIXARRAY(structname, fieldname) pb_membersize(structname, fieldname) + +#define PB_ONEOF_NAME(type, tuple) PB_EXPAND(PB_ONEOF_NAME_ ## type tuple) +#define PB_ONEOF_NAME_UNION(unionname,membername,fullname) unionname +#define PB_ONEOF_NAME_MEMBER(unionname,membername,fullname) membername +#define PB_ONEOF_NAME_FULL(unionname,membername,fullname) fullname + +#define PB_GEN_SUBMSG_INFO(structname, atype, htype, ltype, fieldname, tag) \ + PB_SUBMSG_INFO_ ## htype(_PB_LTYPE_ ## ltype, structname, fieldname) + +#define PB_SUBMSG_INFO_REQUIRED(ltype, structname, fieldname) PB_SI ## ltype(structname ## _ ## fieldname ## _MSGTYPE) +#define PB_SUBMSG_INFO_SINGULAR(ltype, structname, fieldname) PB_SI ## ltype(structname ## _ ## fieldname ## _MSGTYPE) +#define PB_SUBMSG_INFO_OPTIONAL(ltype, structname, fieldname) PB_SI ## ltype(structname ## _ ## fieldname ## _MSGTYPE) +#define PB_SUBMSG_INFO_ONEOF(ltype, structname, fieldname) PB_SUBMSG_INFO_ONEOF2(ltype, structname, PB_ONEOF_NAME(UNION, fieldname), PB_ONEOF_NAME(MEMBER, fieldname)) +#define PB_SUBMSG_INFO_ONEOF2(ltype, structname, unionname, membername) PB_SUBMSG_INFO_ONEOF3(ltype, structname, unionname, membername) +#define PB_SUBMSG_INFO_ONEOF3(ltype, structname, unionname, membername) PB_SI ## ltype(structname ## _ ## unionname ## _ ## membername ## _MSGTYPE) +#define PB_SUBMSG_INFO_REPEATED(ltype, structname, fieldname) PB_SI ## ltype(structname ## _ ## fieldname ## _MSGTYPE) +#define PB_SUBMSG_INFO_FIXARRAY(ltype, structname, fieldname) PB_SI ## ltype(structname ## _ ## fieldname ## _MSGTYPE) +#define PB_SI_PB_LTYPE_BOOL(t) +#define PB_SI_PB_LTYPE_BYTES(t) +#define PB_SI_PB_LTYPE_DOUBLE(t) +#define PB_SI_PB_LTYPE_ENUM(t) +#define PB_SI_PB_LTYPE_UENUM(t) +#define PB_SI_PB_LTYPE_FIXED32(t) +#define PB_SI_PB_LTYPE_FIXED64(t) +#define PB_SI_PB_LTYPE_FLOAT(t) 
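
These PB_DATA_SIZE_*/PB_SI_* helpers exist to serve PB_BIND() above. To make the machinery concrete, here is a sketch of what a nanopb generator's output for a one-field message could look like; the Example name, field, and tag are invented for illustration, and real generated files also emit init macros and size constants:

    /* example.pb.h (hypothetical; includes nanopb/pb.h) */
    typedef struct _Example {
        int32_t value;
    } Example;

    /* X(arg, allocation, repetition, ltype, fieldname, tag) */
    #define Example_FIELDLIST(X, a) \
        X(a, STATIC, SINGULAR, INT32, value, 1)
    #define Example_CALLBACK NULL
    #define Example_DEFAULT NULL

    extern const pb_msgdesc_t Example_msg;
    #define Example_fields &Example_msg

    /* example.pb.c (hypothetical) */
    PB_BIND(Example, Example, AUTO)

Expanding PB_BIND here produces Example_field_info[], Example_submsg_info[], and the Example_msg descriptor that the field iterator in pb_common.c later walks.
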
+#define PB_SI_PB_LTYPE_INT32(t)
+#define PB_SI_PB_LTYPE_INT64(t)
+#define PB_SI_PB_LTYPE_MESSAGE(t)  PB_SUBMSG_DESCRIPTOR(t)
+#define PB_SI_PB_LTYPE_MSG_W_CB(t) PB_SUBMSG_DESCRIPTOR(t)
+#define PB_SI_PB_LTYPE_SFIXED32(t)
+#define PB_SI_PB_LTYPE_SFIXED64(t)
+#define PB_SI_PB_LTYPE_SINT32(t)
+#define PB_SI_PB_LTYPE_SINT64(t)
+#define PB_SI_PB_LTYPE_STRING(t)
+#define PB_SI_PB_LTYPE_UINT32(t)
+#define PB_SI_PB_LTYPE_UINT64(t)
+#define PB_SI_PB_LTYPE_EXTENSION(t)
+#define PB_SI_PB_LTYPE_FIXED_LENGTH_BYTES(t)
+#define PB_SUBMSG_DESCRIPTOR(t)    &(t ## _msg),
+
+/* The field descriptors use a variable width format, with width of either
+ * 1, 2, 4 or 8 of 32-bit words. The two lowest bytes of the first byte always
+ * encode the descriptor size, 6 lowest bits of field tag number, and 8 bits
+ * of the field type.
+ *
+ * Descriptor size is encoded as 0 = 1 word, 1 = 2 words, 2 = 4 words, 3 = 8 words.
+ *
+ * Formats, listed starting with the least significant bit of the first word.
+ * 1 word:  [2-bit len] [6-bit tag] [8-bit type] [8-bit data_offset] [4-bit size_offset] [4-bit data_size]
+ *
+ * 2 words: [2-bit len] [6-bit tag] [8-bit type] [12-bit array_size] [4-bit size_offset]
+ *          [16-bit data_offset] [12-bit data_size] [4-bit tag>>6]
+ *
+ * 4 words: [2-bit len] [6-bit tag] [8-bit type] [16-bit array_size]
+ *          [8-bit size_offset] [24-bit tag>>6]
+ *          [32-bit data_offset]
+ *          [32-bit data_size]
+ *
+ * 8 words: [2-bit len] [6-bit tag] [8-bit type] [16-bit reserved]
+ *          [8-bit size_offset] [24-bit tag>>6]
+ *          [32-bit data_offset]
+ *          [32-bit data_size]
+ *          [32-bit array_size]
+ *          [32-bit reserved]
+ *          [32-bit reserved]
+ *          [32-bit reserved]
+ */
+
+#define PB_FIELDINFO_1(tag, type, data_offset, data_size, size_offset, array_size) \
+    (0 | (((tag) << 2) & 0xFF) | ((type) << 8) | (((uint32_t)(data_offset) & 0xFF) << 16) | \
+     (((uint32_t)(size_offset) & 0x0F) << 24) | (((uint32_t)(data_size) & 0x0F) << 28)),
+
+#define PB_FIELDINFO_2(tag, type, data_offset, data_size, size_offset, array_size) \
+    (1 | (((tag) << 2) & 0xFF) | ((type) << 8) | (((uint32_t)(array_size) & 0xFFF) << 16) | (((uint32_t)(size_offset) & 0x0F) << 28)), \
+    (((uint32_t)(data_offset) & 0xFFFF) | (((uint32_t)(data_size) & 0xFFF) << 16) | (((uint32_t)(tag) & 0x3c0) << 22)),
+
+#define PB_FIELDINFO_4(tag, type, data_offset, data_size, size_offset, array_size) \
+    (2 | (((tag) << 2) & 0xFF) | ((type) << 8) | (((uint32_t)(array_size) & 0xFFFF) << 16)), \
+    ((uint32_t)(int_least8_t)(size_offset) | (((uint32_t)(tag) << 2) & 0xFFFFFF00)), \
+    (data_offset), (data_size),
+
+#define PB_FIELDINFO_8(tag, type, data_offset, data_size, size_offset, array_size) \
+    (3 | (((tag) << 2) & 0xFF) | ((type) << 8)), \
+    ((uint32_t)(int_least8_t)(size_offset) | (((uint32_t)(tag) << 2) & 0xFFFFFF00)), \
+    (data_offset), (data_size), (array_size), 0, 0, 0,
+
+/* These assertions verify that the field information fits in the allocated space.
+ * The generator tries to automatically determine the correct width that can fit all
+ * data associated with a message. These asserts will fail only if there has been a
+ * problem in the automatic logic - this may be worth reporting as a bug. As a workaround,
+ * you can increase the descriptor width by defining PB_FIELDINFO_WIDTH or by setting
+ * descriptorsize option in .options file.
+ */
+#define PB_FITS(value,bits) ((uint32_t)(value) < ((uint32_t)1<<bits))
+#define PB_FIELDINFO_ASSERT_1(tag, type, data_offset, data_size, size_offset, array_size) \
+    PB_STATIC_ASSERT(PB_FITS(tag,6) && PB_FITS(data_offset,8) && PB_FITS(size_offset,4) && PB_FITS(data_size,4) && PB_FITS(array_size,1), FIELDINFO_DOES_NOT_FIT_width1_field ## tag)
+
+#define PB_FIELDINFO_ASSERT_2(tag, type, data_offset, data_size, size_offset, array_size) \
+    PB_STATIC_ASSERT(PB_FITS(tag,10) && PB_FITS(data_offset,16) && PB_FITS(size_offset,4) && PB_FITS(data_size,12) && PB_FITS(array_size,12), FIELDINFO_DOES_NOT_FIT_width2_field ## tag)
+
+#ifndef PB_FIELD_32BIT
+/* Maximum field sizes are still 16-bit if pb_size_t is 16-bit */
+#define PB_FIELDINFO_ASSERT_4(tag, type, data_offset, data_size, size_offset, array_size) \
+    PB_STATIC_ASSERT(PB_FITS(tag,16) && PB_FITS(data_offset,16) && PB_FITS((int_least8_t)size_offset,8) && PB_FITS(data_size,16) && PB_FITS(array_size,16), FIELDINFO_DOES_NOT_FIT_width4_field ## tag)
+
+#define PB_FIELDINFO_ASSERT_8(tag, type, data_offset, data_size, size_offset, array_size) \
+    PB_STATIC_ASSERT(PB_FITS(tag,16) && PB_FITS(data_offset,16) && PB_FITS((int_least8_t)size_offset,8) && PB_FITS(data_size,16) && PB_FITS(array_size,16), FIELDINFO_DOES_NOT_FIT_width8_field ## tag)
+#else
+/* Up to 31-bit fields are supported by the 4- and 8-word descriptors.
+ * The checks are against 31 bits instead of 32 to avoid shift-width
+ * warnings, and because we can't handle >2GB messages with nanopb anyway.
+ */ +#define PB_FIELDINFO_ASSERT_4(tag, type, data_offset, data_size, size_offset, array_size) \ + PB_STATIC_ASSERT(PB_FITS(tag,30) && PB_FITS(data_offset,31) && PB_FITS(size_offset,8) && PB_FITS(data_size,31) && PB_FITS(array_size,16), FIELDINFO_DOES_NOT_FIT_width4_field ## tag) + +#define PB_FIELDINFO_ASSERT_8(tag, type, data_offset, data_size, size_offset, array_size) \ + PB_STATIC_ASSERT(PB_FITS(tag,30) && PB_FITS(data_offset,31) && PB_FITS(size_offset,8) && PB_FITS(data_size,31) && PB_FITS(array_size,31), FIELDINFO_DOES_NOT_FIT_width8_field ## tag) +#endif + + +/* Automatic picking of FIELDINFO width: + * Uses width 1 when possible, otherwise resorts to width 2. + * This is used when PB_BIND() is called with "AUTO" as the argument. + * The generator will give explicit size argument when it knows that a message + * structure grows beyond 1-word format limits. + */ +#define PB_FIELDINFO_WIDTH_AUTO(atype, htype, ltype) PB_FI_WIDTH ## atype(htype, ltype) +#define PB_FI_WIDTH_PB_ATYPE_STATIC(htype, ltype) PB_FI_WIDTH ## htype(ltype) +#define PB_FI_WIDTH_PB_ATYPE_POINTER(htype, ltype) PB_FI_WIDTH ## htype(ltype) +#define PB_FI_WIDTH_PB_ATYPE_CALLBACK(htype, ltype) 2 +#define PB_FI_WIDTH_PB_HTYPE_REQUIRED(ltype) PB_FI_WIDTH ## ltype +#define PB_FI_WIDTH_PB_HTYPE_SINGULAR(ltype) PB_FI_WIDTH ## ltype +#define PB_FI_WIDTH_PB_HTYPE_OPTIONAL(ltype) PB_FI_WIDTH ## ltype +#define PB_FI_WIDTH_PB_HTYPE_ONEOF(ltype) PB_FI_WIDTH ## ltype +#define PB_FI_WIDTH_PB_HTYPE_REPEATED(ltype) 2 +#define PB_FI_WIDTH_PB_HTYPE_FIXARRAY(ltype) 2 +#define PB_FI_WIDTH_PB_LTYPE_BOOL 1 +#define PB_FI_WIDTH_PB_LTYPE_BYTES 2 +#define PB_FI_WIDTH_PB_LTYPE_DOUBLE 1 +#define PB_FI_WIDTH_PB_LTYPE_ENUM 1 +#define PB_FI_WIDTH_PB_LTYPE_UENUM 1 +#define PB_FI_WIDTH_PB_LTYPE_FIXED32 1 +#define PB_FI_WIDTH_PB_LTYPE_FIXED64 1 +#define PB_FI_WIDTH_PB_LTYPE_FLOAT 1 +#define PB_FI_WIDTH_PB_LTYPE_INT32 1 +#define PB_FI_WIDTH_PB_LTYPE_INT64 1 +#define PB_FI_WIDTH_PB_LTYPE_MESSAGE 2 +#define PB_FI_WIDTH_PB_LTYPE_MSG_W_CB 2 +#define PB_FI_WIDTH_PB_LTYPE_SFIXED32 1 +#define PB_FI_WIDTH_PB_LTYPE_SFIXED64 1 +#define PB_FI_WIDTH_PB_LTYPE_SINT32 1 +#define PB_FI_WIDTH_PB_LTYPE_SINT64 1 +#define PB_FI_WIDTH_PB_LTYPE_STRING 2 +#define PB_FI_WIDTH_PB_LTYPE_UINT32 1 +#define PB_FI_WIDTH_PB_LTYPE_UINT64 1 +#define PB_FI_WIDTH_PB_LTYPE_EXTENSION 1 +#define PB_FI_WIDTH_PB_LTYPE_FIXED_LENGTH_BYTES 2 + +/* The mapping from protobuf types to LTYPEs is done using these macros. */ +#define PB_LTYPE_MAP_BOOL PB_LTYPE_BOOL +#define PB_LTYPE_MAP_BYTES PB_LTYPE_BYTES +#define PB_LTYPE_MAP_DOUBLE PB_LTYPE_FIXED64 +#define PB_LTYPE_MAP_ENUM PB_LTYPE_VARINT +#define PB_LTYPE_MAP_UENUM PB_LTYPE_UVARINT +#define PB_LTYPE_MAP_FIXED32 PB_LTYPE_FIXED32 +#define PB_LTYPE_MAP_FIXED64 PB_LTYPE_FIXED64 +#define PB_LTYPE_MAP_FLOAT PB_LTYPE_FIXED32 +#define PB_LTYPE_MAP_INT32 PB_LTYPE_VARINT +#define PB_LTYPE_MAP_INT64 PB_LTYPE_VARINT +#define PB_LTYPE_MAP_MESSAGE PB_LTYPE_SUBMESSAGE +#define PB_LTYPE_MAP_MSG_W_CB PB_LTYPE_SUBMSG_W_CB +#define PB_LTYPE_MAP_SFIXED32 PB_LTYPE_FIXED32 +#define PB_LTYPE_MAP_SFIXED64 PB_LTYPE_FIXED64 +#define PB_LTYPE_MAP_SINT32 PB_LTYPE_SVARINT +#define PB_LTYPE_MAP_SINT64 PB_LTYPE_SVARINT +#define PB_LTYPE_MAP_STRING PB_LTYPE_STRING +#define PB_LTYPE_MAP_UINT32 PB_LTYPE_UVARINT +#define PB_LTYPE_MAP_UINT64 PB_LTYPE_UVARINT +#define PB_LTYPE_MAP_EXTENSION PB_LTYPE_EXTENSION +#define PB_LTYPE_MAP_FIXED_LENGTH_BYTES PB_LTYPE_FIXED_LENGTH_BYTES + +/* These macros are used for giving out error messages. 
+ * They are mostly a debugging aid; the main error information
+ * is the true/false return value from functions.
+ * Some code space can be saved by disabling the error
+ * messages if not used.
+ *
+ * PB_SET_ERROR() sets the error message if none has been set yet.
+ * msg must be a constant string literal.
+ * PB_GET_ERROR() always returns a pointer to a string.
+ * PB_RETURN_ERROR() sets the error and returns false from current
+ * function.
+ */
+#ifdef PB_NO_ERRMSG
+#define PB_SET_ERROR(stream, msg) PB_UNUSED(stream)
+#define PB_GET_ERROR(stream) "(errmsg disabled)"
+#else
+#define PB_SET_ERROR(stream, msg) (stream->errmsg = (stream)->errmsg ? (stream)->errmsg : (msg))
+#define PB_GET_ERROR(stream) ((stream)->errmsg ? (stream)->errmsg : "(none)")
+#endif
+
+#define PB_RETURN_ERROR(stream, msg) return PB_SET_ERROR(stream, msg), false
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#ifdef __cplusplus
+#if __cplusplus >= 201103L
+#define PB_CONSTEXPR constexpr
+#else  // __cplusplus >= 201103L
+#define PB_CONSTEXPR
+#endif  // __cplusplus >= 201103L
+
+#if __cplusplus >= 201703L
+#define PB_INLINE_CONSTEXPR inline constexpr
+#else  // __cplusplus >= 201703L
+#define PB_INLINE_CONSTEXPR PB_CONSTEXPR
+#endif  // __cplusplus >= 201703L
+
+extern "C++"
+{
+namespace nanopb {
+// Each type will be partially specialized by the generator.
+template <typename T> struct MessageDescriptor;
+} // namespace nanopb
+}
+#endif  /* __cplusplus */
+
+#endif
diff --git a/src/third_party/librdkafka/dist/src/nanopb/pb_common.c b/src/third_party/librdkafka/dist/src/nanopb/pb_common.c
new file mode 100644
index 00000000000..e4765d8a6cf
--- /dev/null
+++ b/src/third_party/librdkafka/dist/src/nanopb/pb_common.c
@@ -0,0 +1,388 @@
+/* pb_common.c: Common support functions for pb_encode.c and pb_decode.c.
+ * + * 2014 Petteri Aimonen + */ + +#include "nanopb/pb_common.h" + +static bool load_descriptor_values(pb_field_iter_t *iter) +{ + uint32_t word0; + uint32_t data_offset; + int_least8_t size_offset; + + if (iter->index >= iter->descriptor->field_count) + return false; + + word0 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index]); + iter->type = (pb_type_t)((word0 >> 8) & 0xFF); + + switch(word0 & 3) + { + case 0: { + /* 1-word format */ + iter->array_size = 1; + iter->tag = (pb_size_t)((word0 >> 2) & 0x3F); + size_offset = (int_least8_t)((word0 >> 24) & 0x0F); + data_offset = (word0 >> 16) & 0xFF; + iter->data_size = (pb_size_t)((word0 >> 28) & 0x0F); + break; + } + + case 1: { + /* 2-word format */ + uint32_t word1 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 1]); + + iter->array_size = (pb_size_t)((word0 >> 16) & 0x0FFF); + iter->tag = (pb_size_t)(((word0 >> 2) & 0x3F) | ((word1 >> 28) << 6)); + size_offset = (int_least8_t)((word0 >> 28) & 0x0F); + data_offset = word1 & 0xFFFF; + iter->data_size = (pb_size_t)((word1 >> 16) & 0x0FFF); + break; + } + + case 2: { + /* 4-word format */ + uint32_t word1 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 1]); + uint32_t word2 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 2]); + uint32_t word3 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 3]); + + iter->array_size = (pb_size_t)(word0 >> 16); + iter->tag = (pb_size_t)(((word0 >> 2) & 0x3F) | ((word1 >> 8) << 6)); + size_offset = (int_least8_t)(word1 & 0xFF); + data_offset = word2; + iter->data_size = (pb_size_t)word3; + break; + } + + default: { + /* 8-word format */ + uint32_t word1 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 1]); + uint32_t word2 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 2]); + uint32_t word3 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 3]); + uint32_t word4 = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index + 4]); + + iter->array_size = (pb_size_t)word4; + iter->tag = (pb_size_t)(((word0 >> 2) & 0x3F) | ((word1 >> 8) << 6)); + size_offset = (int_least8_t)(word1 & 0xFF); + data_offset = word2; + iter->data_size = (pb_size_t)word3; + break; + } + } + + if (!iter->message) + { + /* Avoid doing arithmetic on null pointers, it is undefined */ + iter->pField = NULL; + iter->pSize = NULL; + } + else + { + iter->pField = (char*)iter->message + data_offset; + + if (size_offset) + { + iter->pSize = (char*)iter->pField - size_offset; + } + else if (PB_HTYPE(iter->type) == PB_HTYPE_REPEATED && + (PB_ATYPE(iter->type) == PB_ATYPE_STATIC || + PB_ATYPE(iter->type) == PB_ATYPE_POINTER)) + { + /* Fixed count array */ + iter->pSize = &iter->array_size; + } + else + { + iter->pSize = NULL; + } + + if (PB_ATYPE(iter->type) == PB_ATYPE_POINTER && iter->pField != NULL) + { + iter->pData = *(void**)iter->pField; + } + else + { + iter->pData = iter->pField; + } + } + + if (PB_LTYPE_IS_SUBMSG(iter->type)) + { + iter->submsg_desc = iter->descriptor->submsg_info[iter->submessage_index]; + } + else + { + iter->submsg_desc = NULL; + } + + return true; +} + +static void advance_iterator(pb_field_iter_t *iter) +{ + iter->index++; + + if (iter->index >= iter->descriptor->field_count) + { + /* Restart */ + iter->index = 0; + iter->field_info_index = 0; + iter->submessage_index = 0; + iter->required_field_index = 0; + } + else + { + /* Increment indexes based 
on previous field type. + * All field info formats have the following fields: + * - lowest 2 bits tell the amount of words in the descriptor (2^n words) + * - bits 2..7 give the lowest bits of tag number. + * - bits 8..15 give the field type. + */ + uint32_t prev_descriptor = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index]); + pb_type_t prev_type = (prev_descriptor >> 8) & 0xFF; + pb_size_t descriptor_len = (pb_size_t)(1 << (prev_descriptor & 3)); + + /* Add to fields. + * The cast to pb_size_t is needed to avoid -Wconversion warning. + * Because the data is is constants from generator, there is no danger of overflow. + */ + iter->field_info_index = (pb_size_t)(iter->field_info_index + descriptor_len); + iter->required_field_index = (pb_size_t)(iter->required_field_index + (PB_HTYPE(prev_type) == PB_HTYPE_REQUIRED)); + iter->submessage_index = (pb_size_t)(iter->submessage_index + PB_LTYPE_IS_SUBMSG(prev_type)); + } +} + +bool pb_field_iter_begin(pb_field_iter_t *iter, const pb_msgdesc_t *desc, void *message) +{ + memset(iter, 0, sizeof(*iter)); + + iter->descriptor = desc; + iter->message = message; + + return load_descriptor_values(iter); +} + +bool pb_field_iter_begin_extension(pb_field_iter_t *iter, pb_extension_t *extension) +{ + const pb_msgdesc_t *msg = (const pb_msgdesc_t*)extension->type->arg; + bool status; + + uint32_t word0 = PB_PROGMEM_READU32(msg->field_info[0]); + if (PB_ATYPE(word0 >> 8) == PB_ATYPE_POINTER) + { + /* For pointer extensions, the pointer is stored directly + * in the extension structure. This avoids having an extra + * indirection. */ + status = pb_field_iter_begin(iter, msg, &extension->dest); + } + else + { + status = pb_field_iter_begin(iter, msg, extension->dest); + } + + iter->pSize = &extension->found; + return status; +} + +bool pb_field_iter_next(pb_field_iter_t *iter) +{ + advance_iterator(iter); + (void)load_descriptor_values(iter); + return iter->index != 0; +} + +bool pb_field_iter_find(pb_field_iter_t *iter, uint32_t tag) +{ + if (iter->tag == tag) + { + return true; /* Nothing to do, correct field already. */ + } + else if (tag > iter->descriptor->largest_tag) + { + return false; + } + else + { + pb_size_t start = iter->index; + uint32_t fieldinfo; + + if (tag < iter->tag) + { + /* Fields are in tag number order, so we know that tag is between + * 0 and our start position. Setting index to end forces + * advance_iterator() call below to restart from beginning. */ + iter->index = iter->descriptor->field_count; + } + + do + { + /* Advance iterator but don't load values yet */ + advance_iterator(iter); + + /* Do fast check for tag number match */ + fieldinfo = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index]); + + if (((fieldinfo >> 2) & 0x3F) == (tag & 0x3F)) + { + /* Good candidate, check further */ + (void)load_descriptor_values(iter); + + if (iter->tag == tag && + PB_LTYPE(iter->type) != PB_LTYPE_EXTENSION) + { + /* Found it */ + return true; + } + } + } while (iter->index != start); + + /* Searched all the way back to start, and found nothing. 
*/ + (void)load_descriptor_values(iter); + return false; + } +} + +bool pb_field_iter_find_extension(pb_field_iter_t *iter) +{ + if (PB_LTYPE(iter->type) == PB_LTYPE_EXTENSION) + { + return true; + } + else + { + pb_size_t start = iter->index; + uint32_t fieldinfo; + + do + { + /* Advance iterator but don't load values yet */ + advance_iterator(iter); + + /* Do fast check for field type */ + fieldinfo = PB_PROGMEM_READU32(iter->descriptor->field_info[iter->field_info_index]); + + if (PB_LTYPE((fieldinfo >> 8) & 0xFF) == PB_LTYPE_EXTENSION) + { + return load_descriptor_values(iter); + } + } while (iter->index != start); + + /* Searched all the way back to start, and found nothing. */ + (void)load_descriptor_values(iter); + return false; + } +} + +static void *pb_const_cast(const void *p) +{ + /* Note: this casts away const, in order to use the common field iterator + * logic for both encoding and decoding. The cast is done using union + * to avoid spurious compiler warnings. */ + union { + void *p1; + const void *p2; + } t; + t.p2 = p; + return t.p1; +} + +bool pb_field_iter_begin_const(pb_field_iter_t *iter, const pb_msgdesc_t *desc, const void *message) +{ + return pb_field_iter_begin(iter, desc, pb_const_cast(message)); +} + +bool pb_field_iter_begin_extension_const(pb_field_iter_t *iter, const pb_extension_t *extension) +{ + return pb_field_iter_begin_extension(iter, (pb_extension_t*)pb_const_cast(extension)); +} + +bool pb_default_field_callback(pb_istream_t *istream, pb_ostream_t *ostream, const pb_field_t *field) +{ + if (field->data_size == sizeof(pb_callback_t)) + { + pb_callback_t *pCallback = (pb_callback_t*)field->pData; + + if (pCallback != NULL) + { + if (istream != NULL && pCallback->funcs.decode != NULL) + { + return pCallback->funcs.decode(istream, field, &pCallback->arg); + } + + if (ostream != NULL && pCallback->funcs.encode != NULL) + { + return pCallback->funcs.encode(ostream, field, &pCallback->arg); + } + } + } + + return true; /* Success, but didn't do anything */ + +} + +#ifdef PB_VALIDATE_UTF8 + +/* This function checks whether a string is valid UTF-8 text. + * + * Algorithm is adapted from https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c + * Original copyright: Markus Kuhn 2005-03-30 + * Licensed under "Short code license", which allows use under MIT license or + * any compatible with it. + */ + +bool pb_validate_utf8(const char *str) +{ + const pb_byte_t *s = (const pb_byte_t*)str; + while (*s) + { + if (*s < 0x80) + { + /* 0xxxxxxx */ + s++; + } + else if ((s[0] & 0xe0) == 0xc0) + { + /* 110XXXXx 10xxxxxx */ + if ((s[1] & 0xc0) != 0x80 || + (s[0] & 0xfe) == 0xc0) /* overlong? */ + return false; + else + s += 2; + } + else if ((s[0] & 0xf0) == 0xe0) + { + /* 1110XXXX 10Xxxxxx 10xxxxxx */ + if ((s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */ + (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */ + (s[0] == 0xef && s[1] == 0xbf && + (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */ + return false; + else + s += 3; + } + else if ((s[0] & 0xf8) == 0xf0) + { + /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */ + if ((s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[3] & 0xc0) != 0x80 || + (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */ + (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? 
*/ + return false; + else + s += 4; + } + else + { + return false; + } + } + + return true; +} + +#endif + diff --git a/src/third_party/librdkafka/dist/src/nanopb/pb_common.h b/src/third_party/librdkafka/dist/src/nanopb/pb_common.h new file mode 100644 index 00000000000..dda3af3b96f --- /dev/null +++ b/src/third_party/librdkafka/dist/src/nanopb/pb_common.h @@ -0,0 +1,49 @@ +/* pb_common.h: Common support functions for pb_encode.c and pb_decode.c. + * These functions are rarely needed by applications directly. + */ + +#ifndef PB_COMMON_H_INCLUDED +#define PB_COMMON_H_INCLUDED + +#include "nanopb/pb.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Initialize the field iterator structure to beginning. + * Returns false if the message type is empty. */ +bool pb_field_iter_begin(pb_field_iter_t *iter, const pb_msgdesc_t *desc, void *message); + +/* Get a field iterator for extension field. */ +bool pb_field_iter_begin_extension(pb_field_iter_t *iter, pb_extension_t *extension); + +/* Same as pb_field_iter_begin(), but for const message pointer. + * Note that the pointers in pb_field_iter_t will be non-const but shouldn't + * be written to when using these functions. */ +bool pb_field_iter_begin_const(pb_field_iter_t *iter, const pb_msgdesc_t *desc, const void *message); +bool pb_field_iter_begin_extension_const(pb_field_iter_t *iter, const pb_extension_t *extension); + +/* Advance the iterator to the next field. + * Returns false when the iterator wraps back to the first field. */ +bool pb_field_iter_next(pb_field_iter_t *iter); + +/* Advance the iterator until it points at a field with the given tag. + * Returns false if no such field exists. */ +bool pb_field_iter_find(pb_field_iter_t *iter, uint32_t tag); + +/* Find a field with type PB_LTYPE_EXTENSION, or return false if not found. + * There can be only one extension range field per message. */ +bool pb_field_iter_find_extension(pb_field_iter_t *iter); + +#ifdef PB_VALIDATE_UTF8 +/* Validate UTF-8 text string */ +bool pb_validate_utf8(const char *s); +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif + diff --git a/src/third_party/librdkafka/dist/src/nanopb/pb_decode.c b/src/third_party/librdkafka/dist/src/nanopb/pb_decode.c new file mode 100644 index 00000000000..28ad344f575 --- /dev/null +++ b/src/third_party/librdkafka/dist/src/nanopb/pb_decode.c @@ -0,0 +1,1727 @@ +/* pb_decode.c -- decode a protobuf using minimal resources + * + * 2011 Petteri Aimonen + */ + +/* Use the GCC warn_unused_result attribute to check that all return values + * are propagated correctly. On other compilers and gcc before 3.4.0 just + * ignore the annotation. 
+ */ +#if !defined(__GNUC__) || ( __GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 4) + #define checkreturn +#else + #define checkreturn __attribute__((warn_unused_result)) +#endif + +#include "nanopb/pb.h" +#include "nanopb/pb_decode.h" +#include "nanopb/pb_common.h" + +/************************************** + * Declarations internal to this file * + **************************************/ + +static bool checkreturn buf_read(pb_istream_t *stream, pb_byte_t *buf, size_t count); +static bool checkreturn pb_decode_varint32_eof(pb_istream_t *stream, uint32_t *dest, bool *eof); +static bool checkreturn read_raw_value(pb_istream_t *stream, pb_wire_type_t wire_type, pb_byte_t *buf, size_t *size); +static bool checkreturn decode_basic_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field); +static bool checkreturn decode_static_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field); +static bool checkreturn decode_pointer_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field); +static bool checkreturn decode_callback_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field); +static bool checkreturn decode_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field); +static bool checkreturn default_extension_decoder(pb_istream_t *stream, pb_extension_t *extension, uint32_t tag, pb_wire_type_t wire_type); +static bool checkreturn decode_extension(pb_istream_t *stream, uint32_t tag, pb_wire_type_t wire_type, pb_extension_t *extension); +static bool pb_field_set_to_default(pb_field_iter_t *field); +static bool pb_message_set_to_defaults(pb_field_iter_t *iter); +static bool checkreturn pb_dec_bool(pb_istream_t *stream, const pb_field_iter_t *field); +static bool checkreturn pb_dec_varint(pb_istream_t *stream, const pb_field_iter_t *field); +static bool checkreturn pb_dec_bytes(pb_istream_t *stream, const pb_field_iter_t *field); +static bool checkreturn pb_dec_string(pb_istream_t *stream, const pb_field_iter_t *field); +static bool checkreturn pb_dec_submessage(pb_istream_t *stream, const pb_field_iter_t *field); +static bool checkreturn pb_dec_fixed_length_bytes(pb_istream_t *stream, const pb_field_iter_t *field); +static bool checkreturn pb_skip_varint(pb_istream_t *stream); +static bool checkreturn pb_skip_string(pb_istream_t *stream); + +#ifdef PB_ENABLE_MALLOC +static bool checkreturn allocate_field(pb_istream_t *stream, void *pData, size_t data_size, size_t array_size); +static void initialize_pointer_field(void *pItem, pb_field_iter_t *field); +static bool checkreturn pb_release_union_field(pb_istream_t *stream, pb_field_iter_t *field); +static void pb_release_single_field(pb_field_iter_t *field); +#endif + +#ifdef PB_WITHOUT_64BIT +#define pb_int64_t int32_t +#define pb_uint64_t uint32_t +#else +#define pb_int64_t int64_t +#define pb_uint64_t uint64_t +#endif + +typedef struct { + uint32_t bitfield[(PB_MAX_REQUIRED_FIELDS + 31) / 32]; +} pb_fields_seen_t; + +/******************************* + * pb_istream_t implementation * + *******************************/ + +static bool checkreturn buf_read(pb_istream_t *stream, pb_byte_t *buf, size_t count) +{ + const pb_byte_t *source = (const pb_byte_t*)stream->state; + stream->state = (pb_byte_t*)stream->state + count; + + if (buf != NULL) + { + memcpy(buf, source, count * sizeof(pb_byte_t)); + } + + return true; +} + +bool checkreturn pb_read(pb_istream_t *stream, pb_byte_t *buf, size_t count) +{ + if (count == 0) + return true; + +#ifndef 
PB_BUFFER_ONLY + if (buf == NULL && stream->callback != buf_read) + { + /* Skip input bytes */ + pb_byte_t tmp[16]; + while (count > 16) + { + if (!pb_read(stream, tmp, 16)) + return false; + + count -= 16; + } + + return pb_read(stream, tmp, count); + } +#endif + + if (stream->bytes_left < count) + PB_RETURN_ERROR(stream, "end-of-stream"); + +#ifndef PB_BUFFER_ONLY + if (!stream->callback(stream, buf, count)) + PB_RETURN_ERROR(stream, "io error"); +#else + if (!buf_read(stream, buf, count)) + return false; +#endif + + if (stream->bytes_left < count) + stream->bytes_left = 0; + else + stream->bytes_left -= count; + + return true; +} + +/* Read a single byte from input stream. buf may not be NULL. + * This is an optimization for the varint decoding. */ +static bool checkreturn pb_readbyte(pb_istream_t *stream, pb_byte_t *buf) +{ + if (stream->bytes_left == 0) + PB_RETURN_ERROR(stream, "end-of-stream"); + +#ifndef PB_BUFFER_ONLY + if (!stream->callback(stream, buf, 1)) + PB_RETURN_ERROR(stream, "io error"); +#else + *buf = *(const pb_byte_t*)stream->state; + stream->state = (pb_byte_t*)stream->state + 1; +#endif + + stream->bytes_left--; + + return true; +} + +pb_istream_t pb_istream_from_buffer(const pb_byte_t *buf, size_t msglen) +{ + pb_istream_t stream; + /* Cast away the const from buf without a compiler error. We are + * careful to use it only in a const manner in the callbacks. + */ + union { + void *state; + const void *c_state; + } state; +#ifdef PB_BUFFER_ONLY + stream.callback = NULL; +#else + stream.callback = &buf_read; +#endif + state.c_state = buf; + stream.state = state.state; + stream.bytes_left = msglen; +#ifndef PB_NO_ERRMSG + stream.errmsg = NULL; +#endif + return stream; +} + +/******************** + * Helper functions * + ********************/ + +static bool checkreturn pb_decode_varint32_eof(pb_istream_t *stream, uint32_t *dest, bool *eof) +{ + pb_byte_t byte; + uint32_t result; + + if (!pb_readbyte(stream, &byte)) + { + if (stream->bytes_left == 0) + { + if (eof) + { + *eof = true; + } + } + + return false; + } + + if ((byte & 0x80) == 0) + { + /* Quick case, 1 byte value */ + result = byte; + } + else + { + /* Multibyte case */ + uint_fast8_t bitpos = 7; + result = byte & 0x7F; + + do + { + if (!pb_readbyte(stream, &byte)) + return false; + + if (bitpos >= 32) + { + /* Note: The varint could have trailing 0x80 bytes, or 0xFF for negative. */ + pb_byte_t sign_extension = (bitpos < 63) ? 
0xFF : 0x01; + bool valid_extension = ((byte & 0x7F) == 0x00 || + ((result >> 31) != 0 && byte == sign_extension)); + + if (bitpos >= 64 || !valid_extension) + { + PB_RETURN_ERROR(stream, "varint overflow"); + } + } + else if (bitpos == 28) + { + if ((byte & 0x70) != 0 && (byte & 0x78) != 0x78) + { + PB_RETURN_ERROR(stream, "varint overflow"); + } + result |= (uint32_t)(byte & 0x0F) << bitpos; + } + else + { + result |= (uint32_t)(byte & 0x7F) << bitpos; + } + bitpos = (uint_fast8_t)(bitpos + 7); + } while (byte & 0x80); + } + + *dest = result; + return true; +} + +bool checkreturn pb_decode_varint32(pb_istream_t *stream, uint32_t *dest) +{ + return pb_decode_varint32_eof(stream, dest, NULL); +} + +#ifndef PB_WITHOUT_64BIT +bool checkreturn pb_decode_varint(pb_istream_t *stream, uint64_t *dest) +{ + pb_byte_t byte; + uint_fast8_t bitpos = 0; + uint64_t result = 0; + + do + { + if (!pb_readbyte(stream, &byte)) + return false; + + if (bitpos >= 63 && (byte & 0xFE) != 0) + PB_RETURN_ERROR(stream, "varint overflow"); + + result |= (uint64_t)(byte & 0x7F) << bitpos; + bitpos = (uint_fast8_t)(bitpos + 7); + } while (byte & 0x80); + + *dest = result; + return true; +} +#endif + +bool checkreturn pb_skip_varint(pb_istream_t *stream) +{ + pb_byte_t byte; + do + { + if (!pb_read(stream, &byte, 1)) + return false; + } while (byte & 0x80); + return true; +} + +bool checkreturn pb_skip_string(pb_istream_t *stream) +{ + uint32_t length; + if (!pb_decode_varint32(stream, &length)) + return false; + + if ((size_t)length != length) + { + PB_RETURN_ERROR(stream, "size too large"); + } + + return pb_read(stream, NULL, (size_t)length); +} + +bool checkreturn pb_decode_tag(pb_istream_t *stream, pb_wire_type_t *wire_type, uint32_t *tag, bool *eof) +{ + uint32_t temp; + *eof = false; + *wire_type = (pb_wire_type_t) 0; + *tag = 0; + + if (!pb_decode_varint32_eof(stream, &temp, eof)) + { + return false; + } + + *tag = temp >> 3; + *wire_type = (pb_wire_type_t)(temp & 7); + return true; +} + +bool checkreturn pb_skip_field(pb_istream_t *stream, pb_wire_type_t wire_type) +{ + switch (wire_type) + { + case PB_WT_VARINT: return pb_skip_varint(stream); + case PB_WT_64BIT: return pb_read(stream, NULL, 8); + case PB_WT_STRING: return pb_skip_string(stream); + case PB_WT_32BIT: return pb_read(stream, NULL, 4); + default: PB_RETURN_ERROR(stream, "invalid wire_type"); + } +} + +/* Read a raw value to buffer, for the purpose of passing it to callback as + * a substream. Size is maximum size on call, and actual size on return. + */ +static bool checkreturn read_raw_value(pb_istream_t *stream, pb_wire_type_t wire_type, pb_byte_t *buf, size_t *size) +{ + size_t max_size = *size; + switch (wire_type) + { + case PB_WT_VARINT: + *size = 0; + do + { + (*size)++; + if (*size > max_size) + PB_RETURN_ERROR(stream, "varint overflow"); + + if (!pb_read(stream, buf, 1)) + return false; + } while (*buf++ & 0x80); + return true; + + case PB_WT_64BIT: + *size = 8; + return pb_read(stream, buf, 8); + + case PB_WT_32BIT: + *size = 4; + return pb_read(stream, buf, 4); + + case PB_WT_STRING: + /* Calling read_raw_value with a PB_WT_STRING is an error. + * Explicitly handle this case and fallthrough to default to avoid + * compiler warnings. + */ + + default: PB_RETURN_ERROR(stream, "invalid wire_type"); + } +} + +/* Decode string length from stream and return a substream with limited length. + * Remember to close the substream using pb_close_string_substream(). 
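+ *
+ * Usage sketch (a hypothetical PB_WT_STRING field callback; names are
+ * placeholders, not part of this file):
+ *
+ *    pb_istream_t sub;
+ *    if (!pb_make_string_substream(stream, &sub))
+ *        return false;
+ *    while (sub.bytes_left > 0)
+ *    {
+ *        ... consume bytes from &sub with pb_read() ...
+ *    }
+ *    if (!pb_close_string_substream(stream, &sub))
+ *        return false;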
+ */ +bool checkreturn pb_make_string_substream(pb_istream_t *stream, pb_istream_t *substream) +{ + uint32_t size; + if (!pb_decode_varint32(stream, &size)) + return false; + + *substream = *stream; + if (substream->bytes_left < size) + PB_RETURN_ERROR(stream, "parent stream too short"); + + substream->bytes_left = (size_t)size; + stream->bytes_left -= (size_t)size; + return true; +} + +bool checkreturn pb_close_string_substream(pb_istream_t *stream, pb_istream_t *substream) +{ + if (substream->bytes_left) { + if (!pb_read(substream, NULL, substream->bytes_left)) + return false; + } + + stream->state = substream->state; + +#ifndef PB_NO_ERRMSG + stream->errmsg = substream->errmsg; +#endif + return true; +} + +/************************* + * Decode a single field * + *************************/ + +static bool checkreturn decode_basic_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field) +{ + switch (PB_LTYPE(field->type)) + { + case PB_LTYPE_BOOL: + if (wire_type != PB_WT_VARINT && wire_type != PB_WT_PACKED) + PB_RETURN_ERROR(stream, "wrong wire type"); + + return pb_dec_bool(stream, field); + + case PB_LTYPE_VARINT: + case PB_LTYPE_UVARINT: + case PB_LTYPE_SVARINT: + if (wire_type != PB_WT_VARINT && wire_type != PB_WT_PACKED) + PB_RETURN_ERROR(stream, "wrong wire type"); + + return pb_dec_varint(stream, field); + + case PB_LTYPE_FIXED32: + if (wire_type != PB_WT_32BIT && wire_type != PB_WT_PACKED) + PB_RETURN_ERROR(stream, "wrong wire type"); + + return pb_decode_fixed32(stream, field->pData); + + case PB_LTYPE_FIXED64: + if (wire_type != PB_WT_64BIT && wire_type != PB_WT_PACKED) + PB_RETURN_ERROR(stream, "wrong wire type"); + +#ifdef PB_CONVERT_DOUBLE_FLOAT + if (field->data_size == sizeof(float)) + { + return pb_decode_double_as_float(stream, (float*)field->pData); + } +#endif + +#ifdef PB_WITHOUT_64BIT + PB_RETURN_ERROR(stream, "invalid data_size"); +#else + return pb_decode_fixed64(stream, field->pData); +#endif + + case PB_LTYPE_BYTES: + if (wire_type != PB_WT_STRING) + PB_RETURN_ERROR(stream, "wrong wire type"); + + return pb_dec_bytes(stream, field); + + case PB_LTYPE_STRING: + if (wire_type != PB_WT_STRING) + PB_RETURN_ERROR(stream, "wrong wire type"); + + return pb_dec_string(stream, field); + + case PB_LTYPE_SUBMESSAGE: + case PB_LTYPE_SUBMSG_W_CB: + if (wire_type != PB_WT_STRING) + PB_RETURN_ERROR(stream, "wrong wire type"); + + return pb_dec_submessage(stream, field); + + case PB_LTYPE_FIXED_LENGTH_BYTES: + if (wire_type != PB_WT_STRING) + PB_RETURN_ERROR(stream, "wrong wire type"); + + return pb_dec_fixed_length_bytes(stream, field); + + default: + PB_RETURN_ERROR(stream, "invalid field type"); + } +} + +static bool checkreturn decode_static_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field) +{ + switch (PB_HTYPE(field->type)) + { + case PB_HTYPE_REQUIRED: + return decode_basic_field(stream, wire_type, field); + + case PB_HTYPE_OPTIONAL: + if (field->pSize != NULL) + *(bool*)field->pSize = true; + return decode_basic_field(stream, wire_type, field); + + case PB_HTYPE_REPEATED: + if (wire_type == PB_WT_STRING + && PB_LTYPE(field->type) <= PB_LTYPE_LAST_PACKABLE) + { + /* Packed array */ + bool status = true; + pb_istream_t substream; + pb_size_t *size = (pb_size_t*)field->pSize; + field->pData = (char*)field->pField + field->data_size * (*size); + + if (!pb_make_string_substream(stream, &substream)) + return false; + + while (substream.bytes_left > 0 && *size < field->array_size) + { + if (!decode_basic_field(&substream, 
PB_WT_PACKED, field)) + { + status = false; + break; + } + (*size)++; + field->pData = (char*)field->pData + field->data_size; + } + + if (substream.bytes_left != 0) + PB_RETURN_ERROR(stream, "array overflow"); + if (!pb_close_string_substream(stream, &substream)) + return false; + + return status; + } + else + { + /* Repeated field */ + pb_size_t *size = (pb_size_t*)field->pSize; + field->pData = (char*)field->pField + field->data_size * (*size); + + if ((*size)++ >= field->array_size) + PB_RETURN_ERROR(stream, "array overflow"); + + return decode_basic_field(stream, wire_type, field); + } + + case PB_HTYPE_ONEOF: + if (PB_LTYPE_IS_SUBMSG(field->type) && + *(pb_size_t*)field->pSize != field->tag) + { + /* We memset to zero so that any callbacks are set to NULL. + * This is because the callbacks might otherwise have values + * from some other union field. + * If callbacks are needed inside oneof field, use .proto + * option submsg_callback to have a separate callback function + * that can set the fields before submessage is decoded. + * pb_dec_submessage() will set any default values. */ + memset(field->pData, 0, (size_t)field->data_size); + + /* Set default values for the submessage fields. */ + if (field->submsg_desc->default_value != NULL || + field->submsg_desc->field_callback != NULL || + field->submsg_desc->submsg_info[0] != NULL) + { + pb_field_iter_t submsg_iter; + if (pb_field_iter_begin(&submsg_iter, field->submsg_desc, field->pData)) + { + if (!pb_message_set_to_defaults(&submsg_iter)) + PB_RETURN_ERROR(stream, "failed to set defaults"); + } + } + } + *(pb_size_t*)field->pSize = field->tag; + + return decode_basic_field(stream, wire_type, field); + + default: + PB_RETURN_ERROR(stream, "invalid field type"); + } +} + +#ifdef PB_ENABLE_MALLOC +/* Allocate storage for the field and store the pointer at iter->pData. + * array_size is the number of entries to reserve in an array. + * Zero size is not allowed, use pb_free() for releasing. + */ +static bool checkreturn allocate_field(pb_istream_t *stream, void *pData, size_t data_size, size_t array_size) +{ + void *ptr = *(void**)pData; + + if (data_size == 0 || array_size == 0) + PB_RETURN_ERROR(stream, "invalid size"); + +#ifdef __AVR__ + /* Workaround for AVR libc bug 53284: http://savannah.nongnu.org/bugs/?53284 + * Realloc to size of 1 byte can cause corruption of the malloc structures. + */ + if (data_size == 1 && array_size == 1) + { + data_size = 2; + } +#endif + + /* Check for multiplication overflows. + * This code avoids the costly division if the sizes are small enough. + * Multiplication is safe as long as only half of bits are set + * in either multiplicand. + */ + { + const size_t check_limit = (size_t)1 << (sizeof(size_t) * 4); + if (data_size >= check_limit || array_size >= check_limit) + { + const size_t size_max = (size_t)-1; + if (size_max / array_size < data_size) + { + PB_RETURN_ERROR(stream, "size too large"); + } + } + } + + /* Allocate new or expand previous allocation */ + /* Note: on failure the old pointer will remain in the structure, + * the message must be freed by caller also on error return. */ + ptr = pb_realloc(ptr, array_size * data_size); + if (ptr == NULL) + PB_RETURN_ERROR(stream, "realloc failed"); + + *(void**)pData = ptr; + return true; +} + +/* Clear a newly allocated item in case it contains a pointer, or is a submessage. 
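+ * String and bytes entries are set to NULL so that the field decoders
+ * allocate fresh storage; submessage entries are zeroed so that stale
+ * callback pointers from a previous allocation are never invoked.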
*/ +static void initialize_pointer_field(void *pItem, pb_field_iter_t *field) +{ + if (PB_LTYPE(field->type) == PB_LTYPE_STRING || + PB_LTYPE(field->type) == PB_LTYPE_BYTES) + { + *(void**)pItem = NULL; + } + else if (PB_LTYPE_IS_SUBMSG(field->type)) + { + /* We memset to zero so that any callbacks are set to NULL. + * Default values will be set by pb_dec_submessage(). */ + memset(pItem, 0, field->data_size); + } +} +#endif + +static bool checkreturn decode_pointer_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field) +{ +#ifndef PB_ENABLE_MALLOC + PB_UNUSED(wire_type); + PB_UNUSED(field); + PB_RETURN_ERROR(stream, "no malloc support"); +#else + switch (PB_HTYPE(field->type)) + { + case PB_HTYPE_REQUIRED: + case PB_HTYPE_OPTIONAL: + case PB_HTYPE_ONEOF: + if (PB_LTYPE_IS_SUBMSG(field->type) && *(void**)field->pField != NULL) + { + /* Duplicate field, have to release the old allocation first. */ + /* FIXME: Does this work correctly for oneofs? */ + pb_release_single_field(field); + } + + if (PB_HTYPE(field->type) == PB_HTYPE_ONEOF) + { + *(pb_size_t*)field->pSize = field->tag; + } + + if (PB_LTYPE(field->type) == PB_LTYPE_STRING || + PB_LTYPE(field->type) == PB_LTYPE_BYTES) + { + /* pb_dec_string and pb_dec_bytes handle allocation themselves */ + field->pData = field->pField; + return decode_basic_field(stream, wire_type, field); + } + else + { + if (!allocate_field(stream, field->pField, field->data_size, 1)) + return false; + + field->pData = *(void**)field->pField; + initialize_pointer_field(field->pData, field); + return decode_basic_field(stream, wire_type, field); + } + + case PB_HTYPE_REPEATED: + if (wire_type == PB_WT_STRING + && PB_LTYPE(field->type) <= PB_LTYPE_LAST_PACKABLE) + { + /* Packed array, multiple items come in at once. */ + bool status = true; + pb_size_t *size = (pb_size_t*)field->pSize; + size_t allocated_size = *size; + pb_istream_t substream; + + if (!pb_make_string_substream(stream, &substream)) + return false; + + while (substream.bytes_left) + { + if (*size == PB_SIZE_MAX) + { +#ifndef PB_NO_ERRMSG + stream->errmsg = "too many array entries"; +#endif + status = false; + break; + } + + if ((size_t)*size + 1 > allocated_size) + { + /* Allocate more storage. This tries to guess the + * number of remaining entries. Round the division + * upwards. */ + size_t remain = (substream.bytes_left - 1) / field->data_size + 1; + if (remain < PB_SIZE_MAX - allocated_size) + allocated_size += remain; + else + allocated_size += 1; + + if (!allocate_field(&substream, field->pField, field->data_size, allocated_size)) + { + status = false; + break; + } + } + + /* Decode the array entry */ + field->pData = *(char**)field->pField + field->data_size * (*size); + if (field->pData == NULL) + { + /* Shouldn't happen, but satisfies static analyzers */ + status = false; + break; + } + initialize_pointer_field(field->pData, field); + if (!decode_basic_field(&substream, PB_WT_PACKED, field)) + { + status = false; + break; + } + + (*size)++; + } + if (!pb_close_string_substream(stream, &substream)) + return false; + + return status; + } + else + { + /* Normal repeated field, i.e. only one item at a time. 
*/ + pb_size_t *size = (pb_size_t*)field->pSize; + + if (*size == PB_SIZE_MAX) + PB_RETURN_ERROR(stream, "too many array entries"); + + if (!allocate_field(stream, field->pField, field->data_size, (size_t)(*size + 1))) + return false; + + field->pData = *(char**)field->pField + field->data_size * (*size); + (*size)++; + initialize_pointer_field(field->pData, field); + return decode_basic_field(stream, wire_type, field); + } + + default: + PB_RETURN_ERROR(stream, "invalid field type"); + } +#endif +} + +static bool checkreturn decode_callback_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field) +{ + if (!field->descriptor->field_callback) + return pb_skip_field(stream, wire_type); + + if (wire_type == PB_WT_STRING) + { + pb_istream_t substream; + size_t prev_bytes_left; + + if (!pb_make_string_substream(stream, &substream)) + return false; + + do + { + prev_bytes_left = substream.bytes_left; + if (!field->descriptor->field_callback(&substream, NULL, field)) + { + PB_SET_ERROR(stream, substream.errmsg ? substream.errmsg : "callback failed"); + return false; + } + } while (substream.bytes_left > 0 && substream.bytes_left < prev_bytes_left); + + if (!pb_close_string_substream(stream, &substream)) + return false; + + return true; + } + else + { + /* Copy the single scalar value to stack. + * This is required so that we can limit the stream length, + * which in turn allows to use same callback for packed and + * not-packed fields. */ + pb_istream_t substream; + pb_byte_t buffer[10]; + size_t size = sizeof(buffer); + + if (!read_raw_value(stream, wire_type, buffer, &size)) + return false; + substream = pb_istream_from_buffer(buffer, size); + + return field->descriptor->field_callback(&substream, NULL, field); + } +} + +static bool checkreturn decode_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *field) +{ +#ifdef PB_ENABLE_MALLOC + /* When decoding an oneof field, check if there is old data that must be + * released first. */ + if (PB_HTYPE(field->type) == PB_HTYPE_ONEOF) + { + if (!pb_release_union_field(stream, field)) + return false; + } +#endif + + switch (PB_ATYPE(field->type)) + { + case PB_ATYPE_STATIC: + return decode_static_field(stream, wire_type, field); + + case PB_ATYPE_POINTER: + return decode_pointer_field(stream, wire_type, field); + + case PB_ATYPE_CALLBACK: + return decode_callback_field(stream, wire_type, field); + + default: + PB_RETURN_ERROR(stream, "invalid field type"); + } +} + +/* Default handler for extension fields. Expects to have a pb_msgdesc_t + * pointer in the extension->type->arg field, pointing to a message with + * only one field in it. */ +static bool checkreturn default_extension_decoder(pb_istream_t *stream, + pb_extension_t *extension, uint32_t tag, pb_wire_type_t wire_type) +{ + pb_field_iter_t iter; + + if (!pb_field_iter_begin_extension(&iter, extension)) + PB_RETURN_ERROR(stream, "invalid extension"); + + if (iter.tag != tag || !iter.message) + return true; + + extension->found = true; + return decode_field(stream, wire_type, &iter); +} + +/* Try to decode an unknown field as an extension field. Tries each extension + * decoder in turn, until one of them handles the field or loop ends. 
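+ *
+ * The application registers extensions by chaining pb_extension_t nodes
+ * onto the message before decoding (sketch; my_ext_type, ext_value and
+ * MyMessage_fields are placeholder names):
+ *
+ *    pb_extension_t ext = {0};
+ *    ext.type = &my_ext_type;
+ *    ext.dest = &ext_value;
+ *    msg.extensions = &ext;
+ *    pb_decode(&stream, MyMessage_fields, &msg);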
*/ +static bool checkreturn decode_extension(pb_istream_t *stream, + uint32_t tag, pb_wire_type_t wire_type, pb_extension_t *extension) +{ + size_t pos = stream->bytes_left; + + while (extension != NULL && pos == stream->bytes_left) + { + bool status; + if (extension->type->decode) + status = extension->type->decode(stream, extension, tag, wire_type); + else + status = default_extension_decoder(stream, extension, tag, wire_type); + + if (!status) + return false; + + extension = extension->next; + } + + return true; +} + +/* Initialize message fields to default values, recursively */ +static bool pb_field_set_to_default(pb_field_iter_t *field) +{ + pb_type_t type; + type = field->type; + + if (PB_LTYPE(type) == PB_LTYPE_EXTENSION) + { + pb_extension_t *ext = *(pb_extension_t* const *)field->pData; + while (ext != NULL) + { + pb_field_iter_t ext_iter; + if (pb_field_iter_begin_extension(&ext_iter, ext)) + { + ext->found = false; + if (!pb_message_set_to_defaults(&ext_iter)) + return false; + } + ext = ext->next; + } + } + else if (PB_ATYPE(type) == PB_ATYPE_STATIC) + { + bool init_data = true; + if (PB_HTYPE(type) == PB_HTYPE_OPTIONAL && field->pSize != NULL) + { + /* Set has_field to false. Still initialize the optional field + * itself also. */ + *(bool*)field->pSize = false; + } + else if (PB_HTYPE(type) == PB_HTYPE_REPEATED || + PB_HTYPE(type) == PB_HTYPE_ONEOF) + { + /* REPEATED: Set array count to 0, no need to initialize contents. + ONEOF: Set which_field to 0. */ + *(pb_size_t*)field->pSize = 0; + init_data = false; + } + + if (init_data) + { + if (PB_LTYPE_IS_SUBMSG(field->type) && + (field->submsg_desc->default_value != NULL || + field->submsg_desc->field_callback != NULL || + field->submsg_desc->submsg_info[0] != NULL)) + { + /* Initialize submessage to defaults. + * Only needed if it has default values + * or callback/submessage fields. */ + pb_field_iter_t submsg_iter; + if (pb_field_iter_begin(&submsg_iter, field->submsg_desc, field->pData)) + { + if (!pb_message_set_to_defaults(&submsg_iter)) + return false; + } + } + else + { + /* Initialize to zeros */ + memset(field->pData, 0, (size_t)field->data_size); + } + } + } + else if (PB_ATYPE(type) == PB_ATYPE_POINTER) + { + /* Initialize the pointer to NULL. */ + *(void**)field->pField = NULL; + + /* Initialize array count to 0. 
*/ + if (PB_HTYPE(type) == PB_HTYPE_REPEATED || + PB_HTYPE(type) == PB_HTYPE_ONEOF) + { + *(pb_size_t*)field->pSize = 0; + } + } + else if (PB_ATYPE(type) == PB_ATYPE_CALLBACK) + { + /* Don't overwrite callback */ + } + + return true; +} + +static bool pb_message_set_to_defaults(pb_field_iter_t *iter) +{ + pb_istream_t defstream = PB_ISTREAM_EMPTY; + uint32_t tag = 0; + pb_wire_type_t wire_type = PB_WT_VARINT; + bool eof; + + if (iter->descriptor->default_value) + { + defstream = pb_istream_from_buffer(iter->descriptor->default_value, (size_t)-1); + if (!pb_decode_tag(&defstream, &wire_type, &tag, &eof)) + return false; + } + + do + { + if (!pb_field_set_to_default(iter)) + return false; + + if (tag != 0 && iter->tag == tag) + { + /* We have a default value for this field in the defstream */ + if (!decode_field(&defstream, wire_type, iter)) + return false; + if (!pb_decode_tag(&defstream, &wire_type, &tag, &eof)) + return false; + + if (iter->pSize) + *(bool*)iter->pSize = false; + } + } while (pb_field_iter_next(iter)); + + return true; +} + +/********************* + * Decode all fields * + *********************/ + +static bool checkreturn pb_decode_inner(pb_istream_t *stream, const pb_msgdesc_t *fields, void *dest_struct, unsigned int flags) +{ + uint32_t extension_range_start = 0; + pb_extension_t *extensions = NULL; + + /* 'fixed_count_field' and 'fixed_count_size' track position of a repeated fixed + * count field. This can only handle _one_ repeated fixed count field that + * is unpacked and unordered among other (non repeated fixed count) fields. + */ + pb_size_t fixed_count_field = PB_SIZE_MAX; + pb_size_t fixed_count_size = 0; + pb_size_t fixed_count_total_size = 0; + + pb_fields_seen_t fields_seen = {{0, 0}}; + const uint32_t allbits = ~(uint32_t)0; + pb_field_iter_t iter; + + if (pb_field_iter_begin(&iter, fields, dest_struct)) + { + if ((flags & PB_DECODE_NOINIT) == 0) + { + if (!pb_message_set_to_defaults(&iter)) + PB_RETURN_ERROR(stream, "failed to set defaults"); + } + } + + while (stream->bytes_left) + { + uint32_t tag; + pb_wire_type_t wire_type; + bool eof; + + if (!pb_decode_tag(stream, &wire_type, &tag, &eof)) + { + if (eof) + break; + else + return false; + } + + if (tag == 0) + { + if (flags & PB_DECODE_NULLTERMINATED) + { + break; + } + else + { + PB_RETURN_ERROR(stream, "zero tag"); + } + } + + if (!pb_field_iter_find(&iter, tag) || PB_LTYPE(iter.type) == PB_LTYPE_EXTENSION) + { + /* No match found, check if it matches an extension. */ + if (extension_range_start == 0) + { + if (pb_field_iter_find_extension(&iter)) + { + extensions = *(pb_extension_t* const *)iter.pData; + extension_range_start = iter.tag; + } + + if (!extensions) + { + extension_range_start = (uint32_t)-1; + } + } + + if (tag >= extension_range_start) + { + size_t pos = stream->bytes_left; + + if (!decode_extension(stream, tag, wire_type, extensions)) + return false; + + if (pos != stream->bytes_left) + { + /* The field was handled */ + continue; + } + } + + /* No match found, skip data */ + if (!pb_skip_field(stream, wire_type)) + return false; + continue; + } + + /* If a repeated fixed count field was found, get size from + * 'fixed_count_field' as there is no counter contained in the struct. + */ + if (PB_HTYPE(iter.type) == PB_HTYPE_REPEATED && iter.pSize == &iter.array_size) + { + if (fixed_count_field != iter.index) { + /* If the new fixed count field does not match the previous one, + * check that the previous one is NULL or that it finished + * receiving all the expected data. 
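+ * For example, a repeated int32 field generated with fixed_count:true and
+ * max_count:4 must decode exactly four elements, or decoding fails with
+ * "wrong size for fixed count field".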
+ */ + if (fixed_count_field != PB_SIZE_MAX && + fixed_count_size != fixed_count_total_size) + { + PB_RETURN_ERROR(stream, "wrong size for fixed count field"); + } + + fixed_count_field = iter.index; + fixed_count_size = 0; + fixed_count_total_size = iter.array_size; + } + + iter.pSize = &fixed_count_size; + } + + if (PB_HTYPE(iter.type) == PB_HTYPE_REQUIRED + && iter.required_field_index < PB_MAX_REQUIRED_FIELDS) + { + uint32_t tmp = ((uint32_t)1 << (iter.required_field_index & 31)); + fields_seen.bitfield[iter.required_field_index >> 5] |= tmp; + } + + if (!decode_field(stream, wire_type, &iter)) + return false; + } + + /* Check that all elements of the last decoded fixed count field were present. */ + if (fixed_count_field != PB_SIZE_MAX && + fixed_count_size != fixed_count_total_size) + { + PB_RETURN_ERROR(stream, "wrong size for fixed count field"); + } + + /* Check that all required fields were present. */ + { + pb_size_t req_field_count = iter.descriptor->required_field_count; + + if (req_field_count > 0) + { + pb_size_t i; + + if (req_field_count > PB_MAX_REQUIRED_FIELDS) + req_field_count = PB_MAX_REQUIRED_FIELDS; + + /* Check the whole words */ + for (i = 0; i < (req_field_count >> 5); i++) + { + if (fields_seen.bitfield[i] != allbits) + PB_RETURN_ERROR(stream, "missing required field"); + } + + /* Check the remaining bits (if any) */ + if ((req_field_count & 31) != 0) + { + if (fields_seen.bitfield[req_field_count >> 5] != + (allbits >> (uint_least8_t)(32 - (req_field_count & 31)))) + { + PB_RETURN_ERROR(stream, "missing required field"); + } + } + } + } + + return true; +} + +bool checkreturn pb_decode_ex(pb_istream_t *stream, const pb_msgdesc_t *fields, void *dest_struct, unsigned int flags) +{ + bool status; + + if ((flags & PB_DECODE_DELIMITED) == 0) + { + status = pb_decode_inner(stream, fields, dest_struct, flags); + } + else + { + pb_istream_t substream; + if (!pb_make_string_substream(stream, &substream)) + return false; + + status = pb_decode_inner(&substream, fields, dest_struct, flags); + + if (!pb_close_string_substream(stream, &substream)) + return false; + } + +#ifdef PB_ENABLE_MALLOC + if (!status) + pb_release(fields, dest_struct); +#endif + + return status; +} + +bool checkreturn pb_decode(pb_istream_t *stream, const pb_msgdesc_t *fields, void *dest_struct) +{ + bool status; + + status = pb_decode_inner(stream, fields, dest_struct, 0); + +#ifdef PB_ENABLE_MALLOC + if (!status) + pb_release(fields, dest_struct); +#endif + + return status; +} + +#ifdef PB_ENABLE_MALLOC +/* Given an oneof field, if there has already been a field inside this oneof, + * release it before overwriting with a different one. */ +static bool pb_release_union_field(pb_istream_t *stream, pb_field_iter_t *field) +{ + pb_field_iter_t old_field = *field; + pb_size_t old_tag = *(pb_size_t*)field->pSize; /* Previous which_ value */ + pb_size_t new_tag = field->tag; /* New which_ value */ + + if (old_tag == 0) + return true; /* Ok, no old data in union */ + + if (old_tag == new_tag) + return true; /* Ok, old data is of same type => merge */ + + /* Release old data. The find can fail if the message struct contains + * invalid data. */ + if (!pb_field_iter_find(&old_field, old_tag)) + PB_RETURN_ERROR(stream, "invalid union tag"); + + pb_release_single_field(&old_field); + + if (PB_ATYPE(field->type) == PB_ATYPE_POINTER) + { + /* Initialize the pointer to NULL to make sure it is valid + * even in case of error return. 
*/ + *(void**)field->pField = NULL; + field->pData = NULL; + } + + return true; +} + +static void pb_release_single_field(pb_field_iter_t *field) +{ + pb_type_t type; + type = field->type; + + if (PB_HTYPE(type) == PB_HTYPE_ONEOF) + { + if (*(pb_size_t*)field->pSize != field->tag) + return; /* This is not the current field in the union */ + } + + /* Release anything contained inside an extension or submsg. + * This has to be done even if the submsg itself is statically + * allocated. */ + if (PB_LTYPE(type) == PB_LTYPE_EXTENSION) + { + /* Release fields from all extensions in the linked list */ + pb_extension_t *ext = *(pb_extension_t**)field->pData; + while (ext != NULL) + { + pb_field_iter_t ext_iter; + if (pb_field_iter_begin_extension(&ext_iter, ext)) + { + pb_release_single_field(&ext_iter); + } + ext = ext->next; + } + } + else if (PB_LTYPE_IS_SUBMSG(type) && PB_ATYPE(type) != PB_ATYPE_CALLBACK) + { + /* Release fields in submessage or submsg array */ + pb_size_t count = 1; + + if (PB_ATYPE(type) == PB_ATYPE_POINTER) + { + field->pData = *(void**)field->pField; + } + else + { + field->pData = field->pField; + } + + if (PB_HTYPE(type) == PB_HTYPE_REPEATED) + { + count = *(pb_size_t*)field->pSize; + + if (PB_ATYPE(type) == PB_ATYPE_STATIC && count > field->array_size) + { + /* Protect against corrupted _count fields */ + count = field->array_size; + } + } + + if (field->pData) + { + for (; count > 0; count--) + { + pb_release(field->submsg_desc, field->pData); + field->pData = (char*)field->pData + field->data_size; + } + } + } + + if (PB_ATYPE(type) == PB_ATYPE_POINTER) + { + if (PB_HTYPE(type) == PB_HTYPE_REPEATED && + (PB_LTYPE(type) == PB_LTYPE_STRING || + PB_LTYPE(type) == PB_LTYPE_BYTES)) + { + /* Release entries in repeated string or bytes array */ + void **pItem = *(void***)field->pField; + pb_size_t count = *(pb_size_t*)field->pSize; + for (; count > 0; count--) + { + pb_free(*pItem); + *pItem++ = NULL; + } + } + + if (PB_HTYPE(type) == PB_HTYPE_REPEATED) + { + /* We are going to release the array, so set the size to 0 */ + *(pb_size_t*)field->pSize = 0; + } + + /* Release main pointer */ + pb_free(*(void**)field->pField); + *(void**)field->pField = NULL; + } +} + +void pb_release(const pb_msgdesc_t *fields, void *dest_struct) +{ + pb_field_iter_t iter; + + if (!dest_struct) + return; /* Ignore NULL pointers, similar to free() */ + + if (!pb_field_iter_begin(&iter, fields, dest_struct)) + return; /* Empty message type */ + + do + { + pb_release_single_field(&iter); + } while (pb_field_iter_next(&iter)); +} +#else +void pb_release(const pb_msgdesc_t *fields, void *dest_struct) +{ + /* Nothing to release without PB_ENABLE_MALLOC. 
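+ * (The caller is then expected to allocate fresh storage for the new
+ * union member before decoding into it.)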
*/ + PB_UNUSED(fields); + PB_UNUSED(dest_struct); +} +#endif + +/* Field decoders */ + +bool pb_decode_bool(pb_istream_t *stream, bool *dest) +{ + uint32_t value; + if (!pb_decode_varint32(stream, &value)) + return false; + + *(bool*)dest = (value != 0); + return true; +} + +bool pb_decode_svarint(pb_istream_t *stream, pb_int64_t *dest) +{ + pb_uint64_t value; + if (!pb_decode_varint(stream, &value)) + return false; + + if (value & 1) + *dest = (pb_int64_t)(~(value >> 1)); + else + *dest = (pb_int64_t)(value >> 1); + + return true; +} + +bool pb_decode_fixed32(pb_istream_t *stream, void *dest) +{ + union { + uint32_t fixed32; + pb_byte_t bytes[4]; + } u; + + if (!pb_read(stream, u.bytes, 4)) + return false; + +#if defined(PB_LITTLE_ENDIAN_8BIT) && PB_LITTLE_ENDIAN_8BIT == 1 + /* fast path - if we know that we're on little endian, assign directly */ + *(uint32_t*)dest = u.fixed32; +#else + *(uint32_t*)dest = ((uint32_t)u.bytes[0] << 0) | + ((uint32_t)u.bytes[1] << 8) | + ((uint32_t)u.bytes[2] << 16) | + ((uint32_t)u.bytes[3] << 24); +#endif + return true; +} + +#ifndef PB_WITHOUT_64BIT +bool pb_decode_fixed64(pb_istream_t *stream, void *dest) +{ + union { + uint64_t fixed64; + pb_byte_t bytes[8]; + } u; + + if (!pb_read(stream, u.bytes, 8)) + return false; + +#if defined(PB_LITTLE_ENDIAN_8BIT) && PB_LITTLE_ENDIAN_8BIT == 1 + /* fast path - if we know that we're on little endian, assign directly */ + *(uint64_t*)dest = u.fixed64; +#else + *(uint64_t*)dest = ((uint64_t)u.bytes[0] << 0) | + ((uint64_t)u.bytes[1] << 8) | + ((uint64_t)u.bytes[2] << 16) | + ((uint64_t)u.bytes[3] << 24) | + ((uint64_t)u.bytes[4] << 32) | + ((uint64_t)u.bytes[5] << 40) | + ((uint64_t)u.bytes[6] << 48) | + ((uint64_t)u.bytes[7] << 56); +#endif + return true; +} +#endif + +static bool checkreturn pb_dec_bool(pb_istream_t *stream, const pb_field_iter_t *field) +{ + return pb_decode_bool(stream, (bool*)field->pData); +} + +static bool checkreturn pb_dec_varint(pb_istream_t *stream, const pb_field_iter_t *field) +{ + if (PB_LTYPE(field->type) == PB_LTYPE_UVARINT) + { + pb_uint64_t value, clamped; + if (!pb_decode_varint(stream, &value)) + return false; + + /* Cast to the proper field size, while checking for overflows */ + if (field->data_size == sizeof(pb_uint64_t)) + clamped = *(pb_uint64_t*)field->pData = value; + else if (field->data_size == sizeof(uint32_t)) + clamped = *(uint32_t*)field->pData = (uint32_t)value; + else if (field->data_size == sizeof(uint_least16_t)) + clamped = *(uint_least16_t*)field->pData = (uint_least16_t)value; + else if (field->data_size == sizeof(uint_least8_t)) + clamped = *(uint_least8_t*)field->pData = (uint_least8_t)value; + else + PB_RETURN_ERROR(stream, "invalid data_size"); + + if (clamped != value) + PB_RETURN_ERROR(stream, "integer too large"); + + return true; + } + else + { + pb_uint64_t value; + pb_int64_t svalue; + pb_int64_t clamped; + + if (PB_LTYPE(field->type) == PB_LTYPE_SVARINT) + { + if (!pb_decode_svarint(stream, &svalue)) + return false; + } + else + { + if (!pb_decode_varint(stream, &value)) + return false; + + /* See issue 97: Google's C++ protobuf allows negative varint values to + * be cast as int32_t, instead of the int64_t that should be used when + * encoding. Nanopb versions before 0.2.5 had a bug in encoding. In order to + * not break decoding of such messages, we cast <=32 bit fields to + * int32_t first to get the sign correct. 
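+ * For example, -1 stored in an int32 field may arrive as the five-byte
+ * varint FF FF FF FF 0F, which decodes to value = 0xFFFFFFFF; casting
+ * through int32_t recovers -1 instead of the positive 4294967295.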
+ */ + if (field->data_size == sizeof(pb_int64_t)) + svalue = (pb_int64_t)value; + else + svalue = (int32_t)value; + } + + /* Cast to the proper field size, while checking for overflows */ + if (field->data_size == sizeof(pb_int64_t)) + clamped = *(pb_int64_t*)field->pData = svalue; + else if (field->data_size == sizeof(int32_t)) + clamped = *(int32_t*)field->pData = (int32_t)svalue; + else if (field->data_size == sizeof(int_least16_t)) + clamped = *(int_least16_t*)field->pData = (int_least16_t)svalue; + else if (field->data_size == sizeof(int_least8_t)) + clamped = *(int_least8_t*)field->pData = (int_least8_t)svalue; + else + PB_RETURN_ERROR(stream, "invalid data_size"); + + if (clamped != svalue) + PB_RETURN_ERROR(stream, "integer too large"); + + return true; + } +} + +static bool checkreturn pb_dec_bytes(pb_istream_t *stream, const pb_field_iter_t *field) +{ + uint32_t size; + size_t alloc_size; + pb_bytes_array_t *dest; + + if (!pb_decode_varint32(stream, &size)) + return false; + + if (size > PB_SIZE_MAX) + PB_RETURN_ERROR(stream, "bytes overflow"); + + alloc_size = PB_BYTES_ARRAY_T_ALLOCSIZE(size); + if (size > alloc_size) + PB_RETURN_ERROR(stream, "size too large"); + + if (PB_ATYPE(field->type) == PB_ATYPE_POINTER) + { +#ifndef PB_ENABLE_MALLOC + PB_RETURN_ERROR(stream, "no malloc support"); +#else + if (stream->bytes_left < size) + PB_RETURN_ERROR(stream, "end-of-stream"); + + if (!allocate_field(stream, field->pData, alloc_size, 1)) + return false; + dest = *(pb_bytes_array_t**)field->pData; +#endif + } + else + { + if (alloc_size > field->data_size) + PB_RETURN_ERROR(stream, "bytes overflow"); + dest = (pb_bytes_array_t*)field->pData; + } + + dest->size = (pb_size_t)size; + return pb_read(stream, dest->bytes, (size_t)size); +} + +static bool checkreturn pb_dec_string(pb_istream_t *stream, const pb_field_iter_t *field) +{ + uint32_t size; + size_t alloc_size; + pb_byte_t *dest = (pb_byte_t*)field->pData; + + if (!pb_decode_varint32(stream, &size)) + return false; + + if (size == (uint32_t)-1) + PB_RETURN_ERROR(stream, "size too large"); + + /* Space for null terminator */ + alloc_size = (size_t)(size + 1); + + if (alloc_size < size) + PB_RETURN_ERROR(stream, "size too large"); + + if (PB_ATYPE(field->type) == PB_ATYPE_POINTER) + { +#ifndef PB_ENABLE_MALLOC + PB_RETURN_ERROR(stream, "no malloc support"); +#else + if (stream->bytes_left < size) + PB_RETURN_ERROR(stream, "end-of-stream"); + + if (!allocate_field(stream, field->pData, alloc_size, 1)) + return false; + dest = *(pb_byte_t**)field->pData; +#endif + } + else + { + if (alloc_size > field->data_size) + PB_RETURN_ERROR(stream, "string overflow"); + } + + dest[size] = 0; + + if (!pb_read(stream, dest, (size_t)size)) + return false; + +#ifdef PB_VALIDATE_UTF8 + if (!pb_validate_utf8((const char*)dest)) + PB_RETURN_ERROR(stream, "invalid utf8"); +#endif + + return true; +} + +static bool checkreturn pb_dec_submessage(pb_istream_t *stream, const pb_field_iter_t *field) +{ + bool status = true; + bool submsg_consumed = false; + pb_istream_t substream; + + if (!pb_make_string_substream(stream, &substream)) + return false; + + if (field->submsg_desc == NULL) + PB_RETURN_ERROR(stream, "invalid field descriptor"); + + /* Submessages can have a separate message-level callback that is called + * before decoding the message. Typically it is used to set callback fields + * inside oneofs. */ + if (PB_LTYPE(field->type) == PB_LTYPE_SUBMSG_W_CB && field->pSize != NULL) + { + /* Message callback is stored right before pSize. 
*/ + pb_callback_t *callback = (pb_callback_t*)field->pSize - 1; + if (callback->funcs.decode) + { + status = callback->funcs.decode(&substream, field, &callback->arg); + + if (substream.bytes_left == 0) + { + submsg_consumed = true; + } + } + } + + /* Now decode the submessage contents */ + if (status && !submsg_consumed) + { + unsigned int flags = 0; + + /* Static required/optional fields are already initialized by top-level + * pb_decode(), no need to initialize them again. */ + if (PB_ATYPE(field->type) == PB_ATYPE_STATIC && + PB_HTYPE(field->type) != PB_HTYPE_REPEATED) + { + flags = PB_DECODE_NOINIT; + } + + status = pb_decode_inner(&substream, field->submsg_desc, field->pData, flags); + } + + if (!pb_close_string_substream(stream, &substream)) + return false; + + return status; +} + +static bool checkreturn pb_dec_fixed_length_bytes(pb_istream_t *stream, const pb_field_iter_t *field) +{ + uint32_t size; + + if (!pb_decode_varint32(stream, &size)) + return false; + + if (size > PB_SIZE_MAX) + PB_RETURN_ERROR(stream, "bytes overflow"); + + if (size == 0) + { + /* As a special case, treat empty bytes string as all zeros for fixed_length_bytes. */ + memset(field->pData, 0, (size_t)field->data_size); + return true; + } + + if (size != field->data_size) + PB_RETURN_ERROR(stream, "incorrect fixed length bytes size"); + + return pb_read(stream, (pb_byte_t*)field->pData, (size_t)field->data_size); +} + +#ifdef PB_CONVERT_DOUBLE_FLOAT +bool pb_decode_double_as_float(pb_istream_t *stream, float *dest) +{ + uint_least8_t sign; + int exponent; + uint32_t mantissa; + uint64_t value; + union { float f; uint32_t i; } out; + + if (!pb_decode_fixed64(stream, &value)) + return false; + + /* Decompose input value */ + sign = (uint_least8_t)((value >> 63) & 1); + exponent = (int)((value >> 52) & 0x7FF) - 1023; + mantissa = (value >> 28) & 0xFFFFFF; /* Highest 24 bits */ + + /* Figure if value is in range representable by floats. */ + if (exponent == 1024) + { + /* Special value */ + exponent = 128; + mantissa >>= 1; + } + else + { + if (exponent > 127) + { + /* Too large, convert to infinity */ + exponent = 128; + mantissa = 0; + } + else if (exponent < -150) + { + /* Too small, convert to zero */ + exponent = -127; + mantissa = 0; + } + else if (exponent < -126) + { + /* Denormalized */ + mantissa |= 0x1000000; + mantissa >>= (-126 - exponent); + exponent = -127; + } + + /* Round off mantissa */ + mantissa = (mantissa + 1) >> 1; + + /* Check if mantissa went over 2.0 */ + if (mantissa & 0x800000) + { + exponent += 1; + mantissa &= 0x7FFFFF; + mantissa >>= 1; + } + } + + /* Combine fields */ + out.i = mantissa; + out.i |= (uint32_t)(exponent + 127) << 23; + out.i |= (uint32_t)sign << 31; + + *dest = out.f; + return true; +} +#endif diff --git a/src/third_party/librdkafka/dist/src/nanopb/pb_decode.h b/src/third_party/librdkafka/dist/src/nanopb/pb_decode.h new file mode 100644 index 00000000000..02f11653a2b --- /dev/null +++ b/src/third_party/librdkafka/dist/src/nanopb/pb_decode.h @@ -0,0 +1,193 @@ +/* pb_decode.h: Functions to decode protocol buffers. Depends on pb_decode.c. + * The main function is pb_decode. You also need an input stream, and the + * field descriptions created by nanopb_generator.py. + */ + +#ifndef PB_DECODE_H_INCLUDED +#define PB_DECODE_H_INCLUDED + +#include "nanopb/pb.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Structure for defining custom input streams. 
You will need to provide
+ * a callback function to read the bytes from your storage, which can be
+ * for example a file or a network socket.
+ *
+ * The callback must conform to these rules:
+ *
+ * 1) Return false on IO errors. This will cause decoding to abort.
+ * 2) You can use state to store your own data (e.g. a buffer pointer),
+ *    and rely on pb_read to verify that nobody reads past bytes_left.
+ * 3) Your callback may be used with substreams, in which case bytes_left
+ *    is different from that of the main stream. Don't use bytes_left to
+ *    compute any pointers.
+ */
+struct pb_istream_s
+{
+#ifdef PB_BUFFER_ONLY
+    /* Callback pointer is not used in buffer-only configuration.
+     * Having an int pointer here allows binary compatibility but
+     * gives an error if someone tries to assign a callback function.
+     */
+    int *callback;
+#else
+    bool (*callback)(pb_istream_t *stream, pb_byte_t *buf, size_t count);
+#endif
+
+    void *state; /* Free field for use by callback implementation */
+    size_t bytes_left;
+
+#ifndef PB_NO_ERRMSG
+    const char *errmsg;
+#endif
+};
+
+#ifndef PB_NO_ERRMSG
+#define PB_ISTREAM_EMPTY {0,0,0,0}
+#else
+#define PB_ISTREAM_EMPTY {0,0,0}
+#endif
+
+/***************************
+ * Main decoding functions *
+ ***************************/
+
+/* Decode a single protocol buffers message from an input stream into a C structure.
+ * Returns true on success, false on any failure.
+ * The actual struct pointed to by dest must match the description in fields.
+ * Callback fields of the destination structure must be initialized by the caller.
+ * All other fields will be initialized by this function.
+ *
+ * Example usage:
+ *    MyMessage msg = {};
+ *    uint8_t buffer[64];
+ *    size_t count;
+ *    pb_istream_t stream;
+ *
+ *    // ... read count bytes of data into buffer ...
+ *
+ *    stream = pb_istream_from_buffer(buffer, count);
+ *    pb_decode(&stream, MyMessage_fields, &msg);
+ */
+bool pb_decode(pb_istream_t *stream, const pb_msgdesc_t *fields, void *dest_struct);
+
+/* Extended version of pb_decode, with several options to control
+ * the decoding process:
+ *
+ * PB_DECODE_NOINIT:         Do not initialize the fields to default values.
+ *                           This is slightly faster if you do not need the default
+ *                           values and instead initialize the structure to 0 using
+ *                           e.g. memset(). This can also be used for merging two
+ *                           messages, i.e. combining already existing data with new
+ *                           values.
+ *
+ * PB_DECODE_DELIMITED:      Input message starts with the message size as varint.
+ *                           Corresponds to parseDelimitedFrom() in Google's
+ *                           protobuf API.
+ *
+ * PB_DECODE_NULLTERMINATED: Stop reading when the field tag is read as 0. This
+ *                           allows reading null-terminated messages.
+ *                           NOTE: Until nanopb-0.4.0, pb_decode() also allowed
+ *                           null-termination. This behaviour is not supported in
+ *                           most other protobuf implementations, so PB_DECODE_DELIMITED
+ *                           is a better option for compatibility.
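+ *
+ * Example (sketch; MyMessage, buf and count are placeholder names from a
+ * nanopb-generated API):
+ *
+ *    MyMessage msg = MyMessage_init_zero;
+ *    pb_istream_t stream = pb_istream_from_buffer(buf, count);
+ *    if (!pb_decode_ex(&stream, MyMessage_fields, &msg, PB_DECODE_DELIMITED))
+ *        printf("Decoding failed: %s\n", PB_GET_ERROR(&stream));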
+ *
+ * Multiple flags can be combined with bitwise OR (the | operator).
+ */
+#define PB_DECODE_NOINIT          0x01U
+#define PB_DECODE_DELIMITED       0x02U
+#define PB_DECODE_NULLTERMINATED  0x04U
+bool pb_decode_ex(pb_istream_t *stream, const pb_msgdesc_t *fields, void *dest_struct, unsigned int flags);
+
+/* Defines for backwards compatibility with code written before nanopb-0.4.0 */
+#define pb_decode_noinit(s,f,d) pb_decode_ex(s,f,d, PB_DECODE_NOINIT)
+#define pb_decode_delimited(s,f,d) pb_decode_ex(s,f,d, PB_DECODE_DELIMITED)
+#define pb_decode_delimited_noinit(s,f,d) pb_decode_ex(s,f,d, PB_DECODE_DELIMITED | PB_DECODE_NOINIT)
+#define pb_decode_nullterminated(s,f,d) pb_decode_ex(s,f,d, PB_DECODE_NULLTERMINATED)
+
+/* Release any allocated pointer fields. If you use dynamic allocation, you should
+ * call this for any successfully decoded message when you are done with it. If
+ * pb_decode() returns with an error, the message is already released.
+ */
+void pb_release(const pb_msgdesc_t *fields, void *dest_struct);
+
+/**************************************
+ * Functions for manipulating streams *
+ **************************************/
+
+/* Create an input stream for reading from a memory buffer.
+ *
+ * msglen should be the actual length of the message, not the full size of
+ * the allocated buffer.
+ *
+ * Alternatively, you can use a custom stream that reads directly from e.g.
+ * a file or a network socket.
+ */
+pb_istream_t pb_istream_from_buffer(const pb_byte_t *buf, size_t msglen);
+
+/* Function to read from a pb_istream_t. You can use this if you need to
+ * read some custom header data, or to read data in field callbacks.
+ */
+bool pb_read(pb_istream_t *stream, pb_byte_t *buf, size_t count);
+
+
+/************************************************
+ * Helper functions for writing field callbacks *
+ ************************************************/
+
+/* Decode the tag for the next field in the stream. Gives the wire type and
+ * field tag. At the end of the message, returns false and sets eof to true. */
+bool pb_decode_tag(pb_istream_t *stream, pb_wire_type_t *wire_type, uint32_t *tag, bool *eof);
+
+/* Skip the field payload data, given the wire type. */
+bool pb_skip_field(pb_istream_t *stream, pb_wire_type_t wire_type);
+
+/* Decode an integer in the varint format. This works for enum, int32,
+ * int64, uint32 and uint64 field types. */
+#ifndef PB_WITHOUT_64BIT
+bool pb_decode_varint(pb_istream_t *stream, uint64_t *dest);
+#else
+#define pb_decode_varint pb_decode_varint32
+#endif
+
+/* Decode an integer in the varint format. This works for enum, int32,
+ * and uint32 field types. */
+bool pb_decode_varint32(pb_istream_t *stream, uint32_t *dest);
+
+/* Decode a bool value in varint format. */
+bool pb_decode_bool(pb_istream_t *stream, bool *dest);
+
+/* Decode an integer in the zig-zagged svarint format. This works for sint32
+ * and sint64. */
+#ifndef PB_WITHOUT_64BIT
+bool pb_decode_svarint(pb_istream_t *stream, int64_t *dest);
+#else
+bool pb_decode_svarint(pb_istream_t *stream, int32_t *dest);
+#endif
+
+/* Decode a fixed32, sfixed32 or float value. You need to pass a pointer to
+ * a 4-byte wide C variable. */
+bool pb_decode_fixed32(pb_istream_t *stream, void *dest);
+
+#ifndef PB_WITHOUT_64BIT
+/* Decode a fixed64, sfixed64 or double value. You need to pass a pointer to
+ * an 8-byte wide C variable. */
+bool pb_decode_fixed64(pb_istream_t *stream, void *dest);
+#endif
+
+#ifdef PB_CONVERT_DOUBLE_FLOAT
+/* Decode a double value into a float variable.
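+ * The conversion is lossy: the value is rounded to the nearest float;
+ * values above the float range become infinity, and values below the
+ * smallest subnormal become zero.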
*/ +bool pb_decode_double_as_float(pb_istream_t *stream, float *dest); +#endif + +/* Make a limited-length substream for reading a PB_WT_STRING field. */ +bool pb_make_string_substream(pb_istream_t *stream, pb_istream_t *substream); +bool pb_close_string_substream(pb_istream_t *stream, pb_istream_t *substream); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/src/third_party/librdkafka/dist/src/nanopb/pb_encode.c b/src/third_party/librdkafka/dist/src/nanopb/pb_encode.c new file mode 100644 index 00000000000..d85e03185a0 --- /dev/null +++ b/src/third_party/librdkafka/dist/src/nanopb/pb_encode.c @@ -0,0 +1,1000 @@ +/* pb_encode.c -- encode a protobuf using minimal resources + * + * 2011 Petteri Aimonen + */ + +#include "nanopb/pb.h" +#include "nanopb/pb_encode.h" +#include "nanopb/pb_common.h" + +/* Use the GCC warn_unused_result attribute to check that all return values + * are propagated correctly. On other compilers and gcc before 3.4.0 just + * ignore the annotation. + */ +#if !defined(__GNUC__) || ( __GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 4) + #define checkreturn +#else + #define checkreturn __attribute__((warn_unused_result)) +#endif + +/************************************** + * Declarations internal to this file * + **************************************/ +static bool checkreturn buf_write(pb_ostream_t *stream, const pb_byte_t *buf, size_t count); +static bool checkreturn encode_array(pb_ostream_t *stream, pb_field_iter_t *field); +static bool checkreturn pb_check_proto3_default_value(const pb_field_iter_t *field); +static bool checkreturn encode_basic_field(pb_ostream_t *stream, const pb_field_iter_t *field); +static bool checkreturn encode_callback_field(pb_ostream_t *stream, const pb_field_iter_t *field); +static bool checkreturn encode_field(pb_ostream_t *stream, pb_field_iter_t *field); +static bool checkreturn encode_extension_field(pb_ostream_t *stream, const pb_field_iter_t *field); +static bool checkreturn default_extension_encoder(pb_ostream_t *stream, const pb_extension_t *extension); +static bool checkreturn pb_encode_varint_32(pb_ostream_t *stream, uint32_t low, uint32_t high); +static bool checkreturn pb_enc_bool(pb_ostream_t *stream, const pb_field_iter_t *field); +static bool checkreturn pb_enc_varint(pb_ostream_t *stream, const pb_field_iter_t *field); +static bool checkreturn pb_enc_fixed(pb_ostream_t *stream, const pb_field_iter_t *field); +static bool checkreturn pb_enc_bytes(pb_ostream_t *stream, const pb_field_iter_t *field); +static bool checkreturn pb_enc_string(pb_ostream_t *stream, const pb_field_iter_t *field); +static bool checkreturn pb_enc_submessage(pb_ostream_t *stream, const pb_field_iter_t *field); +static bool checkreturn pb_enc_fixed_length_bytes(pb_ostream_t *stream, const pb_field_iter_t *field); + +#ifdef PB_WITHOUT_64BIT +#define pb_int64_t int32_t +#define pb_uint64_t uint32_t +#else +#define pb_int64_t int64_t +#define pb_uint64_t uint64_t +#endif + +/******************************* + * pb_ostream_t implementation * + *******************************/ + +static bool checkreturn buf_write(pb_ostream_t *stream, const pb_byte_t *buf, size_t count) +{ + pb_byte_t *dest = (pb_byte_t*)stream->state; + stream->state = dest + count; + + memcpy(dest, buf, count * sizeof(pb_byte_t)); + + return true; +} + +pb_ostream_t pb_ostream_from_buffer(pb_byte_t *buf, size_t bufsize) +{ + pb_ostream_t stream; +#ifdef PB_BUFFER_ONLY + /* In PB_BUFFER_ONLY configuration the callback pointer is just int*. 
+ * NULL pointer marks a sizing field, so put a non-NULL value to mark a buffer stream.
+ */
+    static const int marker = 0;
+    stream.callback = &marker;
+#else
+    stream.callback = &buf_write;
+#endif
+    stream.state = buf;
+    stream.max_size = bufsize;
+    stream.bytes_written = 0;
+#ifndef PB_NO_ERRMSG
+    stream.errmsg = NULL;
+#endif
+    return stream;
+}
+
+bool checkreturn pb_write(pb_ostream_t *stream, const pb_byte_t *buf, size_t count)
+{
+    if (count > 0 && stream->callback != NULL)
+    {
+        if (stream->bytes_written + count < stream->bytes_written ||
+            stream->bytes_written + count > stream->max_size)
+        {
+            PB_RETURN_ERROR(stream, "stream full");
+        }
+
+#ifdef PB_BUFFER_ONLY
+        if (!buf_write(stream, buf, count))
+            PB_RETURN_ERROR(stream, "io error");
+#else
+        if (!stream->callback(stream, buf, count))
+            PB_RETURN_ERROR(stream, "io error");
+#endif
+    }
+
+    stream->bytes_written += count;
+    return true;
+}
+
+/*************************
+ * Encode a single field *
+ *************************/
+
+/* Read a bool value without causing undefined behavior even if the value
+ * is invalid. See issue #434 and
+ * https://stackoverflow.com/questions/27661768/weird-results-for-conditional
+ */
+static bool safe_read_bool(const void *pSize)
+{
+    const char *p = (const char *)pSize;
+    size_t i;
+    for (i = 0; i < sizeof(bool); i++)
+    {
+        if (p[i] != 0)
+            return true;
+    }
+    return false;
+}
+
+/* Encode a static array. Handles the size calculations and possible packing. */
+static bool checkreturn encode_array(pb_ostream_t *stream, pb_field_iter_t *field)
+{
+    pb_size_t i;
+    pb_size_t count;
+#ifndef PB_ENCODE_ARRAYS_UNPACKED
+    size_t size;
+#endif
+
+    count = *(pb_size_t*)field->pSize;
+
+    if (count == 0)
+        return true;
+
+    if (PB_ATYPE(field->type) != PB_ATYPE_POINTER && count > field->array_size)
+        PB_RETURN_ERROR(stream, "array max size exceeded");
+
+#ifndef PB_ENCODE_ARRAYS_UNPACKED
+    /* We always pack arrays if the datatype allows it. */
+    if (PB_LTYPE(field->type) <= PB_LTYPE_LAST_PACKABLE)
+    {
+        if (!pb_encode_tag(stream, PB_WT_STRING, field->tag))
+            return false;
+
+        /* Determine the total size of packed array. */
+        if (PB_LTYPE(field->type) == PB_LTYPE_FIXED32)
+        {
+            size = 4 * (size_t)count;
+        }
+        else if (PB_LTYPE(field->type) == PB_LTYPE_FIXED64)
+        {
+            size = 8 * (size_t)count;
+        }
+        else
+        {
+            pb_ostream_t sizestream = PB_OSTREAM_SIZING;
+            void *pData_orig = field->pData;
+            for (i = 0; i < count; i++)
+            {
+                if (!pb_enc_varint(&sizestream, field))
+                    PB_RETURN_ERROR(stream, PB_GET_ERROR(&sizestream));
+                field->pData = (char*)field->pData + field->data_size;
+            }
+            field->pData = pData_orig;
+            size = sizestream.bytes_written;
+        }
+
+        if (!pb_encode_varint(stream, (pb_uint64_t)size))
+            return false;
+
+        if (stream->callback == NULL)
+            return pb_write(stream, NULL, size); /* Just sizing.. */
+
+        /* Write the data */
+        for (i = 0; i < count; i++)
+        {
+            if (PB_LTYPE(field->type) == PB_LTYPE_FIXED32 || PB_LTYPE(field->type) == PB_LTYPE_FIXED64)
+            {
+                if (!pb_enc_fixed(stream, field))
+                    return false;
+            }
+            else
+            {
+                if (!pb_enc_varint(stream, field))
+                    return false;
+            }
+
+            field->pData = (char*)field->pData + field->data_size;
+        }
+    }
+    else /* Unpacked fields */
+#endif
+    {
+        for (i = 0; i < count; i++)
+        {
+            /* Normally the data is stored directly in the array entries, but
+             * for pointer-type string and bytes fields, the array entries are
+             * actually pointers themselves also. So we have to dereference once
+             * more to get to the actual data.
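+             * (A repeated pointer string field is generated as char **entries,
+             * so field->pData first points at a char* entry, which must be
+             * followed once more to reach the actual text.)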
*/ + if (PB_ATYPE(field->type) == PB_ATYPE_POINTER && + (PB_LTYPE(field->type) == PB_LTYPE_STRING || + PB_LTYPE(field->type) == PB_LTYPE_BYTES)) + { + bool status; + void *pData_orig = field->pData; + field->pData = *(void* const*)field->pData; + + if (!field->pData) + { + /* Null pointer in array is treated as empty string / bytes */ + status = pb_encode_tag_for_field(stream, field) && + pb_encode_varint(stream, 0); + } + else + { + status = encode_basic_field(stream, field); + } + + field->pData = pData_orig; + + if (!status) + return false; + } + else + { + if (!encode_basic_field(stream, field)) + return false; + } + field->pData = (char*)field->pData + field->data_size; + } + } + + return true; +} + +/* In proto3, all fields are optional and are only encoded if their value is "non-zero". + * This function implements the check for the zero value. */ +static bool checkreturn pb_check_proto3_default_value(const pb_field_iter_t *field) +{ + pb_type_t type = field->type; + + if (PB_ATYPE(type) == PB_ATYPE_STATIC) + { + if (PB_HTYPE(type) == PB_HTYPE_REQUIRED) + { + /* Required proto2 fields inside proto3 submessage, pretty rare case */ + return false; + } + else if (PB_HTYPE(type) == PB_HTYPE_REPEATED) + { + /* Repeated fields inside proto3 submessage: present if count != 0 */ + return *(const pb_size_t*)field->pSize == 0; + } + else if (PB_HTYPE(type) == PB_HTYPE_ONEOF) + { + /* Oneof fields */ + return *(const pb_size_t*)field->pSize == 0; + } + else if (PB_HTYPE(type) == PB_HTYPE_OPTIONAL && field->pSize != NULL) + { + /* Proto2 optional fields inside proto3 message, or proto3 + * submessage fields. */ + return safe_read_bool(field->pSize) == false; + } + else if (field->descriptor->default_value) + { + /* Proto3 messages do not have default values, but proto2 messages + * can contain optional fields without has_fields (generator option 'proto3'). + * In this case they must always be encoded, to make sure that the + * non-zero default value is overwritten. + */ + return false; + } + + /* Rest is proto3 singular fields */ + if (PB_LTYPE(type) <= PB_LTYPE_LAST_PACKABLE) + { + /* Simple integer / float fields */ + pb_size_t i; + const char *p = (const char*)field->pData; + for (i = 0; i < field->data_size; i++) + { + if (p[i] != 0) + { + return false; + } + } + + return true; + } + else if (PB_LTYPE(type) == PB_LTYPE_BYTES) + { + const pb_bytes_array_t *bytes = (const pb_bytes_array_t*)field->pData; + return bytes->size == 0; + } + else if (PB_LTYPE(type) == PB_LTYPE_STRING) + { + return *(const char*)field->pData == '\0'; + } + else if (PB_LTYPE(type) == PB_LTYPE_FIXED_LENGTH_BYTES) + { + /* Fixed length bytes is only empty if its length is fixed + * as 0. Which would be pretty strange, but we can check + * it anyway. */ + return field->data_size == 0; + } + else if (PB_LTYPE_IS_SUBMSG(type)) + { + /* Check all fields in the submessage to find if any of them + * are non-zero. The comparison cannot be done byte-per-byte + * because the C struct may contain padding bytes that must + * be skipped. Note that usually proto3 submessages have + * a separate has_field that is checked earlier in this if. 
+ */ + pb_field_iter_t iter; + if (pb_field_iter_begin(&iter, field->submsg_desc, field->pData)) + { + do + { + if (!pb_check_proto3_default_value(&iter)) + { + return false; + } + } while (pb_field_iter_next(&iter)); + } + return true; + } + } + else if (PB_ATYPE(type) == PB_ATYPE_POINTER) + { + return field->pData == NULL; + } + else if (PB_ATYPE(type) == PB_ATYPE_CALLBACK) + { + if (PB_LTYPE(type) == PB_LTYPE_EXTENSION) + { + const pb_extension_t *extension = *(const pb_extension_t* const *)field->pData; + return extension == NULL; + } + else if (field->descriptor->field_callback == pb_default_field_callback) + { + pb_callback_t *pCallback = (pb_callback_t*)field->pData; + return pCallback->funcs.encode == NULL; + } + else + { + return field->descriptor->field_callback == NULL; + } + } + + return false; /* Not typically reached, safe default for weird special cases. */ +} + +/* Encode a field with static or pointer allocation, i.e. one whose data + * is available to the encoder directly. */ +static bool checkreturn encode_basic_field(pb_ostream_t *stream, const pb_field_iter_t *field) +{ + if (!field->pData) + { + /* Missing pointer field */ + return true; + } + + if (!pb_encode_tag_for_field(stream, field)) + return false; + + switch (PB_LTYPE(field->type)) + { + case PB_LTYPE_BOOL: + return pb_enc_bool(stream, field); + + case PB_LTYPE_VARINT: + case PB_LTYPE_UVARINT: + case PB_LTYPE_SVARINT: + return pb_enc_varint(stream, field); + + case PB_LTYPE_FIXED32: + case PB_LTYPE_FIXED64: + return pb_enc_fixed(stream, field); + + case PB_LTYPE_BYTES: + return pb_enc_bytes(stream, field); + + case PB_LTYPE_STRING: + return pb_enc_string(stream, field); + + case PB_LTYPE_SUBMESSAGE: + case PB_LTYPE_SUBMSG_W_CB: + return pb_enc_submessage(stream, field); + + case PB_LTYPE_FIXED_LENGTH_BYTES: + return pb_enc_fixed_length_bytes(stream, field); + + default: + PB_RETURN_ERROR(stream, "invalid field type"); + } +} + +/* Encode a field with callback semantics. This means that a user function is + * called to provide and encode the actual data. */ +static bool checkreturn encode_callback_field(pb_ostream_t *stream, const pb_field_iter_t *field) +{ + if (field->descriptor->field_callback != NULL) + { + if (!field->descriptor->field_callback(NULL, stream, field)) + PB_RETURN_ERROR(stream, "callback error"); + } + return true; +} + +/* Encode a single field of any callback, pointer or static type. 
*/ +static bool checkreturn encode_field(pb_ostream_t *stream, pb_field_iter_t *field) +{ + /* Check field presence */ + if (PB_HTYPE(field->type) == PB_HTYPE_ONEOF) + { + if (*(const pb_size_t*)field->pSize != field->tag) + { + /* Different type oneof field */ + return true; + } + } + else if (PB_HTYPE(field->type) == PB_HTYPE_OPTIONAL) + { + if (field->pSize) + { + if (safe_read_bool(field->pSize) == false) + { + /* Missing optional field */ + return true; + } + } + else if (PB_ATYPE(field->type) == PB_ATYPE_STATIC) + { + /* Proto3 singular field */ + if (pb_check_proto3_default_value(field)) + return true; + } + } + + if (!field->pData) + { + if (PB_HTYPE(field->type) == PB_HTYPE_REQUIRED) + PB_RETURN_ERROR(stream, "missing required field"); + + /* Pointer field set to NULL */ + return true; + } + + /* Then encode field contents */ + if (PB_ATYPE(field->type) == PB_ATYPE_CALLBACK) + { + return encode_callback_field(stream, field); + } + else if (PB_HTYPE(field->type) == PB_HTYPE_REPEATED) + { + return encode_array(stream, field); + } + else + { + return encode_basic_field(stream, field); + } +} + +/* Default handler for extension fields. Expects to have a pb_msgdesc_t + * pointer in the extension->type->arg field, pointing to a message with + * only one field in it. */ +static bool checkreturn default_extension_encoder(pb_ostream_t *stream, const pb_extension_t *extension) +{ + pb_field_iter_t iter; + + if (!pb_field_iter_begin_extension_const(&iter, extension)) + PB_RETURN_ERROR(stream, "invalid extension"); + + return encode_field(stream, &iter); +} + + +/* Walk through all the registered extensions and give them a chance + * to encode themselves. */ +static bool checkreturn encode_extension_field(pb_ostream_t *stream, const pb_field_iter_t *field) +{ + const pb_extension_t *extension = *(const pb_extension_t* const *)field->pData; + + while (extension) + { + bool status; + if (extension->type->encode) + status = extension->type->encode(stream, extension); + else + status = default_extension_encoder(stream, extension); + + if (!status) + return false; + + extension = extension->next; + } + + return true; +} + +/********************* + * Encode all fields * + *********************/ + +bool checkreturn pb_encode(pb_ostream_t *stream, const pb_msgdesc_t *fields, const void *src_struct) +{ + pb_field_iter_t iter; + if (!pb_field_iter_begin_const(&iter, fields, src_struct)) + return true; /* Empty message type */ + + do { + if (PB_LTYPE(iter.type) == PB_LTYPE_EXTENSION) + { + /* Special case for the extension field placeholder */ + if (!encode_extension_field(stream, &iter)) + return false; + } + else + { + /* Regular field */ + if (!encode_field(stream, &iter)) + return false; + } + } while (pb_field_iter_next(&iter)); + + return true; +} + +bool checkreturn pb_encode_ex(pb_ostream_t *stream, const pb_msgdesc_t *fields, const void *src_struct, unsigned int flags) +{ + if ((flags & PB_ENCODE_DELIMITED) != 0) + { + return pb_encode_submessage(stream, fields, src_struct); + } + else if ((flags & PB_ENCODE_NULLTERMINATED) != 0) + { + const pb_byte_t zero = 0; + + if (!pb_encode(stream, fields, src_struct)) + return false; + + return pb_write(stream, &zero, 1); + } + else + { + return pb_encode(stream, fields, src_struct); + } +} + +bool pb_get_encoded_size(size_t *size, const pb_msgdesc_t *fields, const void *src_struct) +{ + pb_ostream_t stream = PB_OSTREAM_SIZING; + + if (!pb_encode(&stream, fields, src_struct)) + return false; + + *size = stream.bytes_written; + return true; +} + 
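+/* Illustrative sketch (not part of upstream nanopb): the typical two-pass
+ * pattern using the functions above -- measure with pb_get_encoded_size(),
+ * then encode into a buffer. "ExampleMsg" and "ExampleMsg_fields" are
+ * hypothetical stand-ins for a generated message type and its descriptor:
+ *
+ *     ExampleMsg msg = ExampleMsg_init_zero;
+ *     pb_byte_t buf[64];
+ *     size_t size;
+ *
+ *     if (pb_get_encoded_size(&size, ExampleMsg_fields, &msg) &&
+ *         size <= sizeof(buf))
+ *     {
+ *         pb_ostream_t stream = pb_ostream_from_buffer(buf, sizeof(buf));
+ *         bool ok = pb_encode(&stream, ExampleMsg_fields, &msg);
+ *     }
+ *
+ * On success, stream.bytes_written equals the size computed by
+ * pb_get_encoded_size().
+ */
+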
+/********************
+ * Helper functions *
+ ********************/
+
+/* This function avoids 64-bit shifts as they are quite slow on many platforms. */
+static bool checkreturn pb_encode_varint_32(pb_ostream_t *stream, uint32_t low, uint32_t high)
+{
+    size_t i = 0;
+    pb_byte_t buffer[10];
+    pb_byte_t byte = (pb_byte_t)(low & 0x7F);
+    low >>= 7;
+
+    while (i < 4 && (low != 0 || high != 0))
+    {
+        byte |= 0x80;
+        buffer[i++] = byte;
+        byte = (pb_byte_t)(low & 0x7F);
+        low >>= 7;
+    }
+
+    if (high)
+    {
+        byte = (pb_byte_t)(byte | ((high & 0x07) << 4));
+        high >>= 3;
+
+        while (high)
+        {
+            byte |= 0x80;
+            buffer[i++] = byte;
+            byte = (pb_byte_t)(high & 0x7F);
+            high >>= 7;
+        }
+    }
+
+    buffer[i++] = byte;
+
+    return pb_write(stream, buffer, i);
+}
+
+bool checkreturn pb_encode_varint(pb_ostream_t *stream, pb_uint64_t value)
+{
+    if (value <= 0x7F)
+    {
+        /* Fast path: single byte */
+        pb_byte_t byte = (pb_byte_t)value;
+        return pb_write(stream, &byte, 1);
+    }
+    else
+    {
+#ifdef PB_WITHOUT_64BIT
+        return pb_encode_varint_32(stream, value, 0);
+#else
+        return pb_encode_varint_32(stream, (uint32_t)value, (uint32_t)(value >> 32));
+#endif
+    }
+}
+
+bool checkreturn pb_encode_svarint(pb_ostream_t *stream, pb_int64_t value)
+{
+    pb_uint64_t zigzagged;
+    pb_uint64_t mask = ((pb_uint64_t)-1) >> 1; /* Satisfy clang -fsanitize=integer */
+    if (value < 0)
+        zigzagged = ~(((pb_uint64_t)value & mask) << 1);
+    else
+        zigzagged = (pb_uint64_t)value << 1;
+
+    return pb_encode_varint(stream, zigzagged);
+}
+
+bool checkreturn pb_encode_fixed32(pb_ostream_t *stream, const void *value)
+{
+#if defined(PB_LITTLE_ENDIAN_8BIT) && PB_LITTLE_ENDIAN_8BIT == 1
+    /* Fast path if we know that we're on little endian */
+    return pb_write(stream, (const pb_byte_t*)value, 4);
+#else
+    uint32_t val = *(const uint32_t*)value;
+    pb_byte_t bytes[4];
+    bytes[0] = (pb_byte_t)(val & 0xFF);
+    bytes[1] = (pb_byte_t)((val >> 8) & 0xFF);
+    bytes[2] = (pb_byte_t)((val >> 16) & 0xFF);
+    bytes[3] = (pb_byte_t)((val >> 24) & 0xFF);
+    return pb_write(stream, bytes, 4);
+#endif
+}
+
+#ifndef PB_WITHOUT_64BIT
+bool checkreturn pb_encode_fixed64(pb_ostream_t *stream, const void *value)
+{
+#if defined(PB_LITTLE_ENDIAN_8BIT) && PB_LITTLE_ENDIAN_8BIT == 1
+    /* Fast path if we know that we're on little endian */
+    return pb_write(stream, (const pb_byte_t*)value, 8);
+#else
+    uint64_t val = *(const uint64_t*)value;
+    pb_byte_t bytes[8];
+    bytes[0] = (pb_byte_t)(val & 0xFF);
+    bytes[1] = (pb_byte_t)((val >> 8) & 0xFF);
+    bytes[2] = (pb_byte_t)((val >> 16) & 0xFF);
+    bytes[3] = (pb_byte_t)((val >> 24) & 0xFF);
+    bytes[4] = (pb_byte_t)((val >> 32) & 0xFF);
+    bytes[5] = (pb_byte_t)((val >> 40) & 0xFF);
+    bytes[6] = (pb_byte_t)((val >> 48) & 0xFF);
+    bytes[7] = (pb_byte_t)((val >> 56) & 0xFF);
+    return pb_write(stream, bytes, 8);
+#endif
+}
+#endif
+
+bool checkreturn pb_encode_tag(pb_ostream_t *stream, pb_wire_type_t wiretype, uint32_t field_number)
+{
+    pb_uint64_t tag = ((pb_uint64_t)field_number << 3) | wiretype;
+    return pb_encode_varint(stream, tag);
+}
+
+bool pb_encode_tag_for_field(pb_ostream_t *stream, const pb_field_iter_t *field)
+{
+    pb_wire_type_t wiretype;
+    switch (PB_LTYPE(field->type))
+    {
+        case PB_LTYPE_BOOL:
+        case PB_LTYPE_VARINT:
+        case PB_LTYPE_UVARINT:
+        case PB_LTYPE_SVARINT:
+            wiretype = PB_WT_VARINT;
+            break;
+
+        case PB_LTYPE_FIXED32:
+            wiretype = PB_WT_32BIT;
+            break;
+
+        case PB_LTYPE_FIXED64:
+            wiretype = PB_WT_64BIT;
+            break;
+
+        case PB_LTYPE_BYTES:
+        case PB_LTYPE_STRING:
+        case PB_LTYPE_SUBMESSAGE:
+        case
PB_LTYPE_SUBMSG_W_CB: + case PB_LTYPE_FIXED_LENGTH_BYTES: + wiretype = PB_WT_STRING; + break; + + default: + PB_RETURN_ERROR(stream, "invalid field type"); + } + + return pb_encode_tag(stream, wiretype, field->tag); +} + +bool checkreturn pb_encode_string(pb_ostream_t *stream, const pb_byte_t *buffer, size_t size) +{ + if (!pb_encode_varint(stream, (pb_uint64_t)size)) + return false; + + return pb_write(stream, buffer, size); +} + +bool checkreturn pb_encode_submessage(pb_ostream_t *stream, const pb_msgdesc_t *fields, const void *src_struct) +{ + /* First calculate the message size using a non-writing substream. */ + pb_ostream_t substream = PB_OSTREAM_SIZING; + size_t size; + bool status; + + if (!pb_encode(&substream, fields, src_struct)) + { +#ifndef PB_NO_ERRMSG + stream->errmsg = substream.errmsg; +#endif + return false; + } + + size = substream.bytes_written; + + if (!pb_encode_varint(stream, (pb_uint64_t)size)) + return false; + + if (stream->callback == NULL) + return pb_write(stream, NULL, size); /* Just sizing */ + + if (stream->bytes_written + size > stream->max_size) + PB_RETURN_ERROR(stream, "stream full"); + + /* Use a substream to verify that a callback doesn't write more than + * what it did the first time. */ + substream.callback = stream->callback; + substream.state = stream->state; + substream.max_size = size; + substream.bytes_written = 0; +#ifndef PB_NO_ERRMSG + substream.errmsg = NULL; +#endif + + status = pb_encode(&substream, fields, src_struct); + + stream->bytes_written += substream.bytes_written; + stream->state = substream.state; +#ifndef PB_NO_ERRMSG + stream->errmsg = substream.errmsg; +#endif + + if (substream.bytes_written != size) + PB_RETURN_ERROR(stream, "submsg size changed"); + + return status; +} + +/* Field encoders */ + +static bool checkreturn pb_enc_bool(pb_ostream_t *stream, const pb_field_iter_t *field) +{ + uint32_t value = safe_read_bool(field->pData) ? 
1 : 0; + PB_UNUSED(field); + return pb_encode_varint(stream, value); +} + +static bool checkreturn pb_enc_varint(pb_ostream_t *stream, const pb_field_iter_t *field) +{ + if (PB_LTYPE(field->type) == PB_LTYPE_UVARINT) + { + /* Perform unsigned integer extension */ + pb_uint64_t value = 0; + + if (field->data_size == sizeof(uint_least8_t)) + value = *(const uint_least8_t*)field->pData; + else if (field->data_size == sizeof(uint_least16_t)) + value = *(const uint_least16_t*)field->pData; + else if (field->data_size == sizeof(uint32_t)) + value = *(const uint32_t*)field->pData; + else if (field->data_size == sizeof(pb_uint64_t)) + value = *(const pb_uint64_t*)field->pData; + else + PB_RETURN_ERROR(stream, "invalid data_size"); + + return pb_encode_varint(stream, value); + } + else + { + /* Perform signed integer extension */ + pb_int64_t value = 0; + + if (field->data_size == sizeof(int_least8_t)) + value = *(const int_least8_t*)field->pData; + else if (field->data_size == sizeof(int_least16_t)) + value = *(const int_least16_t*)field->pData; + else if (field->data_size == sizeof(int32_t)) + value = *(const int32_t*)field->pData; + else if (field->data_size == sizeof(pb_int64_t)) + value = *(const pb_int64_t*)field->pData; + else + PB_RETURN_ERROR(stream, "invalid data_size"); + + if (PB_LTYPE(field->type) == PB_LTYPE_SVARINT) + return pb_encode_svarint(stream, value); +#ifdef PB_WITHOUT_64BIT + else if (value < 0) + return pb_encode_varint_32(stream, (uint32_t)value, (uint32_t)-1); +#endif + else + return pb_encode_varint(stream, (pb_uint64_t)value); + + } +} + +static bool checkreturn pb_enc_fixed(pb_ostream_t *stream, const pb_field_iter_t *field) +{ +#ifdef PB_CONVERT_DOUBLE_FLOAT + if (field->data_size == sizeof(float) && PB_LTYPE(field->type) == PB_LTYPE_FIXED64) + { + return pb_encode_float_as_double(stream, *(float*)field->pData); + } +#endif + + if (field->data_size == sizeof(uint32_t)) + { + return pb_encode_fixed32(stream, field->pData); + } +#ifndef PB_WITHOUT_64BIT + else if (field->data_size == sizeof(uint64_t)) + { + return pb_encode_fixed64(stream, field->pData); + } +#endif + else + { + PB_RETURN_ERROR(stream, "invalid data_size"); + } +} + +static bool checkreturn pb_enc_bytes(pb_ostream_t *stream, const pb_field_iter_t *field) +{ + const pb_bytes_array_t *bytes = NULL; + + bytes = (const pb_bytes_array_t*)field->pData; + + if (bytes == NULL) + { + /* Treat null pointer as an empty bytes field */ + return pb_encode_string(stream, NULL, 0); + } + + if (PB_ATYPE(field->type) == PB_ATYPE_STATIC && + bytes->size > field->data_size - offsetof(pb_bytes_array_t, bytes)) + { + PB_RETURN_ERROR(stream, "bytes size exceeded"); + } + + return pb_encode_string(stream, bytes->bytes, (size_t)bytes->size); +} + +static bool checkreturn pb_enc_string(pb_ostream_t *stream, const pb_field_iter_t *field) +{ + size_t size = 0; + size_t max_size = (size_t)field->data_size; + const char *str = (const char*)field->pData; + + if (PB_ATYPE(field->type) == PB_ATYPE_POINTER) + { + max_size = (size_t)-1; + } + else + { + /* pb_dec_string() assumes string fields end with a null + * terminator when the type isn't PB_ATYPE_POINTER, so we + * shouldn't allow more than max-1 bytes to be written to + * allow space for the null terminator. 
+ */ + if (max_size == 0) + PB_RETURN_ERROR(stream, "zero-length string"); + + max_size -= 1; + } + + + if (str == NULL) + { + size = 0; /* Treat null pointer as an empty string */ + } + else + { + const char *p = str; + + /* strnlen() is not always available, so just use a loop */ + while (size < max_size && *p != '\0') + { + size++; + p++; + } + + if (*p != '\0') + { + PB_RETURN_ERROR(stream, "unterminated string"); + } + } + +#ifdef PB_VALIDATE_UTF8 + if (!pb_validate_utf8(str)) + PB_RETURN_ERROR(stream, "invalid utf8"); +#endif + + return pb_encode_string(stream, (const pb_byte_t*)str, size); +} + +static bool checkreturn pb_enc_submessage(pb_ostream_t *stream, const pb_field_iter_t *field) +{ + if (field->submsg_desc == NULL) + PB_RETURN_ERROR(stream, "invalid field descriptor"); + + if (PB_LTYPE(field->type) == PB_LTYPE_SUBMSG_W_CB && field->pSize != NULL) + { + /* Message callback is stored right before pSize. */ + pb_callback_t *callback = (pb_callback_t*)field->pSize - 1; + if (callback->funcs.encode) + { + if (!callback->funcs.encode(stream, field, &callback->arg)) + return false; + } + } + + return pb_encode_submessage(stream, field->submsg_desc, field->pData); +} + +static bool checkreturn pb_enc_fixed_length_bytes(pb_ostream_t *stream, const pb_field_iter_t *field) +{ + return pb_encode_string(stream, (const pb_byte_t*)field->pData, (size_t)field->data_size); +} + +#ifdef PB_CONVERT_DOUBLE_FLOAT +bool pb_encode_float_as_double(pb_ostream_t *stream, float value) +{ + union { float f; uint32_t i; } in; + uint_least8_t sign; + int exponent; + uint64_t mantissa; + + in.f = value; + + /* Decompose input value */ + sign = (uint_least8_t)((in.i >> 31) & 1); + exponent = (int)((in.i >> 23) & 0xFF) - 127; + mantissa = in.i & 0x7FFFFF; + + if (exponent == 128) + { + /* Special value (NaN etc.) */ + exponent = 1024; + } + else if (exponent == -127) + { + if (!mantissa) + { + /* Zero */ + exponent = -1023; + } + else + { + /* Denormalized */ + mantissa <<= 1; + while (!(mantissa & 0x800000)) + { + mantissa <<= 1; + exponent--; + } + mantissa &= 0x7FFFFF; + } + } + + /* Combine fields */ + mantissa <<= 29; + mantissa |= (uint64_t)(exponent + 1023) << 52; + mantissa |= (uint64_t)sign << 63; + + return pb_encode_fixed64(stream, &mantissa); +} +#endif diff --git a/src/third_party/librdkafka/dist/src/nanopb/pb_encode.h b/src/third_party/librdkafka/dist/src/nanopb/pb_encode.h new file mode 100644 index 00000000000..f3805e711d6 --- /dev/null +++ b/src/third_party/librdkafka/dist/src/nanopb/pb_encode.h @@ -0,0 +1,185 @@ +/* pb_encode.h: Functions to encode protocol buffers. Depends on pb_encode.c. + * The main function is pb_encode. You also need an output stream, and the + * field descriptions created by nanopb_generator.py. + */ + +#ifndef PB_ENCODE_H_INCLUDED +#define PB_ENCODE_H_INCLUDED + +#include "nanopb/pb.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Structure for defining custom output streams. You will need to provide + * a callback function to write the bytes to your storage, which can be + * for example a file or a network socket. + * + * The callback must conform to these rules: + * + * 1) Return false on IO errors. This will cause encoding to abort. + * 2) You can use state to store your own data (e.g. buffer pointer). + * 3) pb_write will update bytes_written after your callback runs. + * 4) Substreams will modify max_size and bytes_written. Don't use them + * to calculate any pointers. 
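+ *
+ * A minimal sketch of such a callback (illustrative, not part of the
+ * API; assumes 'state' was set to an open FILE* when the stream was
+ * created):
+ *
+ *    static bool file_write_callback(pb_ostream_t *stream, const pb_byte_t *buf, size_t count)
+ *    {
+ *        FILE *fp = (FILE*)stream->state;
+ *        return fwrite(buf, 1, count, fp) == count;
+ *    }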
+ */ +struct pb_ostream_s +{ +#ifdef PB_BUFFER_ONLY + /* Callback pointer is not used in buffer-only configuration. + * Having an int pointer here allows binary compatibility but + * gives an error if someone tries to assign callback function. + * Also, NULL pointer marks a 'sizing stream' that does not + * write anything. + */ + const int *callback; +#else + bool (*callback)(pb_ostream_t *stream, const pb_byte_t *buf, size_t count); +#endif + void *state; /* Free field for use by callback implementation. */ + size_t max_size; /* Limit number of output bytes written (or use SIZE_MAX). */ + size_t bytes_written; /* Number of bytes written so far. */ + +#ifndef PB_NO_ERRMSG + const char *errmsg; +#endif +}; + +/*************************** + * Main encoding functions * + ***************************/ + +/* Encode a single protocol buffers message from C structure into a stream. + * Returns true on success, false on any failure. + * The actual struct pointed to by src_struct must match the description in fields. + * All required fields in the struct are assumed to have been filled in. + * + * Example usage: + * MyMessage msg = {}; + * uint8_t buffer[64]; + * pb_ostream_t stream; + * + * msg.field1 = 42; + * stream = pb_ostream_from_buffer(buffer, sizeof(buffer)); + * pb_encode(&stream, MyMessage_fields, &msg); + */ +bool pb_encode(pb_ostream_t *stream, const pb_msgdesc_t *fields, const void *src_struct); + +/* Extended version of pb_encode, with several options to control the + * encoding process: + * + * PB_ENCODE_DELIMITED: Prepend the length of message as a varint. + * Corresponds to writeDelimitedTo() in Google's + * protobuf API. + * + * PB_ENCODE_NULLTERMINATED: Append a null byte to the message for termination. + * NOTE: This behaviour is not supported in most other + * protobuf implementations, so PB_ENCODE_DELIMITED + * is a better option for compatibility. + */ +#define PB_ENCODE_DELIMITED 0x02U +#define PB_ENCODE_NULLTERMINATED 0x04U +bool pb_encode_ex(pb_ostream_t *stream, const pb_msgdesc_t *fields, const void *src_struct, unsigned int flags); + +/* Defines for backwards compatibility with code written before nanopb-0.4.0 */ +#define pb_encode_delimited(s,f,d) pb_encode_ex(s,f,d, PB_ENCODE_DELIMITED) +#define pb_encode_nullterminated(s,f,d) pb_encode_ex(s,f,d, PB_ENCODE_NULLTERMINATED) + +/* Encode the message to get the size of the encoded data, but do not store + * the data. */ +bool pb_get_encoded_size(size_t *size, const pb_msgdesc_t *fields, const void *src_struct); + +/************************************** + * Functions for manipulating streams * + **************************************/ + +/* Create an output stream for writing into a memory buffer. + * The number of bytes written can be found in stream.bytes_written after + * encoding the message. + * + * Alternatively, you can use a custom stream that writes directly to e.g. + * a file or a network socket. + */ +pb_ostream_t pb_ostream_from_buffer(pb_byte_t *buf, size_t bufsize); + +/* Pseudo-stream for measuring the size of a message without actually storing + * the encoded data. + * + * Example usage: + * MyMessage msg = {}; + * pb_ostream_t stream = PB_OSTREAM_SIZING; + * pb_encode(&stream, MyMessage_fields, &msg); + * printf("Message size is %d\n", stream.bytes_written); + */ +#ifndef PB_NO_ERRMSG +#define PB_OSTREAM_SIZING {0,0,0,0,0} +#else +#define PB_OSTREAM_SIZING {0,0,0,0} +#endif + +/* Function to write into a pb_ostream_t stream. 
You can use this if you need + * to append or prepend some custom headers to the message. + */ +bool pb_write(pb_ostream_t *stream, const pb_byte_t *buf, size_t count); + + +/************************************************ + * Helper functions for writing field callbacks * + ************************************************/ + +/* Encode field header based on type and field number defined in the field + * structure. Call this from the callback before writing out field contents. */ +bool pb_encode_tag_for_field(pb_ostream_t *stream, const pb_field_iter_t *field); + +/* Encode field header by manually specifying wire type. You need to use this + * if you want to write out packed arrays from a callback field. */ +bool pb_encode_tag(pb_ostream_t *stream, pb_wire_type_t wiretype, uint32_t field_number); + +/* Encode an integer in the varint format. + * This works for bool, enum, int32, int64, uint32 and uint64 field types. */ +#ifndef PB_WITHOUT_64BIT +bool pb_encode_varint(pb_ostream_t *stream, uint64_t value); +#else +bool pb_encode_varint(pb_ostream_t *stream, uint32_t value); +#endif + +/* Encode an integer in the zig-zagged svarint format. + * This works for sint32 and sint64. */ +#ifndef PB_WITHOUT_64BIT +bool pb_encode_svarint(pb_ostream_t *stream, int64_t value); +#else +bool pb_encode_svarint(pb_ostream_t *stream, int32_t value); +#endif + +/* Encode a string or bytes type field. For strings, pass strlen(s) as size. */ +bool pb_encode_string(pb_ostream_t *stream, const pb_byte_t *buffer, size_t size); + +/* Encode a fixed32, sfixed32 or float value. + * You need to pass a pointer to a 4-byte wide C variable. */ +bool pb_encode_fixed32(pb_ostream_t *stream, const void *value); + +#ifndef PB_WITHOUT_64BIT +/* Encode a fixed64, sfixed64 or double value. + * You need to pass a pointer to a 8-byte wide C variable. */ +bool pb_encode_fixed64(pb_ostream_t *stream, const void *value); +#endif + +#ifdef PB_CONVERT_DOUBLE_FLOAT +/* Encode a float value so that it appears like a double in the encoded + * message. */ +bool pb_encode_float_as_double(pb_ostream_t *stream, float value); +#endif + +/* Encode a submessage field. + * You need to pass the pb_field_t array and pointer to struct, just like + * with pb_encode(). This internally encodes the submessage twice, first to + * calculate message size and then to actually write it out. + */ +bool pb_encode_submessage(pb_ostream_t *stream, const pb_msgdesc_t *fields, const void *src_struct); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/src/third_party/librdkafka/dist/src/opentelemetry/common.pb.c b/src/third_party/librdkafka/dist/src/opentelemetry/common.pb.c new file mode 100644 index 00000000000..e03889b5772 --- /dev/null +++ b/src/third_party/librdkafka/dist/src/opentelemetry/common.pb.c @@ -0,0 +1,32 @@ +/* Automatically generated nanopb constant definitions */ +/* Generated by nanopb-0.4.8-dev */ + +#include "opentelemetry/common.pb.h" +#if PB_PROTO_HEADER_VERSION != 40 +#error Regenerate this file with the current version of nanopb generator. 
+#endif
+
+PB_BIND(opentelemetry_proto_common_v1_AnyValue, opentelemetry_proto_common_v1_AnyValue, AUTO)
+
+
+PB_BIND(opentelemetry_proto_common_v1_ArrayValue, opentelemetry_proto_common_v1_ArrayValue, AUTO)
+
+
+PB_BIND(opentelemetry_proto_common_v1_KeyValueList, opentelemetry_proto_common_v1_KeyValueList, AUTO)
+
+
+PB_BIND(opentelemetry_proto_common_v1_KeyValue, opentelemetry_proto_common_v1_KeyValue, AUTO)
+
+
+PB_BIND(opentelemetry_proto_common_v1_InstrumentationScope, opentelemetry_proto_common_v1_InstrumentationScope, AUTO)
+
+
+
+#ifndef PB_CONVERT_DOUBLE_FLOAT
+/* On some platforms (such as AVR), double is really float.
+ * To be able to encode/decode double on these platforms, you need.
+ * to define PB_CONVERT_DOUBLE_FLOAT in pb.h or compiler command line.
+ */
+PB_STATIC_ASSERT(sizeof(double) == 8, DOUBLE_MUST_BE_8_BYTES)
+#endif
+
diff --git a/src/third_party/librdkafka/dist/src/opentelemetry/common.pb.h b/src/third_party/librdkafka/dist/src/opentelemetry/common.pb.h
new file mode 100644
index 00000000000..4a02adda661
--- /dev/null
+++ b/src/third_party/librdkafka/dist/src/opentelemetry/common.pb.h
@@ -0,0 +1,170 @@
+/* Automatically generated nanopb header */
+/* Generated by nanopb-0.4.8-dev */
+
+#ifndef PB_OPENTELEMETRY_PROTO_COMMON_V1_OPENTELEMETRY_PROTO_COMMON_V1_COMMON_PB_H_INCLUDED
+#define PB_OPENTELEMETRY_PROTO_COMMON_V1_OPENTELEMETRY_PROTO_COMMON_V1_COMMON_PB_H_INCLUDED
+#include <nanopb/pb.h>
+
+#if PB_PROTO_HEADER_VERSION != 40
+#error Regenerate this file with the current version of nanopb generator.
+#endif
+
+/* Struct definitions */
+/* ArrayValue is a list of AnyValue messages. We need ArrayValue as a message
+ since oneof in AnyValue does not allow repeated fields. */
+typedef struct _opentelemetry_proto_common_v1_ArrayValue {
+    /* Array of values. The array may be empty (contain 0 elements). */
+    pb_callback_t values;
+} opentelemetry_proto_common_v1_ArrayValue;
+
+/* KeyValueList is a list of KeyValue messages. We need KeyValueList as a message
+ since `oneof` in AnyValue does not allow repeated fields. Everywhere else where we need
+ a list of KeyValue messages (e.g. in Span) we use `repeated KeyValue` directly to
+ avoid unnecessary extra wrapping (which slows down the protocol). The 2 approaches
+ are semantically equivalent. */
+typedef struct _opentelemetry_proto_common_v1_KeyValueList {
+    /* A collection of key/value pairs of key-value pairs. The list may be empty (may
+ contain 0 elements).
+ The keys MUST be unique (it is not allowed to have more than one
+ value with the same key). */
+    pb_callback_t values;
+} opentelemetry_proto_common_v1_KeyValueList;
+
+/* AnyValue is used to represent any type of attribute value. AnyValue may contain a
+ primitive value such as a string or integer or it may contain an arbitrary nested
+ object containing arrays, key-value lists and primitives. */
+typedef struct _opentelemetry_proto_common_v1_AnyValue {
+    pb_size_t which_value;
+    union {
+        pb_callback_t string_value;
+        bool bool_value;
+        int64_t int_value;
+        double double_value;
+        opentelemetry_proto_common_v1_ArrayValue array_value;
+        opentelemetry_proto_common_v1_KeyValueList kvlist_value;
+        pb_callback_t bytes_value;
+    } value;
+} opentelemetry_proto_common_v1_AnyValue;
+
+/* KeyValue is a key-value pair that is used to store Span attributes, Link
+ attributes, etc.
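+
+ Callback-typed string fields such as 'key' below are written by a
+ user-supplied encode callback. A minimal sketch (illustrative, not
+ generated code; assumes 'arg' points at a NUL-terminated string and
+ <string.h> is available):
+
+   static bool encode_key(pb_ostream_t *stream, const pb_field_t *field,
+                          void * const *arg)
+   {
+       const char *str = (const char *)*arg;
+       if (!pb_encode_tag_for_field(stream, field))
+           return false;
+       return pb_encode_string(stream, (const pb_byte_t *)str, strlen(str));
+   }
+
+ installed as, e.g.: kv.key.funcs.encode = &encode_key; kv.key.arg = (void *)"host.name";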
*/ +typedef struct _opentelemetry_proto_common_v1_KeyValue { + pb_callback_t key; + bool has_value; + opentelemetry_proto_common_v1_AnyValue value; +} opentelemetry_proto_common_v1_KeyValue; + +/* InstrumentationScope is a message representing the instrumentation scope information + such as the fully qualified name and version. */ +typedef struct _opentelemetry_proto_common_v1_InstrumentationScope { + /* An empty instrumentation scope name means the name is unknown. */ + pb_callback_t name; + pb_callback_t version; + /* Additional attributes that describe the scope. [Optional]. + Attribute keys MUST be unique (it is not allowed to have more than one + attribute with the same key). */ + pb_callback_t attributes; + uint32_t dropped_attributes_count; +} opentelemetry_proto_common_v1_InstrumentationScope; + + +#ifdef __cplusplus +extern "C" { +#endif + +/* Initializer values for message structs */ +#define opentelemetry_proto_common_v1_AnyValue_init_default {0, {{{NULL}, NULL}}} +#define opentelemetry_proto_common_v1_ArrayValue_init_default {{{NULL}, NULL}} +#define opentelemetry_proto_common_v1_KeyValueList_init_default {{{NULL}, NULL}} +#define opentelemetry_proto_common_v1_KeyValue_init_default {{{NULL}, NULL}, false, opentelemetry_proto_common_v1_AnyValue_init_default} +#define opentelemetry_proto_common_v1_InstrumentationScope_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, 0} +#define opentelemetry_proto_common_v1_AnyValue_init_zero {0, {{{NULL}, NULL}}} +#define opentelemetry_proto_common_v1_ArrayValue_init_zero {{{NULL}, NULL}} +#define opentelemetry_proto_common_v1_KeyValueList_init_zero {{{NULL}, NULL}} +#define opentelemetry_proto_common_v1_KeyValue_init_zero {{{NULL}, NULL}, false, opentelemetry_proto_common_v1_AnyValue_init_zero} +#define opentelemetry_proto_common_v1_InstrumentationScope_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, 0} + +/* Field tags (for use in manual encoding/decoding) */ +#define opentelemetry_proto_common_v1_ArrayValue_values_tag 1 +#define opentelemetry_proto_common_v1_KeyValueList_values_tag 1 +#define opentelemetry_proto_common_v1_AnyValue_string_value_tag 1 +#define opentelemetry_proto_common_v1_AnyValue_bool_value_tag 2 +#define opentelemetry_proto_common_v1_AnyValue_int_value_tag 3 +#define opentelemetry_proto_common_v1_AnyValue_double_value_tag 4 +#define opentelemetry_proto_common_v1_AnyValue_array_value_tag 5 +#define opentelemetry_proto_common_v1_AnyValue_kvlist_value_tag 6 +#define opentelemetry_proto_common_v1_AnyValue_bytes_value_tag 7 +#define opentelemetry_proto_common_v1_KeyValue_key_tag 1 +#define opentelemetry_proto_common_v1_KeyValue_value_tag 2 +#define opentelemetry_proto_common_v1_InstrumentationScope_name_tag 1 +#define opentelemetry_proto_common_v1_InstrumentationScope_version_tag 2 +#define opentelemetry_proto_common_v1_InstrumentationScope_attributes_tag 3 +#define opentelemetry_proto_common_v1_InstrumentationScope_dropped_attributes_count_tag 4 + +/* Struct field encoding specification for nanopb */ +#define opentelemetry_proto_common_v1_AnyValue_FIELDLIST(X, a) \ +X(a, CALLBACK, ONEOF, STRING, (value,string_value,value.string_value), 1) \ +X(a, STATIC, ONEOF, BOOL, (value,bool_value,value.bool_value), 2) \ +X(a, STATIC, ONEOF, INT64, (value,int_value,value.int_value), 3) \ +X(a, STATIC, ONEOF, DOUBLE, (value,double_value,value.double_value), 4) \ +X(a, STATIC, ONEOF, MESSAGE, (value,array_value,value.array_value), 5) \ +X(a, STATIC, ONEOF, MESSAGE, (value,kvlist_value,value.kvlist_value), 6) \ +X(a, 
CALLBACK, ONEOF, BYTES, (value,bytes_value,value.bytes_value), 7) +#define opentelemetry_proto_common_v1_AnyValue_CALLBACK pb_default_field_callback +#define opentelemetry_proto_common_v1_AnyValue_DEFAULT NULL +#define opentelemetry_proto_common_v1_AnyValue_value_array_value_MSGTYPE opentelemetry_proto_common_v1_ArrayValue +#define opentelemetry_proto_common_v1_AnyValue_value_kvlist_value_MSGTYPE opentelemetry_proto_common_v1_KeyValueList + +#define opentelemetry_proto_common_v1_ArrayValue_FIELDLIST(X, a) \ +X(a, CALLBACK, REPEATED, MESSAGE, values, 1) +#define opentelemetry_proto_common_v1_ArrayValue_CALLBACK pb_default_field_callback +#define opentelemetry_proto_common_v1_ArrayValue_DEFAULT NULL +#define opentelemetry_proto_common_v1_ArrayValue_values_MSGTYPE opentelemetry_proto_common_v1_AnyValue + +#define opentelemetry_proto_common_v1_KeyValueList_FIELDLIST(X, a) \ +X(a, CALLBACK, REPEATED, MESSAGE, values, 1) +#define opentelemetry_proto_common_v1_KeyValueList_CALLBACK pb_default_field_callback +#define opentelemetry_proto_common_v1_KeyValueList_DEFAULT NULL +#define opentelemetry_proto_common_v1_KeyValueList_values_MSGTYPE opentelemetry_proto_common_v1_KeyValue + +#define opentelemetry_proto_common_v1_KeyValue_FIELDLIST(X, a) \ +X(a, CALLBACK, SINGULAR, STRING, key, 1) \ +X(a, STATIC, OPTIONAL, MESSAGE, value, 2) +#define opentelemetry_proto_common_v1_KeyValue_CALLBACK pb_default_field_callback +#define opentelemetry_proto_common_v1_KeyValue_DEFAULT NULL +#define opentelemetry_proto_common_v1_KeyValue_value_MSGTYPE opentelemetry_proto_common_v1_AnyValue + +#define opentelemetry_proto_common_v1_InstrumentationScope_FIELDLIST(X, a) \ +X(a, CALLBACK, SINGULAR, STRING, name, 1) \ +X(a, CALLBACK, SINGULAR, STRING, version, 2) \ +X(a, CALLBACK, REPEATED, MESSAGE, attributes, 3) \ +X(a, STATIC, SINGULAR, UINT32, dropped_attributes_count, 4) +#define opentelemetry_proto_common_v1_InstrumentationScope_CALLBACK pb_default_field_callback +#define opentelemetry_proto_common_v1_InstrumentationScope_DEFAULT NULL +#define opentelemetry_proto_common_v1_InstrumentationScope_attributes_MSGTYPE opentelemetry_proto_common_v1_KeyValue + +extern const pb_msgdesc_t opentelemetry_proto_common_v1_AnyValue_msg; +extern const pb_msgdesc_t opentelemetry_proto_common_v1_ArrayValue_msg; +extern const pb_msgdesc_t opentelemetry_proto_common_v1_KeyValueList_msg; +extern const pb_msgdesc_t opentelemetry_proto_common_v1_KeyValue_msg; +extern const pb_msgdesc_t opentelemetry_proto_common_v1_InstrumentationScope_msg; + +/* Defines for backwards compatibility with code written before nanopb-0.4.0 */ +#define opentelemetry_proto_common_v1_AnyValue_fields &opentelemetry_proto_common_v1_AnyValue_msg +#define opentelemetry_proto_common_v1_ArrayValue_fields &opentelemetry_proto_common_v1_ArrayValue_msg +#define opentelemetry_proto_common_v1_KeyValueList_fields &opentelemetry_proto_common_v1_KeyValueList_msg +#define opentelemetry_proto_common_v1_KeyValue_fields &opentelemetry_proto_common_v1_KeyValue_msg +#define opentelemetry_proto_common_v1_InstrumentationScope_fields &opentelemetry_proto_common_v1_InstrumentationScope_msg + +/* Maximum encoded size of messages (where known) */ +/* opentelemetry_proto_common_v1_AnyValue_size depends on runtime parameters */ +/* opentelemetry_proto_common_v1_ArrayValue_size depends on runtime parameters */ +/* opentelemetry_proto_common_v1_KeyValueList_size depends on runtime parameters */ +/* opentelemetry_proto_common_v1_KeyValue_size depends on runtime parameters */ +/* 
opentelemetry_proto_common_v1_InstrumentationScope_size depends on runtime parameters */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/src/third_party/librdkafka/dist/src/opentelemetry/metrics.options b/src/third_party/librdkafka/dist/src/opentelemetry/metrics.options new file mode 100644 index 00000000000..d5ab8d33c43 --- /dev/null +++ b/src/third_party/librdkafka/dist/src/opentelemetry/metrics.options @@ -0,0 +1,2 @@ +# Needed to generate callback for data types within Metrics which isn't generated for oneof types by default +opentelemetry.proto.metrics.v1.Metric submsg_callback:true; diff --git a/src/third_party/librdkafka/dist/src/opentelemetry/metrics.pb.c b/src/third_party/librdkafka/dist/src/opentelemetry/metrics.pb.c new file mode 100644 index 00000000000..2b74de9272e --- /dev/null +++ b/src/third_party/librdkafka/dist/src/opentelemetry/metrics.pb.c @@ -0,0 +1,67 @@ +/* Automatically generated nanopb constant definitions */ +/* Generated by nanopb-0.4.8-dev */ + +#include "opentelemetry/metrics.pb.h" +#if PB_PROTO_HEADER_VERSION != 40 +#error Regenerate this file with the current version of nanopb generator. +#endif + +PB_BIND(opentelemetry_proto_metrics_v1_MetricsData, opentelemetry_proto_metrics_v1_MetricsData, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_ResourceMetrics, opentelemetry_proto_metrics_v1_ResourceMetrics, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_ScopeMetrics, opentelemetry_proto_metrics_v1_ScopeMetrics, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_Metric, opentelemetry_proto_metrics_v1_Metric, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_Gauge, opentelemetry_proto_metrics_v1_Gauge, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_Sum, opentelemetry_proto_metrics_v1_Sum, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_Histogram, opentelemetry_proto_metrics_v1_Histogram, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_ExponentialHistogram, opentelemetry_proto_metrics_v1_ExponentialHistogram, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_Summary, opentelemetry_proto_metrics_v1_Summary, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_NumberDataPoint, opentelemetry_proto_metrics_v1_NumberDataPoint, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_HistogramDataPoint, opentelemetry_proto_metrics_v1_HistogramDataPoint, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint, opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets, opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_SummaryDataPoint, opentelemetry_proto_metrics_v1_SummaryDataPoint, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile, opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile, AUTO) + + +PB_BIND(opentelemetry_proto_metrics_v1_Exemplar, opentelemetry_proto_metrics_v1_Exemplar, AUTO) + + + + + +#ifndef PB_CONVERT_DOUBLE_FLOAT +/* On some platforms (such as AVR), double is really float. + * To be able to encode/decode double on these platforms, you need. + * to define PB_CONVERT_DOUBLE_FLOAT in pb.h or compiler command line. 
+ */
+PB_STATIC_ASSERT(sizeof(double) == 8, DOUBLE_MUST_BE_8_BYTES)
+#endif
+
diff --git a/src/third_party/librdkafka/dist/src/opentelemetry/metrics.pb.h b/src/third_party/librdkafka/dist/src/opentelemetry/metrics.pb.h
new file mode 100644
index 00000000000..7c812c2d459
--- /dev/null
+++ b/src/third_party/librdkafka/dist/src/opentelemetry/metrics.pb.h
@@ -0,0 +1,966 @@
+/* Automatically generated nanopb header */
+/* Generated by nanopb-0.4.8-dev */
+
+#ifndef PB_OPENTELEMETRY_PROTO_METRICS_V1_OPENTELEMETRY_PROTO_METRICS_V1_METRICS_PB_H_INCLUDED
+#define PB_OPENTELEMETRY_PROTO_METRICS_V1_OPENTELEMETRY_PROTO_METRICS_V1_METRICS_PB_H_INCLUDED
+#include <nanopb/pb.h>
+#include "opentelemetry/common.pb.h"
+#include "opentelemetry/resource.pb.h"
+
+#if PB_PROTO_HEADER_VERSION != 40
+#error Regenerate this file with the current version of nanopb generator.
+#endif
+
+/* Enum definitions */
+/* AggregationTemporality defines how a metric aggregator reports aggregated
+ values. It describes how those values relate to the time interval over
+ which they are aggregated. */
+typedef enum _opentelemetry_proto_metrics_v1_AggregationTemporality {
+    /* UNSPECIFIED is the default AggregationTemporality, it MUST not be used. */
+    opentelemetry_proto_metrics_v1_AggregationTemporality_AGGREGATION_TEMPORALITY_UNSPECIFIED = 0,
+    /* DELTA is an AggregationTemporality for a metric aggregator which reports
+ changes since last report time. Successive metrics contain aggregation of
+ values from continuous and non-overlapping intervals.
+
+ The values for a DELTA metric are based only on the time interval
+ associated with one measurement cycle. There is no dependency on
+ previous measurements like is the case for CUMULATIVE metrics.
+
+ For example, consider a system measuring the number of requests that
+ it receives and reports the sum of these requests every second as a
+ DELTA metric:
+
+   1. The system starts receiving at time=t_0.
+   2. A request is received, the system measures 1 request.
+   3. A request is received, the system measures 1 request.
+   4. A request is received, the system measures 1 request.
+   5. The 1 second collection cycle ends. A metric is exported for the
+      number of requests received over the interval of time t_0 to
+      t_0+1 with a value of 3.
+   6. A request is received, the system measures 1 request.
+   7. A request is received, the system measures 1 request.
+   8. The 1 second collection cycle ends. A metric is exported for the
+      number of requests received over the interval of time t_0+1 to
+      t_0+2 with a value of 2. */
+    opentelemetry_proto_metrics_v1_AggregationTemporality_AGGREGATION_TEMPORALITY_DELTA = 1,
+    /* CUMULATIVE is an AggregationTemporality for a metric aggregator which
+ reports changes since a fixed start time. This means that current values
+ of a CUMULATIVE metric depend on all previous measurements since the
+ start time. Because of this, the sender is required to retain this state
+ in some form. If this state is lost or invalidated, the CUMULATIVE metric
+ values MUST be reset and a new fixed start time following the last
+ reported measurement time sent MUST be used.
+
+ For example, consider a system measuring the number of requests that
+ it receives and reports the sum of these requests every second as a
+ CUMULATIVE metric:
+
+   1. The system starts receiving at time=t_0.
+   2. A request is received, the system measures 1 request.
+   3. A request is received, the system measures 1 request.
+   4. A request is received, the system measures 1 request.
+   5. 
The 1 second collection cycle ends. A metric is exported for the + number of requests received over the interval of time t_0 to + t_0+1 with a value of 3. + 6. A request is received, the system measures 1 request. + 7. A request is received, the system measures 1 request. + 8. The 1 second collection cycle ends. A metric is exported for the + number of requests received over the interval of time t_0 to + t_0+2 with a value of 5. + 9. The system experiences a fault and loses state. + 10. The system recovers and resumes receiving at time=t_1. + 11. A request is received, the system measures 1 request. + 12. The 1 second collection cycle ends. A metric is exported for the + number of requests received over the interval of time t_1 to + t_0+1 with a value of 1. + + Note: Even though, when reporting changes since last report time, using + CUMULATIVE is valid, it is not recommended. This may cause problems for + systems that do not use start_time to determine when the aggregation + value was reset (e.g. Prometheus). */ + opentelemetry_proto_metrics_v1_AggregationTemporality_AGGREGATION_TEMPORALITY_CUMULATIVE = 2 +} opentelemetry_proto_metrics_v1_AggregationTemporality; + +/* DataPointFlags is defined as a protobuf 'uint32' type and is to be used as a + bit-field representing 32 distinct boolean flags. Each flag defined in this + enum is a bit-mask. To test the presence of a single flag in the flags of + a data point, for example, use an expression like: + + (point.flags & DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK) == DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK */ +typedef enum _opentelemetry_proto_metrics_v1_DataPointFlags { + /* The zero value for the enum. Should not be used for comparisons. + Instead use bitwise "and" with the appropriate mask as shown above. */ + opentelemetry_proto_metrics_v1_DataPointFlags_DATA_POINT_FLAGS_DO_NOT_USE = 0, + /* This DataPoint is valid but has no recorded value. This value + SHOULD be used to reflect explicitly missing data in a series, as + for an equivalent to the Prometheus "staleness marker". */ + opentelemetry_proto_metrics_v1_DataPointFlags_DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK = 1 +} opentelemetry_proto_metrics_v1_DataPointFlags; + +/* Struct definitions */ +/* MetricsData represents the metrics data that can be stored in a persistent + storage, OR can be embedded by other protocols that transfer OTLP metrics + data but do not implement the OTLP protocol. + + The main difference between this message and collector protocol is that + in this message there will not be any "control" or "metadata" specific to + OTLP protocol. + + When new fields are added into this message, the OTLP request MUST be updated + as well. */ +typedef struct _opentelemetry_proto_metrics_v1_MetricsData { + /* An array of ResourceMetrics. + For data coming from a single resource this array will typically contain + one element. Intermediary nodes that receive data from multiple origins + typically batch the data before forwarding further and in that case this + array will contain multiple elements. */ + pb_callback_t resource_metrics; +} opentelemetry_proto_metrics_v1_MetricsData; + +/* A collection of ScopeMetrics from a Resource. */ +typedef struct _opentelemetry_proto_metrics_v1_ResourceMetrics { + /* The resource for the metrics in this message. + If this field is not set then no resource info is known. */ + bool has_resource; + opentelemetry_proto_resource_v1_Resource resource; + /* A list of metrics that originate from a resource. 
*/ + pb_callback_t scope_metrics; + /* This schema_url applies to the data in the "resource" field. It does not apply + to the data in the "scope_metrics" field which have their own schema_url field. */ + pb_callback_t schema_url; +} opentelemetry_proto_metrics_v1_ResourceMetrics; + +/* A collection of Metrics produced by an Scope. */ +typedef struct _opentelemetry_proto_metrics_v1_ScopeMetrics { + /* The instrumentation scope information for the metrics in this message. + Semantically when InstrumentationScope isn't set, it is equivalent with + an empty instrumentation scope name (unknown). */ + bool has_scope; + opentelemetry_proto_common_v1_InstrumentationScope scope; + /* A list of metrics that originate from an instrumentation library. */ + pb_callback_t metrics; + /* This schema_url applies to all metrics in the "metrics" field. */ + pb_callback_t schema_url; +} opentelemetry_proto_metrics_v1_ScopeMetrics; + +/* Gauge represents the type of a scalar metric that always exports the + "current value" for every data point. It should be used for an "unknown" + aggregation. + + A Gauge does not support different aggregation temporalities. Given the + aggregation is unknown, points cannot be combined using the same + aggregation, regardless of aggregation temporalities. Therefore, + AggregationTemporality is not included. Consequently, this also means + "StartTimeUnixNano" is ignored for all data points. */ +typedef struct _opentelemetry_proto_metrics_v1_Gauge { + pb_callback_t data_points; +} opentelemetry_proto_metrics_v1_Gauge; + +/* Sum represents the type of a scalar metric that is calculated as a sum of all + reported measurements over a time interval. */ +typedef struct _opentelemetry_proto_metrics_v1_Sum { + pb_callback_t data_points; + /* aggregation_temporality describes if the aggregator reports delta changes + since last report time, or cumulative changes since a fixed start time. */ + opentelemetry_proto_metrics_v1_AggregationTemporality aggregation_temporality; + /* If "true" means that the sum is monotonic. */ + bool is_monotonic; +} opentelemetry_proto_metrics_v1_Sum; + +/* Histogram represents the type of a metric that is calculated by aggregating + as a Histogram of all reported measurements over a time interval. */ +typedef struct _opentelemetry_proto_metrics_v1_Histogram { + pb_callback_t data_points; + /* aggregation_temporality describes if the aggregator reports delta changes + since last report time, or cumulative changes since a fixed start time. */ + opentelemetry_proto_metrics_v1_AggregationTemporality aggregation_temporality; +} opentelemetry_proto_metrics_v1_Histogram; + +/* ExponentialHistogram represents the type of a metric that is calculated by aggregating + as a ExponentialHistogram of all reported double measurements over a time interval. */ +typedef struct _opentelemetry_proto_metrics_v1_ExponentialHistogram { + pb_callback_t data_points; + /* aggregation_temporality describes if the aggregator reports delta changes + since last report time, or cumulative changes since a fixed start time. */ + opentelemetry_proto_metrics_v1_AggregationTemporality aggregation_temporality; +} opentelemetry_proto_metrics_v1_ExponentialHistogram; + +/* Summary metric data are used to convey quantile summaries, + a Prometheus (see: https://prometheus.io/docs/concepts/metric_types/#summary) + and OpenMetrics (see: https://github.com/OpenObservability/OpenMetrics/blob/4dbf6075567ab43296eed941037c12951faafb92/protos/prometheus.proto#L45) + data type. 
These data points cannot always be merged in a meaningful way. + While they can be useful in some applications, histogram data points are + recommended for new applications. */ +typedef struct _opentelemetry_proto_metrics_v1_Summary { + pb_callback_t data_points; +} opentelemetry_proto_metrics_v1_Summary; + +/* Defines a Metric which has one or more timeseries. The following is a + brief summary of the Metric data model. For more details, see: + + https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/data-model.md + + + The data model and relation between entities is shown in the + diagram below. Here, "DataPoint" is the term used to refer to any + one of the specific data point value types, and "points" is the term used + to refer to any one of the lists of points contained in the Metric. + + - Metric is composed of a metadata and data. + - Metadata part contains a name, description, unit. + - Data is one of the possible types (Sum, Gauge, Histogram, Summary). + - DataPoint contains timestamps, attributes, and one of the possible value type + fields. + + Metric + +------------+ + |name | + |description | + |unit | +------------------------------------+ + |data |---> |Gauge, Sum, Histogram, Summary, ... | + +------------+ +------------------------------------+ + + Data [One of Gauge, Sum, Histogram, Summary, ...] + +-----------+ + |... | // Metadata about the Data. + |points |--+ + +-----------+ | + | +---------------------------+ + | |DataPoint 1 | + v |+------+------+ +------+ | + +-----+ ||label |label |...|label | | + | 1 |-->||value1|value2|...|valueN| | + +-----+ |+------+------+ +------+ | + | . | |+-----+ | + | . | ||value| | + | . | |+-----+ | + | . | +---------------------------+ + | . | . + | . | . + | . | . + | . | +---------------------------+ + | . | |DataPoint M | + +-----+ |+------+------+ +------+ | + | M |-->||label |label |...|label | | + +-----+ ||value1|value2|...|valueN| | + |+------+------+ +------+ | + |+-----+ | + ||value| | + |+-----+ | + +---------------------------+ + + Each distinct type of DataPoint represents the output of a specific + aggregation function, the result of applying the DataPoint's + associated function of to one or more measurements. + + All DataPoint types have three common fields: + - Attributes includes key-value pairs associated with the data point + - TimeUnixNano is required, set to the end time of the aggregation + - StartTimeUnixNano is optional, but strongly encouraged for DataPoints + having an AggregationTemporality field, as discussed below. + + Both TimeUnixNano and StartTimeUnixNano values are expressed as + UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. + + # TimeUnixNano + + This field is required, having consistent interpretation across + DataPoint types. TimeUnixNano is the moment corresponding to when + the data point's aggregate value was captured. + + Data points with the 0 value for TimeUnixNano SHOULD be rejected + by consumers. + + # StartTimeUnixNano + + StartTimeUnixNano in general allows detecting when a sequence of + observations is unbroken. This field indicates to consumers the + start time for points with cumulative and delta + AggregationTemporality, and it should be included whenever possible + to support correct rate calculation. Although it may be omitted + when the start time is truly unknown, setting StartTimeUnixNano is + strongly encouraged. 
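+
+ As a rough sketch of how this maps onto the generated struct below
+ (illustrative values only; the tag and init macro names follow the
+ generator pattern used elsewhere in these headers, and the data_points
+ callback must still be supplied by the application):
+
+   opentelemetry_proto_metrics_v1_Metric m =
+       opentelemetry_proto_metrics_v1_Metric_init_zero;
+   m.which_data = opentelemetry_proto_metrics_v1_Metric_sum_tag;
+   m.data.sum.aggregation_temporality =
+       opentelemetry_proto_metrics_v1_AggregationTemporality_AGGREGATION_TEMPORALITY_CUMULATIVE;
+   m.data.sum.is_monotonic = true;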
*/ +typedef struct _opentelemetry_proto_metrics_v1_Metric { + /* name of the metric, including its DNS name prefix. It must be unique. */ + pb_callback_t name; + /* description of the metric, which can be used in documentation. */ + pb_callback_t description; + /* unit in which the metric value is reported. Follows the format + described by http://unitsofmeasure.org/ucum.html. */ + pb_callback_t unit; + pb_callback_t cb_data; + pb_size_t which_data; + union { + opentelemetry_proto_metrics_v1_Gauge gauge; + opentelemetry_proto_metrics_v1_Sum sum; + opentelemetry_proto_metrics_v1_Histogram histogram; + opentelemetry_proto_metrics_v1_ExponentialHistogram exponential_histogram; + opentelemetry_proto_metrics_v1_Summary summary; + } data; +} opentelemetry_proto_metrics_v1_Metric; + +/* NumberDataPoint is a single data point in a timeseries that describes the + time-varying scalar value of a metric. */ +typedef struct _opentelemetry_proto_metrics_v1_NumberDataPoint { + /* StartTimeUnixNano is optional but strongly encouraged, see the + the detailed comments above Metric. + + Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + 1970. */ + uint64_t start_time_unix_nano; + /* TimeUnixNano is required, see the detailed comments above Metric. + + Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + 1970. */ + uint64_t time_unix_nano; + pb_size_t which_value; + union { + double as_double; + int64_t as_int; + } value; + /* (Optional) List of exemplars collected from + measurements that were used to form the data point */ + pb_callback_t exemplars; + /* The set of key/value pairs that uniquely identify the timeseries from + where this point belongs. The list may be empty (may contain 0 elements). + Attribute keys MUST be unique (it is not allowed to have more than one + attribute with the same key). */ + pb_callback_t attributes; + /* Flags that apply to this specific data point. See DataPointFlags + for the available flags and their meaning. */ + uint32_t flags; +} opentelemetry_proto_metrics_v1_NumberDataPoint; + +/* HistogramDataPoint is a single data point in a timeseries that describes the + time-varying values of a Histogram. A Histogram contains summary statistics + for a population of values, it may optionally contain the distribution of + those values across a set of buckets. + + If the histogram contains the distribution of values, then both + "explicit_bounds" and "bucket counts" fields must be defined. + If the histogram does not contain the distribution of values, then both + "explicit_bounds" and "bucket_counts" must be omitted and only "count" and + "sum" are known. */ +typedef struct _opentelemetry_proto_metrics_v1_HistogramDataPoint { + /* StartTimeUnixNano is optional but strongly encouraged, see the + the detailed comments above Metric. + + Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + 1970. */ + uint64_t start_time_unix_nano; + /* TimeUnixNano is required, see the detailed comments above Metric. + + Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + 1970. */ + uint64_t time_unix_nano; + /* count is the number of values in the population. Must be non-negative. This + value must be equal to the sum of the "count" fields in buckets if a + histogram is provided. */ + uint64_t count; + /* sum of the values in the population. If count is zero then this field + must be zero. 
+ + Note: Sum should only be filled out when measuring non-negative discrete + events, and is assumed to be monotonic over the values of these events. + Negative events *can* be recorded, but sum should not be filled out when + doing so. This is specifically to enforce compatibility w/ OpenMetrics, + see: https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#histogram */ + bool has_sum; + double sum; + /* bucket_counts is an optional field that contains the count values of the + histogram for each bucket. + + The sum of the bucket_counts must equal the value in the count field. + + The number of elements in the bucket_counts array must be one greater than + the number of elements in the explicit_bounds array. */ + pb_callback_t bucket_counts; + /* explicit_bounds specifies buckets with explicitly defined bounds for values. + + The boundaries for bucket at index i are: + + (-infinity, explicit_bounds[i]] for i == 0 + (explicit_bounds[i-1], explicit_bounds[i]] for 0 < i < size(explicit_bounds) + (explicit_bounds[i-1], +infinity) for i == size(explicit_bounds) + + The values in the explicit_bounds array must be strictly increasing. + + Histogram buckets are inclusive of their upper boundary, except the last + bucket where the boundary is at infinity. This format is intentionally + compatible with the OpenMetrics histogram definition. */ + pb_callback_t explicit_bounds; + /* (Optional) List of exemplars collected from + measurements that were used to form the data point */ + pb_callback_t exemplars; + /* The set of key/value pairs that uniquely identify the timeseries to + which this point belongs. The list may be empty (may contain 0 elements). + Attribute keys MUST be unique (it is not allowed to have more than one + attribute with the same key). */ + pb_callback_t attributes; + /* Flags that apply to this specific data point. See DataPointFlags + for the available flags and their meaning. */ + uint32_t flags; + /* min is the minimum value over (start_time, end_time]. */ + bool has_min; + double min; + /* max is the maximum value over (start_time, end_time]. */ + bool has_max; + double max; +} opentelemetry_proto_metrics_v1_HistogramDataPoint; + +/* Buckets are a set of bucket counts, encoded in a contiguous array + of counts. */ +typedef struct _opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets { + /* Offset is the bucket index of the first entry in the bucket_counts array. + + Note: This uses a varint encoding as a simple form of compression. */ + int32_t offset; + /* bucket_counts is an array of count values, where bucket_counts[i] carries + the count of the bucket at index (offset+i). bucket_counts[i] is the count + of values greater than base^(offset+i) and less than or equal to + base^(offset+i+1). + + Note: By contrast, the explicit HistogramDataPoint uses + fixed64. This field is expected to have many buckets, + especially zeros, so uint64 has been selected to ensure + varint encoding. */ + pb_callback_t bucket_counts; +} opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets; + +/* ExponentialHistogramDataPoint is a single data point in a timeseries that describes the + time-varying values of an ExponentialHistogram of double values. An ExponentialHistogram contains + summary statistics for a population of values; it may optionally contain the + distribution of those values across a set of buckets.
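The explicit_bounds rules quoted above translate directly into a linear lookup; a hedged sketch in plain C (an illustration of the stated rules, not part of this header):

/* Returns the explicit-histogram bucket index for value: buckets include
 * their upper boundary, and index n is the (explicit_bounds[n-1], +infinity)
 * overflow bucket. */
static size_t example_explicit_bucket_index(const double *explicit_bounds,
                                            size_t n, double value) {
        size_t i;
        for (i = 0; i < n; i++)
                if (value <= explicit_bounds[i])
                        return i;
        return n;
}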
*/ +typedef struct _opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint { + /* The set of key/value pairs that uniquely identify the timeseries to + which this point belongs. The list may be empty (may contain 0 elements). + Attribute keys MUST be unique (it is not allowed to have more than one + attribute with the same key). */ + pb_callback_t attributes; + /* StartTimeUnixNano is optional but strongly encouraged, see the + detailed comments above Metric. + + Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + 1970. */ + uint64_t start_time_unix_nano; + /* TimeUnixNano is required, see the detailed comments above Metric. + + Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + 1970. */ + uint64_t time_unix_nano; + /* count is the number of values in the population. Must be + non-negative. This value must be equal to the sum of the "bucket_counts" + values in the positive and negative Buckets plus the "zero_count" field. */ + uint64_t count; + /* sum of the values in the population. If count is zero then this field + must be zero. + + Note: Sum should only be filled out when measuring non-negative discrete + events, and is assumed to be monotonic over the values of these events. + Negative events *can* be recorded, but sum should not be filled out when + doing so. This is specifically to enforce compatibility w/ OpenMetrics, + see: https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#histogram */ + bool has_sum; + double sum; + /* scale describes the resolution of the histogram. Boundaries are + located at powers of the base, where: + + base = (2^(2^-scale)) + + The histogram bucket identified by `index`, a signed integer, + contains values that are greater than (base^index) and + less than or equal to (base^(index+1)). + + The positive and negative ranges of the histogram are expressed + separately. Negative values are mapped by their absolute value + into the negative range using the same scale as the positive range. + + scale is not restricted by the protocol, as the permissible + values depend on the range of the data. */ + int32_t scale; + /* zero_count is the count of values that are either exactly zero or + within the region considered zero by the instrumentation at the + tolerated degree of precision. This bucket stores values that + cannot be expressed using the standard exponential formula as + well as values that have been rounded to zero. + + Implementations MAY consider the zero bucket to have probability + mass equal to (zero_count / count). */ + uint64_t zero_count; + /* positive carries the positive range of exponential bucket counts. */ + bool has_positive; + opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets positive; + /* negative carries the negative range of exponential bucket counts. */ + bool has_negative; + opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets negative; + /* Flags that apply to this specific data point. See DataPointFlags + for the available flags and their meaning. */ + uint32_t flags; + /* (Optional) List of exemplars collected from + measurements that were used to form the data point */ + pb_callback_t exemplars; + /* min is the minimum value over (start_time, end_time]. */ + bool has_min; + double min; + /* max is the maximum value over (start_time, end_time]. */ + bool has_max; + double max; + /* ZeroThreshold may be optionally set to convey the width of the zero + region.
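The scale/base relation above implies a closed-form index mapping; a sketch using plain libm (an illustration of the documented formula, not librdkafka code):

#include <math.h>
/* With base = 2^(2^-scale), a positive value belongs to the bucket index
 * for which base^index < value <= base^(index+1), i.e.
 * index = ceil(log_base(value)) - 1 = ceil(log2(value) * 2^scale) - 1. */
static int32_t example_exp_bucket_index(double value, int32_t scale) {
        return (int32_t)ceil(log2(value) * scalbn(1.0, scale)) - 1;
}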
The zero region is defined as the closed interval + [-ZeroThreshold, ZeroThreshold]. + When ZeroThreshold is 0, the zero count bucket stores values that cannot be + expressed using the standard exponential formula as well as values that + have been rounded to zero. */ + double zero_threshold; +} opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint; + +/* SummaryDataPoint is a single data point in a timeseries that describes the + time-varying values of a Summary metric. */ +typedef struct _opentelemetry_proto_metrics_v1_SummaryDataPoint { + /* StartTimeUnixNano is optional but strongly encouraged, see the + detailed comments above Metric. + + Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + 1970. */ + uint64_t start_time_unix_nano; + /* TimeUnixNano is required, see the detailed comments above Metric. + + Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + 1970. */ + uint64_t time_unix_nano; + /* count is the number of values in the population. Must be non-negative. */ + uint64_t count; + /* sum of the values in the population. If count is zero then this field + must be zero. + + Note: Sum should only be filled out when measuring non-negative discrete + events, and is assumed to be monotonic over the values of these events. + Negative events *can* be recorded, but sum should not be filled out when + doing so. This is specifically to enforce compatibility w/ OpenMetrics, + see: https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#summary */ + double sum; + /* (Optional) list of values at different quantiles of the distribution calculated + from the current snapshot. The quantiles must be strictly increasing. */ + pb_callback_t quantile_values; + /* The set of key/value pairs that uniquely identify the timeseries to + which this point belongs. The list may be empty (may contain 0 elements). + Attribute keys MUST be unique (it is not allowed to have more than one + attribute with the same key). */ + pb_callback_t attributes; + /* Flags that apply to this specific data point. See DataPointFlags + for the available flags and their meaning. */ + uint32_t flags; +} opentelemetry_proto_metrics_v1_SummaryDataPoint; + +/* Represents the value at a given quantile of a distribution. + + To record Min and Max values, the following conventions are used: + - The 1.0 quantile is equivalent to the maximum value observed. + - The 0.0 quantile is equivalent to the minimum value observed. + + See the following issue for more context: + https://github.com/open-telemetry/opentelemetry-proto/issues/125 */ +typedef struct _opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile { + /* The quantile of a distribution. Must be in the interval + [0.0, 1.0]. */ + double quantile; + /* The value at the given quantile of a distribution. + + Quantile values must NOT be negative. */ + double value; +} opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile; + +/* A representation of an exemplar, which is a sample input measurement. + Exemplars also hold information about the environment when the measurement + was recorded, for example the span and trace ID of the active span when the + exemplar was recorded. */ +typedef struct _opentelemetry_proto_metrics_v1_Exemplar { + /* time_unix_nano is the exact time when this exemplar was recorded. + + Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + 1970.
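A small sketch of the min/max quantile convention just described (illustrative only):

static void example_record_min_max(
    opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile q[2],
    double observed_min, double observed_max) {
        q[0].quantile = 0.0; /* the 0.0 quantile carries the minimum */
        q[0].value    = observed_min;
        q[1].quantile = 1.0; /* the 1.0 quantile carries the maximum */
        q[1].value    = observed_max;
}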
*/ + uint64_t time_unix_nano; + pb_size_t which_value; + union { + double as_double; + int64_t as_int; + } value; + /* (Optional) Span ID of the exemplar trace. + span_id may be missing if the measurement is not recorded inside a trace + or if the trace is not sampled. */ + pb_callback_t span_id; + /* (Optional) Trace ID of the exemplar trace. + trace_id may be missing if the measurement is not recorded inside a trace + or if the trace is not sampled. */ + pb_callback_t trace_id; + /* The set of key/value pairs that were filtered out by the aggregator, but + recorded alongside the original measurement. Only key/value pairs that were + filtered out by the aggregator should be included */ + pb_callback_t filtered_attributes; +} opentelemetry_proto_metrics_v1_Exemplar; + + +#ifdef __cplusplus +extern "C" { +#endif + +/* Helper constants for enums */ +#define _opentelemetry_proto_metrics_v1_AggregationTemporality_MIN opentelemetry_proto_metrics_v1_AggregationTemporality_AGGREGATION_TEMPORALITY_UNSPECIFIED +#define _opentelemetry_proto_metrics_v1_AggregationTemporality_MAX opentelemetry_proto_metrics_v1_AggregationTemporality_AGGREGATION_TEMPORALITY_CUMULATIVE +#define _opentelemetry_proto_metrics_v1_AggregationTemporality_ARRAYSIZE ((opentelemetry_proto_metrics_v1_AggregationTemporality)(opentelemetry_proto_metrics_v1_AggregationTemporality_AGGREGATION_TEMPORALITY_CUMULATIVE+1)) + +#define _opentelemetry_proto_metrics_v1_DataPointFlags_MIN opentelemetry_proto_metrics_v1_DataPointFlags_DATA_POINT_FLAGS_DO_NOT_USE +#define _opentelemetry_proto_metrics_v1_DataPointFlags_MAX opentelemetry_proto_metrics_v1_DataPointFlags_DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK +#define _opentelemetry_proto_metrics_v1_DataPointFlags_ARRAYSIZE ((opentelemetry_proto_metrics_v1_DataPointFlags)(opentelemetry_proto_metrics_v1_DataPointFlags_DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK+1)) + + + + + + +#define opentelemetry_proto_metrics_v1_Sum_aggregation_temporality_ENUMTYPE opentelemetry_proto_metrics_v1_AggregationTemporality + +#define opentelemetry_proto_metrics_v1_Histogram_aggregation_temporality_ENUMTYPE opentelemetry_proto_metrics_v1_AggregationTemporality + +#define opentelemetry_proto_metrics_v1_ExponentialHistogram_aggregation_temporality_ENUMTYPE opentelemetry_proto_metrics_v1_AggregationTemporality + + + + + + + + + + +/* Initializer values for message structs */ +#define opentelemetry_proto_metrics_v1_MetricsData_init_default {{{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_ResourceMetrics_init_default {false, opentelemetry_proto_resource_v1_Resource_init_default, {{NULL}, NULL}, {{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_ScopeMetrics_init_default {false, opentelemetry_proto_common_v1_InstrumentationScope_init_default, {{NULL}, NULL}, {{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_Metric_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, 0, {opentelemetry_proto_metrics_v1_Gauge_init_default}} +#define opentelemetry_proto_metrics_v1_Gauge_init_default {{{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_Sum_init_default {{{NULL}, NULL}, _opentelemetry_proto_metrics_v1_AggregationTemporality_MIN, 0} +#define opentelemetry_proto_metrics_v1_Histogram_init_default {{{NULL}, NULL}, _opentelemetry_proto_metrics_v1_AggregationTemporality_MIN} +#define opentelemetry_proto_metrics_v1_ExponentialHistogram_init_default {{{NULL}, NULL}, _opentelemetry_proto_metrics_v1_AggregationTemporality_MIN} +#define opentelemetry_proto_metrics_v1_Summary_init_default 
{{{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_NumberDataPoint_init_default {0, 0, 0, {0}, {{NULL}, NULL}, {{NULL}, NULL}, 0} +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_init_default {0, 0, 0, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, 0, false, 0, false, 0} +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_init_default {{{NULL}, NULL}, 0, 0, 0, false, 0, 0, 0, false, opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_init_default, false, opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_init_default, 0, {{NULL}, NULL}, false, 0, false, 0, 0} +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_init_default {0, {{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_init_default {0, 0, 0, 0, {{NULL}, NULL}, {{NULL}, NULL}, 0} +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile_init_default {0, 0} +#define opentelemetry_proto_metrics_v1_Exemplar_init_default {0, 0, {0}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_MetricsData_init_zero {{{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_ResourceMetrics_init_zero {false, opentelemetry_proto_resource_v1_Resource_init_zero, {{NULL}, NULL}, {{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_ScopeMetrics_init_zero {false, opentelemetry_proto_common_v1_InstrumentationScope_init_zero, {{NULL}, NULL}, {{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_Metric_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, 0, {opentelemetry_proto_metrics_v1_Gauge_init_zero}} +#define opentelemetry_proto_metrics_v1_Gauge_init_zero {{{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_Sum_init_zero {{{NULL}, NULL}, _opentelemetry_proto_metrics_v1_AggregationTemporality_MIN, 0} +#define opentelemetry_proto_metrics_v1_Histogram_init_zero {{{NULL}, NULL}, _opentelemetry_proto_metrics_v1_AggregationTemporality_MIN} +#define opentelemetry_proto_metrics_v1_ExponentialHistogram_init_zero {{{NULL}, NULL}, _opentelemetry_proto_metrics_v1_AggregationTemporality_MIN} +#define opentelemetry_proto_metrics_v1_Summary_init_zero {{{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_NumberDataPoint_init_zero {0, 0, 0, {0}, {{NULL}, NULL}, {{NULL}, NULL}, 0} +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_init_zero {0, 0, 0, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, 0, false, 0, false, 0} +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_init_zero {{{NULL}, NULL}, 0, 0, 0, false, 0, 0, 0, false, opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_init_zero, false, opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_init_zero, 0, {{NULL}, NULL}, false, 0, false, 0, 0} +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_init_zero {0, {{NULL}, NULL}} +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_init_zero {0, 0, 0, 0, {{NULL}, NULL}, {{NULL}, NULL}, 0} +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile_init_zero {0, 0} +#define opentelemetry_proto_metrics_v1_Exemplar_init_zero {0, 0, {0}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} + +/* Field tags (for use in manual encoding/decoding) */ +#define opentelemetry_proto_metrics_v1_MetricsData_resource_metrics_tag 1 +#define opentelemetry_proto_metrics_v1_ResourceMetrics_resource_tag 1 +#define 
opentelemetry_proto_metrics_v1_ResourceMetrics_scope_metrics_tag 2 +#define opentelemetry_proto_metrics_v1_ResourceMetrics_schema_url_tag 3 +#define opentelemetry_proto_metrics_v1_ScopeMetrics_scope_tag 1 +#define opentelemetry_proto_metrics_v1_ScopeMetrics_metrics_tag 2 +#define opentelemetry_proto_metrics_v1_ScopeMetrics_schema_url_tag 3 +#define opentelemetry_proto_metrics_v1_Gauge_data_points_tag 1 +#define opentelemetry_proto_metrics_v1_Sum_data_points_tag 1 +#define opentelemetry_proto_metrics_v1_Sum_aggregation_temporality_tag 2 +#define opentelemetry_proto_metrics_v1_Sum_is_monotonic_tag 3 +#define opentelemetry_proto_metrics_v1_Histogram_data_points_tag 1 +#define opentelemetry_proto_metrics_v1_Histogram_aggregation_temporality_tag 2 +#define opentelemetry_proto_metrics_v1_ExponentialHistogram_data_points_tag 1 +#define opentelemetry_proto_metrics_v1_ExponentialHistogram_aggregation_temporality_tag 2 +#define opentelemetry_proto_metrics_v1_Summary_data_points_tag 1 +#define opentelemetry_proto_metrics_v1_Metric_name_tag 1 +#define opentelemetry_proto_metrics_v1_Metric_description_tag 2 +#define opentelemetry_proto_metrics_v1_Metric_unit_tag 3 +#define opentelemetry_proto_metrics_v1_Metric_gauge_tag 5 +#define opentelemetry_proto_metrics_v1_Metric_sum_tag 7 +#define opentelemetry_proto_metrics_v1_Metric_histogram_tag 9 +#define opentelemetry_proto_metrics_v1_Metric_exponential_histogram_tag 10 +#define opentelemetry_proto_metrics_v1_Metric_summary_tag 11 +#define opentelemetry_proto_metrics_v1_NumberDataPoint_start_time_unix_nano_tag 2 +#define opentelemetry_proto_metrics_v1_NumberDataPoint_time_unix_nano_tag 3 +#define opentelemetry_proto_metrics_v1_NumberDataPoint_as_double_tag 4 +#define opentelemetry_proto_metrics_v1_NumberDataPoint_as_int_tag 6 +#define opentelemetry_proto_metrics_v1_NumberDataPoint_exemplars_tag 5 +#define opentelemetry_proto_metrics_v1_NumberDataPoint_attributes_tag 7 +#define opentelemetry_proto_metrics_v1_NumberDataPoint_flags_tag 8 +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_start_time_unix_nano_tag 2 +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_time_unix_nano_tag 3 +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_count_tag 4 +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_sum_tag 5 +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_bucket_counts_tag 6 +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_explicit_bounds_tag 7 +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_exemplars_tag 8 +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_attributes_tag 9 +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_flags_tag 10 +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_min_tag 11 +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_max_tag 12 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_offset_tag 1 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_bucket_counts_tag 2 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_attributes_tag 1 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_start_time_unix_nano_tag 2 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_time_unix_nano_tag 3 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_count_tag 4 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_sum_tag 5 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_scale_tag 6 +#define 
opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_zero_count_tag 7 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_positive_tag 8 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_negative_tag 9 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_flags_tag 10 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_exemplars_tag 11 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_min_tag 12 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_max_tag 13 +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_zero_threshold_tag 14 +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_start_time_unix_nano_tag 2 +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_time_unix_nano_tag 3 +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_count_tag 4 +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_sum_tag 5 +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_quantile_values_tag 6 +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_attributes_tag 7 +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_flags_tag 8 +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile_quantile_tag 1 +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile_value_tag 2 +#define opentelemetry_proto_metrics_v1_Exemplar_time_unix_nano_tag 2 +#define opentelemetry_proto_metrics_v1_Exemplar_as_double_tag 3 +#define opentelemetry_proto_metrics_v1_Exemplar_as_int_tag 6 +#define opentelemetry_proto_metrics_v1_Exemplar_span_id_tag 4 +#define opentelemetry_proto_metrics_v1_Exemplar_trace_id_tag 5 +#define opentelemetry_proto_metrics_v1_Exemplar_filtered_attributes_tag 7 + +/* Struct field encoding specification for nanopb */ +#define opentelemetry_proto_metrics_v1_MetricsData_FIELDLIST(X, a) \ +X(a, CALLBACK, REPEATED, MESSAGE, resource_metrics, 1) +#define opentelemetry_proto_metrics_v1_MetricsData_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_MetricsData_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_MetricsData_resource_metrics_MSGTYPE opentelemetry_proto_metrics_v1_ResourceMetrics + +#define opentelemetry_proto_metrics_v1_ResourceMetrics_FIELDLIST(X, a) \ +X(a, STATIC, OPTIONAL, MESSAGE, resource, 1) \ +X(a, CALLBACK, REPEATED, MESSAGE, scope_metrics, 2) \ +X(a, CALLBACK, SINGULAR, STRING, schema_url, 3) +#define opentelemetry_proto_metrics_v1_ResourceMetrics_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_ResourceMetrics_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_ResourceMetrics_resource_MSGTYPE opentelemetry_proto_resource_v1_Resource +#define opentelemetry_proto_metrics_v1_ResourceMetrics_scope_metrics_MSGTYPE opentelemetry_proto_metrics_v1_ScopeMetrics + +#define opentelemetry_proto_metrics_v1_ScopeMetrics_FIELDLIST(X, a) \ +X(a, STATIC, OPTIONAL, MESSAGE, scope, 1) \ +X(a, CALLBACK, REPEATED, MESSAGE, metrics, 2) \ +X(a, CALLBACK, SINGULAR, STRING, schema_url, 3) +#define opentelemetry_proto_metrics_v1_ScopeMetrics_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_ScopeMetrics_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_ScopeMetrics_scope_MSGTYPE opentelemetry_proto_common_v1_InstrumentationScope +#define opentelemetry_proto_metrics_v1_ScopeMetrics_metrics_MSGTYPE opentelemetry_proto_metrics_v1_Metric + +#define opentelemetry_proto_metrics_v1_Metric_FIELDLIST(X, a) \ +X(a, CALLBACK, SINGULAR, STRING, name, 1) \ 
+X(a, CALLBACK, SINGULAR, STRING, description, 2) \ +X(a, CALLBACK, SINGULAR, STRING, unit, 3) \ +X(a, STATIC, ONEOF, MSG_W_CB, (data,gauge,data.gauge), 5) \ +X(a, STATIC, ONEOF, MSG_W_CB, (data,sum,data.sum), 7) \ +X(a, STATIC, ONEOF, MSG_W_CB, (data,histogram,data.histogram), 9) \ +X(a, STATIC, ONEOF, MSG_W_CB, (data,exponential_histogram,data.exponential_histogram), 10) \ +X(a, STATIC, ONEOF, MSG_W_CB, (data,summary,data.summary), 11) +#define opentelemetry_proto_metrics_v1_Metric_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_Metric_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_Metric_data_gauge_MSGTYPE opentelemetry_proto_metrics_v1_Gauge +#define opentelemetry_proto_metrics_v1_Metric_data_sum_MSGTYPE opentelemetry_proto_metrics_v1_Sum +#define opentelemetry_proto_metrics_v1_Metric_data_histogram_MSGTYPE opentelemetry_proto_metrics_v1_Histogram +#define opentelemetry_proto_metrics_v1_Metric_data_exponential_histogram_MSGTYPE opentelemetry_proto_metrics_v1_ExponentialHistogram +#define opentelemetry_proto_metrics_v1_Metric_data_summary_MSGTYPE opentelemetry_proto_metrics_v1_Summary + +#define opentelemetry_proto_metrics_v1_Gauge_FIELDLIST(X, a) \ +X(a, CALLBACK, REPEATED, MESSAGE, data_points, 1) +#define opentelemetry_proto_metrics_v1_Gauge_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_Gauge_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_Gauge_data_points_MSGTYPE opentelemetry_proto_metrics_v1_NumberDataPoint + +#define opentelemetry_proto_metrics_v1_Sum_FIELDLIST(X, a) \ +X(a, CALLBACK, REPEATED, MESSAGE, data_points, 1) \ +X(a, STATIC, SINGULAR, UENUM, aggregation_temporality, 2) \ +X(a, STATIC, SINGULAR, BOOL, is_monotonic, 3) +#define opentelemetry_proto_metrics_v1_Sum_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_Sum_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_Sum_data_points_MSGTYPE opentelemetry_proto_metrics_v1_NumberDataPoint + +#define opentelemetry_proto_metrics_v1_Histogram_FIELDLIST(X, a) \ +X(a, CALLBACK, REPEATED, MESSAGE, data_points, 1) \ +X(a, STATIC, SINGULAR, UENUM, aggregation_temporality, 2) +#define opentelemetry_proto_metrics_v1_Histogram_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_Histogram_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_Histogram_data_points_MSGTYPE opentelemetry_proto_metrics_v1_HistogramDataPoint + +#define opentelemetry_proto_metrics_v1_ExponentialHistogram_FIELDLIST(X, a) \ +X(a, CALLBACK, REPEATED, MESSAGE, data_points, 1) \ +X(a, STATIC, SINGULAR, UENUM, aggregation_temporality, 2) +#define opentelemetry_proto_metrics_v1_ExponentialHistogram_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_ExponentialHistogram_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_ExponentialHistogram_data_points_MSGTYPE opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint + +#define opentelemetry_proto_metrics_v1_Summary_FIELDLIST(X, a) \ +X(a, CALLBACK, REPEATED, MESSAGE, data_points, 1) +#define opentelemetry_proto_metrics_v1_Summary_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_Summary_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_Summary_data_points_MSGTYPE opentelemetry_proto_metrics_v1_SummaryDataPoint + +#define opentelemetry_proto_metrics_v1_NumberDataPoint_FIELDLIST(X, a) \ +X(a, STATIC, SINGULAR, FIXED64, start_time_unix_nano, 2) \ +X(a, STATIC, SINGULAR, FIXED64, time_unix_nano, 3) \ +X(a, STATIC, ONEOF, DOUBLE, 
(value,as_double,value.as_double), 4) \ +X(a, CALLBACK, REPEATED, MESSAGE, exemplars, 5) \ +X(a, STATIC, ONEOF, SFIXED64, (value,as_int,value.as_int), 6) \ +X(a, CALLBACK, REPEATED, MESSAGE, attributes, 7) \ +X(a, STATIC, SINGULAR, UINT32, flags, 8) +#define opentelemetry_proto_metrics_v1_NumberDataPoint_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_NumberDataPoint_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_NumberDataPoint_exemplars_MSGTYPE opentelemetry_proto_metrics_v1_Exemplar +#define opentelemetry_proto_metrics_v1_NumberDataPoint_attributes_MSGTYPE opentelemetry_proto_common_v1_KeyValue + +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_FIELDLIST(X, a) \ +X(a, STATIC, SINGULAR, FIXED64, start_time_unix_nano, 2) \ +X(a, STATIC, SINGULAR, FIXED64, time_unix_nano, 3) \ +X(a, STATIC, SINGULAR, FIXED64, count, 4) \ +X(a, STATIC, OPTIONAL, DOUBLE, sum, 5) \ +X(a, CALLBACK, REPEATED, FIXED64, bucket_counts, 6) \ +X(a, CALLBACK, REPEATED, DOUBLE, explicit_bounds, 7) \ +X(a, CALLBACK, REPEATED, MESSAGE, exemplars, 8) \ +X(a, CALLBACK, REPEATED, MESSAGE, attributes, 9) \ +X(a, STATIC, SINGULAR, UINT32, flags, 10) \ +X(a, STATIC, OPTIONAL, DOUBLE, min, 11) \ +X(a, STATIC, OPTIONAL, DOUBLE, max, 12) +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_exemplars_MSGTYPE opentelemetry_proto_metrics_v1_Exemplar +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_attributes_MSGTYPE opentelemetry_proto_common_v1_KeyValue + +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_FIELDLIST(X, a) \ +X(a, CALLBACK, REPEATED, MESSAGE, attributes, 1) \ +X(a, STATIC, SINGULAR, FIXED64, start_time_unix_nano, 2) \ +X(a, STATIC, SINGULAR, FIXED64, time_unix_nano, 3) \ +X(a, STATIC, SINGULAR, FIXED64, count, 4) \ +X(a, STATIC, OPTIONAL, DOUBLE, sum, 5) \ +X(a, STATIC, SINGULAR, SINT32, scale, 6) \ +X(a, STATIC, SINGULAR, FIXED64, zero_count, 7) \ +X(a, STATIC, OPTIONAL, MESSAGE, positive, 8) \ +X(a, STATIC, OPTIONAL, MESSAGE, negative, 9) \ +X(a, STATIC, SINGULAR, UINT32, flags, 10) \ +X(a, CALLBACK, REPEATED, MESSAGE, exemplars, 11) \ +X(a, STATIC, OPTIONAL, DOUBLE, min, 12) \ +X(a, STATIC, OPTIONAL, DOUBLE, max, 13) \ +X(a, STATIC, SINGULAR, DOUBLE, zero_threshold, 14) +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_attributes_MSGTYPE opentelemetry_proto_common_v1_KeyValue +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_positive_MSGTYPE opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_negative_MSGTYPE opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_exemplars_MSGTYPE opentelemetry_proto_metrics_v1_Exemplar + +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_FIELDLIST(X, a) \ +X(a, STATIC, SINGULAR, SINT32, offset, 1) \ +X(a, CALLBACK, REPEATED, UINT64, bucket_counts, 2) +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_CALLBACK pb_default_field_callback +#define 
opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_DEFAULT NULL + +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_FIELDLIST(X, a) \ +X(a, STATIC, SINGULAR, FIXED64, start_time_unix_nano, 2) \ +X(a, STATIC, SINGULAR, FIXED64, time_unix_nano, 3) \ +X(a, STATIC, SINGULAR, FIXED64, count, 4) \ +X(a, STATIC, SINGULAR, DOUBLE, sum, 5) \ +X(a, CALLBACK, REPEATED, MESSAGE, quantile_values, 6) \ +X(a, CALLBACK, REPEATED, MESSAGE, attributes, 7) \ +X(a, STATIC, SINGULAR, UINT32, flags, 8) +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_quantile_values_MSGTYPE opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_attributes_MSGTYPE opentelemetry_proto_common_v1_KeyValue + +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile_FIELDLIST(X, a) \ +X(a, STATIC, SINGULAR, DOUBLE, quantile, 1) \ +X(a, STATIC, SINGULAR, DOUBLE, value, 2) +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile_CALLBACK NULL +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile_DEFAULT NULL + +#define opentelemetry_proto_metrics_v1_Exemplar_FIELDLIST(X, a) \ +X(a, STATIC, SINGULAR, FIXED64, time_unix_nano, 2) \ +X(a, STATIC, ONEOF, DOUBLE, (value,as_double,value.as_double), 3) \ +X(a, CALLBACK, SINGULAR, BYTES, span_id, 4) \ +X(a, CALLBACK, SINGULAR, BYTES, trace_id, 5) \ +X(a, STATIC, ONEOF, SFIXED64, (value,as_int,value.as_int), 6) \ +X(a, CALLBACK, REPEATED, MESSAGE, filtered_attributes, 7) +#define opentelemetry_proto_metrics_v1_Exemplar_CALLBACK pb_default_field_callback +#define opentelemetry_proto_metrics_v1_Exemplar_DEFAULT NULL +#define opentelemetry_proto_metrics_v1_Exemplar_filtered_attributes_MSGTYPE opentelemetry_proto_common_v1_KeyValue + +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_MetricsData_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_ResourceMetrics_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_ScopeMetrics_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_Metric_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_Gauge_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_Sum_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_Histogram_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_ExponentialHistogram_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_Summary_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_NumberDataPoint_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_HistogramDataPoint_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_SummaryDataPoint_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile_msg; +extern const pb_msgdesc_t opentelemetry_proto_metrics_v1_Exemplar_msg; + +/* Defines for backwards compatibility with code written before nanopb-0.4.0 */ +#define opentelemetry_proto_metrics_v1_MetricsData_fields &opentelemetry_proto_metrics_v1_MetricsData_msg +#define opentelemetry_proto_metrics_v1_ResourceMetrics_fields &opentelemetry_proto_metrics_v1_ResourceMetrics_msg +#define 
opentelemetry_proto_metrics_v1_ScopeMetrics_fields &opentelemetry_proto_metrics_v1_ScopeMetrics_msg +#define opentelemetry_proto_metrics_v1_Metric_fields &opentelemetry_proto_metrics_v1_Metric_msg +#define opentelemetry_proto_metrics_v1_Gauge_fields &opentelemetry_proto_metrics_v1_Gauge_msg +#define opentelemetry_proto_metrics_v1_Sum_fields &opentelemetry_proto_metrics_v1_Sum_msg +#define opentelemetry_proto_metrics_v1_Histogram_fields &opentelemetry_proto_metrics_v1_Histogram_msg +#define opentelemetry_proto_metrics_v1_ExponentialHistogram_fields &opentelemetry_proto_metrics_v1_ExponentialHistogram_msg +#define opentelemetry_proto_metrics_v1_Summary_fields &opentelemetry_proto_metrics_v1_Summary_msg +#define opentelemetry_proto_metrics_v1_NumberDataPoint_fields &opentelemetry_proto_metrics_v1_NumberDataPoint_msg +#define opentelemetry_proto_metrics_v1_HistogramDataPoint_fields &opentelemetry_proto_metrics_v1_HistogramDataPoint_msg +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_fields &opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_msg +#define opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_fields &opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_msg +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_fields &opentelemetry_proto_metrics_v1_SummaryDataPoint_msg +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile_fields &opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile_msg +#define opentelemetry_proto_metrics_v1_Exemplar_fields &opentelemetry_proto_metrics_v1_Exemplar_msg + +/* Maximum encoded size of messages (where known) */ +/* opentelemetry_proto_metrics_v1_MetricsData_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_ResourceMetrics_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_ScopeMetrics_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_Metric_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_Gauge_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_Sum_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_Histogram_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_ExponentialHistogram_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_Summary_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_NumberDataPoint_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_HistogramDataPoint_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_ExponentialHistogramDataPoint_Buckets_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_SummaryDataPoint_size depends on runtime parameters */ +/* opentelemetry_proto_metrics_v1_Exemplar_size depends on runtime parameters */ +#define opentelemetry_proto_metrics_v1_SummaryDataPoint_ValueAtQuantile_size 18 + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/src/third_party/librdkafka/dist/src/opentelemetry/resource.pb.c b/src/third_party/librdkafka/dist/src/opentelemetry/resource.pb.c new file mode 100644 index 00000000000..39cc42767bb --- /dev/null +++ b/src/third_party/librdkafka/dist/src/opentelemetry/resource.pb.c @@ -0,0 +1,12 @@ +/* Automatically generated nanopb constant definitions */ +/* Generated by nanopb-0.4.8-dev */ + +#include 
"opentelemetry/resource.pb.h" +#if PB_PROTO_HEADER_VERSION != 40 +#error Regenerate this file with the current version of nanopb generator. +#endif + +PB_BIND(opentelemetry_proto_resource_v1_Resource, opentelemetry_proto_resource_v1_Resource, AUTO) + + + diff --git a/src/third_party/librdkafka/dist/src/opentelemetry/resource.pb.h b/src/third_party/librdkafka/dist/src/opentelemetry/resource.pb.h new file mode 100644 index 00000000000..232c0b0244c --- /dev/null +++ b/src/third_party/librdkafka/dist/src/opentelemetry/resource.pb.h @@ -0,0 +1,58 @@ +/* Automatically generated nanopb header */ +/* Generated by nanopb-0.4.8-dev */ + +#ifndef PB_OPENTELEMETRY_PROTO_RESOURCE_V1_OPENTELEMETRY_PROTO_RESOURCE_V1_RESOURCE_PB_H_INCLUDED +#define PB_OPENTELEMETRY_PROTO_RESOURCE_V1_OPENTELEMETRY_PROTO_RESOURCE_V1_RESOURCE_PB_H_INCLUDED +#include +#include "opentelemetry/common.pb.h" + +#if PB_PROTO_HEADER_VERSION != 40 +#error Regenerate this file with the current version of nanopb generator. +#endif + +/* Struct definitions */ +/* Resource information. */ +typedef struct _opentelemetry_proto_resource_v1_Resource { + /* Set of attributes that describe the resource. + Attribute keys MUST be unique (it is not allowed to have more than one + attribute with the same key). */ + pb_callback_t attributes; + /* dropped_attributes_count is the number of dropped attributes. If the value is 0, then + no attributes were dropped. */ + uint32_t dropped_attributes_count; +} opentelemetry_proto_resource_v1_Resource; + + +#ifdef __cplusplus +extern "C" { +#endif + +/* Initializer values for message structs */ +#define opentelemetry_proto_resource_v1_Resource_init_default {{{NULL}, NULL}, 0} +#define opentelemetry_proto_resource_v1_Resource_init_zero {{{NULL}, NULL}, 0} + +/* Field tags (for use in manual encoding/decoding) */ +#define opentelemetry_proto_resource_v1_Resource_attributes_tag 1 +#define opentelemetry_proto_resource_v1_Resource_dropped_attributes_count_tag 2 + +/* Struct field encoding specification for nanopb */ +#define opentelemetry_proto_resource_v1_Resource_FIELDLIST(X, a) \ +X(a, CALLBACK, REPEATED, MESSAGE, attributes, 1) \ +X(a, STATIC, SINGULAR, UINT32, dropped_attributes_count, 2) +#define opentelemetry_proto_resource_v1_Resource_CALLBACK pb_default_field_callback +#define opentelemetry_proto_resource_v1_Resource_DEFAULT NULL +#define opentelemetry_proto_resource_v1_Resource_attributes_MSGTYPE opentelemetry_proto_common_v1_KeyValue + +extern const pb_msgdesc_t opentelemetry_proto_resource_v1_Resource_msg; + +/* Defines for backwards compatibility with code written before nanopb-0.4.0 */ +#define opentelemetry_proto_resource_v1_Resource_fields &opentelemetry_proto_resource_v1_Resource_msg + +/* Maximum encoded size of messages (where known) */ +/* opentelemetry_proto_resource_v1_Resource_size depends on runtime parameters */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/src/third_party/librdkafka/dist/src/rd.h b/src/third_party/librdkafka/dist/src/rd.h index 670605de441..300a7b030ca 100644 --- a/src/third_party/librdkafka/dist/src/rd.h +++ b/src/third_party/librdkafka/dist/src/rd.h @@ -1,7 +1,8 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -52,6 +53,7 @@ #include #include #include +#include #include "tinycthread.h" #include "rdsysqueue.h" @@ -219,7 +221,7 @@ static RD_INLINE RD_UNUSED char *rd_strndup(const char *s, size_t len) { /* Round/align X upwards to STRIDE, which must be power of 2. */ -#define RD_ROUNDUP(X, STRIDE) (((X) + ((STRIDE)-1)) & ~(STRIDE - 1)) +#define RD_ROUNDUP(X, STRIDE) (((X) + ((STRIDE) - 1)) & ~(STRIDE - 1)) #define RD_ARRAY_SIZE(A) (sizeof((A)) / sizeof(*(A))) #define RD_ARRAYSIZE(A) RD_ARRAY_SIZE(A) @@ -424,6 +426,10 @@ static RD_INLINE RD_UNUSED int rd_refcnt_get(rd_refcnt_t *R) { } while (0) +#define RD_INTERFACE_CALL(i, name, ...) (i->name(i->opaque, __VA_ARGS__)) + +#define RD_CEIL_INTEGER_DIVISION(X, DEN) (((X) + ((DEN) - 1)) / (DEN)) + /** * @brief Utility types to hold memory,size tuple. */ @@ -433,4 +439,140 @@ typedef struct rd_chariov_s { size_t size; } rd_chariov_t; +/** + * @brief Read the file at \p file_path in binary mode and return its contents. + * The returned buffer is NULL-terminated; + * the size parameter will contain the actual file size. + * + * @param file_path Path to the file to read. + * @param size Pointer to store the file size (optional). + * @param max_size Maximum file size to read (0 for no limit) (optional). + * + * @returns Newly allocated buffer containing the file contents. + * NULL on error (file not found, too large, etc). + * + * @remark The returned pointer ownership is transferred to the caller. + * + * @locality Any thread + */ +static RD_INLINE RD_UNUSED char * +rd_file_read(const char *file_path, size_t *size, size_t max_size) { + FILE *file; + char *buf = NULL; + size_t file_size; + size_t read_size; + if (!size) + size = &read_size; + +#ifndef _WIN32 + file = fopen(file_path, "rb"); +#else + file = NULL; + errno = fopen_s(&file, file_path, "rb"); +#endif + if (!file) + return NULL; + + if (fseek(file, 0, SEEK_END) != 0) + goto err; + + file_size = (size_t)ftell(file); + if (file_size == (size_t)-1) /* ftell() returned -1L */ + goto err; + + if (fseek(file, 0, SEEK_SET) != 0) + goto err; + + /* Check if file is too large */ + if (max_size > 0 && file_size > max_size) + goto err; + + /* Allocate buffer with extra byte for NULL terminator */ + buf = (char *)rd_malloc(file_size + 1); + read_size = fread(buf, 1, file_size, file); + + if (read_size != file_size) + goto err; + + /* NULL terminate the buffer */ + buf[file_size] = '\0'; + *size = file_size; + fclose(file); + return buf; +err: + fclose(file); + if (buf) + rd_free(buf); + return NULL; +} + +static RD_INLINE RD_UNUSED FILE * +rd_file_mkstemp(const char *prefix, + const char *mode, + char *tempfile_path_out, + size_t tempfile_path_out_size) { + FILE *tempfile; + +#ifdef _WIN32 + char tempfolder_path[MAX_PATH]; + char tempfile_path[MAX_PATH]; + if (!GetTempPathA(MAX_PATH, tempfolder_path)) + return NULL; /* Failed to get temp folder path */ + + + if (!GetTempFileNameA(tempfolder_path, "TMP", 1, tempfile_path)) + return NULL; /* Failed to create temp file name */ + + tempfile = fopen(tempfile_path, mode); +#else + int tempfile_fd; + char tempfile_path[512]; + rd_snprintf(tempfile_path, sizeof(tempfile_path), "/tmp/%sXXXXXX", + prefix); + tempfile_fd = mkstemp(tempfile_path); + if (tempfile_fd < 0) + return NULL; + + tempfile = fdopen(tempfile_fd, mode); +#endif + + if (!tempfile) + return NULL; + + if (tempfile_path_out) + rd_snprintf(tempfile_path_out, tempfile_path_out_size, "%s", + tempfile_path); + return tempfile; +} + +/** + * @brief Retrieve stat for a \p
path . + * + * @param path Path to the file or directory. + * @param is_dir Pointer to store if the \p path is a directory (optional). + * + * @return `rd_true` if the path exists. + */ +static RD_INLINE RD_UNUSED rd_bool_t rd_file_stat(const char *path, + rd_bool_t *is_dir) { +#ifdef _WIN32 + struct _stat st; + if (_stat(path, &st) == 0) { + if (is_dir) + *is_dir = st.st_mode & S_IFDIR; + return rd_true; + } +#else + struct stat st; + if (stat(path, &st) == 0) { + if (is_dir) + *is_dir = S_ISDIR(st.st_mode); + return rd_true; + } +#endif + if (is_dir) + *is_dir = rd_false; + return rd_false; +} + #endif /* _RD_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdaddr.c b/src/third_party/librdkafka/dist/src/rdaddr.c index 092406233b2..6fb2c66ca56 100644 --- a/src/third_party/librdkafka/dist/src/rdaddr.c +++ b/src/third_party/librdkafka/dist/src/rdaddr.c @@ -1,7 +1,7 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdaddr.h b/src/third_party/librdkafka/dist/src/rdaddr.h index c8574d01941..09105fdf148 100644 --- a/src/third_party/librdkafka/dist/src/rdaddr.h +++ b/src/third_party/librdkafka/dist/src/rdaddr.h @@ -1,7 +1,7 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -54,14 +54,13 @@ typedef union { #define sinx_family in.sin_family #define sinx_addr in.sin_addr #define RD_SOCKADDR_INX_LEN(sinx) \ - ((sinx)->sinx_family == AF_INET \ - ? sizeof(struct sockaddr_in) \ - : (sinx)->sinx_family == AF_INET6 ? sizeof(struct sockaddr_in6) \ - : sizeof(rd_sockaddr_inx_t)) + ((sinx)->sinx_family == AF_INET ? sizeof(struct sockaddr_in) \ + : (sinx)->sinx_family == AF_INET6 ? sizeof(struct sockaddr_in6) \ + : sizeof(rd_sockaddr_inx_t)) #define RD_SOCKADDR_INX_PORT(sinx) \ - ((sinx)->sinx_family == AF_INET \ - ? (sinx)->in.sin_port \ - : (sinx)->sinx_family == AF_INET6 ? (sinx)->in6.sin6_port : 0) + ((sinx)->sinx_family == AF_INET ? (sinx)->in.sin_port \ + : (sinx)->sinx_family == AF_INET6 ? (sinx)->in6.sin6_port \ + : 0) #define RD_SOCKADDR_INX_PORT_SET(sinx, port) \ do { \ @@ -139,7 +138,7 @@ rd_sockaddr_list_next(rd_sockaddr_list_t *rsal) { #define RD_SOCKADDR_LIST_FOREACH(sinx, rsal) \ for ((sinx) = &(rsal)->rsal_addr[0]; \ - (sinx) < &(rsal)->rsal_addr[(rsal)->rsal_len]; (sinx)++) + (sinx) < &(rsal)->rsal_addr[(rsal)->rsal_cnt]; (sinx)++) /** * Wrapper for getaddrinfo(3) that performs these additional tasks: diff --git a/src/third_party/librdkafka/dist/src/rdatomic.h b/src/third_party/librdkafka/dist/src/rdatomic.h index aa7d3d7705c..4b97dd7d087 100644 --- a/src/third_party/librdkafka/dist/src/rdatomic.h +++ b/src/third_party/librdkafka/dist/src/rdatomic.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2014-2016 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill * All rights reserved. 
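A short usage sketch combining the file helpers added above (illustrative; the 1 MiB cap is an arbitrary example value):

static char *example_read_regular_file(const char *path, size_t *size) {
        rd_bool_t is_dir;
        if (!rd_file_stat(path, &is_dir) || is_dir)
                return NULL; /* missing, or a directory */
        /* rd_file_read() NUL-terminates the buffer; ownership passes to
         * the caller, who must rd_free() it. */
        return rd_file_read(path, size, 1024 * 1024);
}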
* * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdavg.h b/src/third_party/librdkafka/dist/src/rdavg.h index a170e8da537..c187aa91fc8 100644 --- a/src/third_party/librdkafka/dist/src/rdavg.h +++ b/src/third_party/librdkafka/dist/src/rdavg.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -45,8 +45,9 @@ typedef struct rd_avg_s { } ra_v; mtx_t ra_lock; int ra_enabled; - enum { RD_AVG_GAUGE, - RD_AVG_COUNTER, + enum { + RD_AVG_GAUGE, + RD_AVG_COUNTER, } ra_type; #if WITH_HDRHISTOGRAM rd_hdr_histogram_t *ra_hdr; diff --git a/src/third_party/librdkafka/dist/src/rdavl.c b/src/third_party/librdkafka/dist/src/rdavl.c index f25251de8e3..0bb41180966 100644 --- a/src/third_party/librdkafka/dist/src/rdavl.c +++ b/src/third_party/librdkafka/dist/src/rdavl.c @@ -1,7 +1,7 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012-2016, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdavl.h b/src/third_party/librdkafka/dist/src/rdavl.h index f3e539242b9..dc6fe2e2c9b 100644 --- a/src/third_party/librdkafka/dist/src/rdavl.h +++ b/src/third_party/librdkafka/dist/src/rdavl.h @@ -1,7 +1,7 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012-2016, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdbase64.c b/src/third_party/librdkafka/dist/src/rdbase64.c new file mode 100644 index 00000000000..7d87650903f --- /dev/null +++ b/src/third_party/librdkafka/dist/src/rdbase64.c @@ -0,0 +1,200 @@ +/* + * librdkafka - The Apache Kafka C/C++ library + * + * Copyright (c) 2023 Confluent Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "rdbase64.h" + +#if WITH_SSL +#include <openssl/evp.h> +#else + +#define conv_bin2ascii(a, table) ((table)[(a) & 0x3f]) + +static const unsigned char data_bin2ascii[65] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +static int base64_encoding_conversion(unsigned char *out, + const unsigned char *in, + int dlen) { + int i, ret = 0; + unsigned long l; + + for (i = dlen; i > 0; i -= 3) { + if (i >= 3) { + l = (((unsigned long)in[0]) << 16L) | + (((unsigned long)in[1]) << 8L) | in[2]; + *(out++) = conv_bin2ascii(l >> 18L, data_bin2ascii); + *(out++) = conv_bin2ascii(l >> 12L, data_bin2ascii); + *(out++) = conv_bin2ascii(l >> 6L, data_bin2ascii); + *(out++) = conv_bin2ascii(l, data_bin2ascii); + } else { + l = ((unsigned long)in[0]) << 16L; + if (i == 2) + l |= ((unsigned long)in[1] << 8L); + + *(out++) = conv_bin2ascii(l >> 18L, data_bin2ascii); + *(out++) = conv_bin2ascii(l >> 12L, data_bin2ascii); + *(out++) = + (i == 1) ? '=' + : conv_bin2ascii(l >> 6L, data_bin2ascii); + *(out++) = '='; + } + ret += 4; + in += 3; + } + + *out = '\0'; + return ret; +} + +#endif + +/** + * @brief Base64 encode binary input \p in, and write the base64-encoded + * string and its size to \p out. out->ptr will be NULL in case of some + * issue with the conversion, or if the conversion is not supported. + * + * @remark out->ptr must be freed after use. + */ +void rd_base64_encode(const rd_chariov_t *in, rd_chariov_t *out) { + + size_t max_len; + + /* OpenSSL takes an |int| argument so the input cannot exceed that. */ + if (in->size > INT_MAX) { + out->ptr = NULL; + return; + } + + max_len = (((in->size + 2) / 3) * 4) + 1; + out->ptr = rd_malloc(max_len); + +#if WITH_SSL + out->size = EVP_EncodeBlock((unsigned char *)out->ptr, + (unsigned char *)in->ptr, (int)in->size); +#else + out->size = base64_encoding_conversion( + (unsigned char *)out->ptr, (unsigned char *)in->ptr, (int)in->size); +#endif + + rd_assert(out->size < max_len); + out->ptr[out->size] = 0; +} + + +/** + * @brief Base64 encode binary input \p in. + * @returns a newly allocated, base64-encoded string or NULL in case of some + * issue with the conversion, or if the conversion is not supported. + * + * @remark Returned string must be freed after use. + */ +char *rd_base64_encode_str(const rd_chariov_t *in) { + rd_chariov_t out; + rd_base64_encode(in, &out); + return out.ptr; +} + +/** + * @brief Base64 encode binary input \p in and return a newly allocated, + * base64-encoded string with URL-safe characters. + * @returns a newly allocated, base64-encoded string or NULL in case of some + * issue with the conversion, or if the conversion is not supported. + * + * @remark Returned string must be freed after use. + */ +char *rd_base64_encode_str_urlsafe(const rd_chariov_t *in) { + rd_chariov_t out; + char *p; + rd_base64_encode(in, &out); + + if (!out.ptr) /* Encoding failed or unsupported */ + return NULL; + + /* Replace + with - and / with _ */ + for (p = out.ptr; *p; p++) { + if (*p == '+') + *p = '-'; + else if (*p == '/') + *p = '_'; + } + + /* Remove padding '=' characters */ + int newlen = strlen(out.ptr); + while (newlen > 0 && out.ptr[newlen - 1] == '=') { + out.ptr[newlen - 1] = '\0'; + newlen--; + } + + out.size = newlen; + return out.ptr; +} + +/** + * @brief Base64 decode input string \p in. Ignores leading and trailing + * whitespace. + * @returns * 0 on success, in which case a newly allocated binary string is + * set in \p out (and size). + * * -1 on invalid Base64. + * * -2 on conversion not supported.
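A usage sketch for the encode helpers above (illustrative only): URL-safe encoding of an arbitrary byte buffer.

static char *example_b64url(void *bytes, size_t len) {
        rd_chariov_t in;
        in.ptr  = (char *)bytes;
        in.size = len;
        /* Returns a newly allocated string, or NULL if encoding is
         * unavailable or the input exceeds INT_MAX; caller frees. */
        return rd_base64_encode_str_urlsafe(&in);
}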
+ */ +int rd_base64_decode(const rd_chariov_t *in, rd_chariov_t *out) { + +#if WITH_SSL + size_t ret_len; + + /* OpenSSL takes an |int| argument, so |in->size| must not exceed + * that. */ + if (in->size % 4 != 0 || in->size > INT_MAX) { + return -1; + } + + ret_len = ((in->size / 4) * 3); + out->ptr = rd_malloc(ret_len + 1); + + if (EVP_DecodeBlock((unsigned char *)out->ptr, (unsigned char *)in->ptr, + (int)in->size) == -1) { + rd_free(out->ptr); + out->ptr = NULL; + return -1; + } + + /* EVP_DecodeBlock will pad the output with trailing NULs and count + * them in the return value. */ + if (in->size > 1 && in->ptr[in->size - 1] == '=') { + if (in->size > 2 && in->ptr[in->size - 2] == '=') { + ret_len -= 2; + } else { + ret_len -= 1; + } + } + + out->ptr[ret_len] = 0; + out->size = ret_len; + + return 0; +#else + return -2; +#endif +} diff --git a/src/third_party/librdkafka/dist/src/rdbase64.h b/src/third_party/librdkafka/dist/src/rdbase64.h new file mode 100644 index 00000000000..1fb12885488 --- /dev/null +++ b/src/third_party/librdkafka/dist/src/rdbase64.h @@ -0,0 +1,43 @@ +/* + * librdkafka - The Apache Kafka C/C++ library + * + * Copyright (c) 2023 Confluent Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef _RDBASE64_H_ +#define _RDBASE64_H_ + +#include "rd.h" + +void rd_base64_encode(const rd_chariov_t *in, rd_chariov_t *out); + +char *rd_base64_encode_str(const rd_chariov_t *in); + +char *rd_base64_encode_str_urlsafe(const rd_chariov_t *in); + +int rd_base64_decode(const rd_chariov_t *in, rd_chariov_t *out); + +#endif /* _RDBASE64_H_ */ \ No newline at end of file diff --git a/src/third_party/librdkafka/dist/src/rdbuf.c b/src/third_party/librdkafka/dist/src/rdbuf.c index 1392cf7b18c..427d632eb73 100644 --- a/src/third_party/librdkafka/dist/src/rdbuf.c +++ b/src/third_party/librdkafka/dist/src/rdbuf.c @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. 
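And the corresponding decode side, a sketch assuming the return-code contract documented above:

#include <string.h>
static int example_b64_decode(const char *s, rd_chariov_t *out) {
        rd_chariov_t in;
        in.ptr  = (char *)s;
        in.size = strlen(s);
        /* 0: out->ptr newly allocated (caller frees), -1: invalid input,
         * -2: built without SSL, so decoding is unsupported. */
        return rd_base64_decode(&in, out);
}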
* * Redistribution and use in source and binary forms, with or without @@ -660,13 +660,16 @@ size_t rd_buf_erase(rd_buf_t *rbuf, size_t absof, size_t size) { segremains); seg->seg_of -= toerase; + seg->seg_erased += toerase; rbuf->rbuf_len -= toerase; of += toerase; /* If segment is now empty, remove it */ - if (seg->seg_of == 0) + if (seg->seg_of == 0) { + rbuf->rbuf_erased -= seg->seg_erased; rd_buf_destroy_segment(rbuf, seg); + } } /* Update absolute offset of remaining segments */ @@ -709,6 +712,7 @@ int rd_buf_write_seek(rd_buf_t *rbuf, size_t absof) { next != seg;) { rd_segment_t *this = next; next = TAILQ_PREV(this, rd_segment_head, seg_link); + rbuf->rbuf_erased -= this->seg_erased; rd_buf_destroy_segment(rbuf, this); } diff --git a/src/third_party/librdkafka/dist/src/rdbuf.h b/src/third_party/librdkafka/dist/src/rdbuf.h index 1ef30e4a95e..d8f98422ccd 100644 --- a/src/third_party/librdkafka/dist/src/rdbuf.h +++ b/src/third_party/librdkafka/dist/src/rdbuf.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -70,6 +70,8 @@ typedef struct rd_segment_s { * beginning in the grand rd_buf_t */ void (*seg_free)(void *p); /**< Optional free function for seg_p */ int seg_flags; /**< Segment flags */ + size_t seg_erased; /** Total number of bytes erased from + * this segment. */ #define RD_SEGMENT_F_RDONLY 0x1 /**< Read-only segment */ #define RD_SEGMENT_F_FREE \ 0x2 /**< Free segment on destroy, \ diff --git a/src/third_party/librdkafka/dist/src/rdcrc32.c b/src/third_party/librdkafka/dist/src/rdcrc32.c index 2a6e126c142..f7a68855046 100644 --- a/src/third_party/librdkafka/dist/src/rdcrc32.c +++ b/src/third_party/librdkafka/dist/src/rdcrc32.c @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdcrc32.h b/src/third_party/librdkafka/dist/src/rdcrc32.h index c3195fca62d..676cd7d236b 100644 --- a/src/third_party/librdkafka/dist/src/rdcrc32.h +++ b/src/third_party/librdkafka/dist/src/rdcrc32.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rddl.c b/src/third_party/librdkafka/dist/src/rddl.c index 785e28c486f..826d0a79127 100644 --- a/src/third_party/librdkafka/dist/src/rddl.c +++ b/src/third_party/librdkafka/dist/src/rddl.c @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rddl.h b/src/third_party/librdkafka/dist/src/rddl.h index eaf6eb6d5ec..d1176c3e527 100644 --- a/src/third_party/librdkafka/dist/src/rddl.h +++ b/src/third_party/librdkafka/dist/src/rddl.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without
diff --git a/src/third_party/librdkafka/dist/src/rdendian.h b/src/third_party/librdkafka/dist/src/rdendian.h
index 613d44bfaf8..55e6f2803c3 100644
--- a/src/third_party/librdkafka/dist/src/rdendian.h
+++ b/src/third_party/librdkafka/dist/src/rdendian.h
@@ -1,7 +1,7 @@
 /*
  * librdkafka - Apache Kafka C library
  *
- * Copyright (c) 2012-2015 Magnus Edenhill
+ * Copyright (c) 2012-2022, Magnus Edenhill
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -125,16 +125,17 @@
 #define be32toh(x) (x)
 #define be16toh(x) (x)
 #define le32toh(x)                                                            \
-        ((((x)&0xff) << 24) | (((x)&0xff00) << 8) | (((x)&0xff0000) >> 8) |   \
-         (((x)&0xff000000) >> 24))
+        ((((x) & 0xff) << 24) | (((x) & 0xff00) << 8) |                       \
+         (((x) & 0xff0000) >> 8) | (((x) & 0xff000000) >> 24))
 #define le64toh(x)                                                            \
-        ((((x)&0x00000000000000ffL) << 56) |                                  \
-         (((x)&0x000000000000ff00L) << 40) |                                  \
-         (((x)&0x0000000000ff0000L) << 24) |                                  \
-         (((x)&0x00000000ff000000L) << 8) | (((x)&0x000000ff00000000L) >> 8) |\
-         (((x)&0x0000ff0000000000L) >> 24) |                                  \
-         (((x)&0x00ff000000000000L) >> 40) |                                  \
-         (((x)&0xff00000000000000L) >> 56))
+        ((((x) & 0x00000000000000ffL) << 56) |                                \
+         (((x) & 0x000000000000ff00L) << 40) |                                \
+         (((x) & 0x0000000000ff0000L) << 24) |                                \
+         (((x) & 0x00000000ff000000L) << 8) |                                 \
+         (((x) & 0x000000ff00000000L) >> 8) |                                 \
+         (((x) & 0x0000ff0000000000L) >> 24) |                                \
+         (((x) & 0x00ff000000000000L) >> 40) |                                \
+         (((x) & 0xff00000000000000L) >> 56))
 #else
 #include <endian.h>
 #endif
diff --git a/src/third_party/librdkafka/dist/src/rdfloat.h b/src/third_party/librdkafka/dist/src/rdfloat.h
index 310045f0ea1..3868d35f5d9 100644
--- a/src/third_party/librdkafka/dist/src/rdfloat.h
+++ b/src/third_party/librdkafka/dist/src/rdfloat.h
@@ -1,7 +1,7 @@
 /*
  * librdkafka - Apache Kafka C library
  *
- * Copyright (c) 2012-2018, Magnus Edenhill
+ * Copyright (c) 2012-2022, Magnus Edenhill
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/src/third_party/librdkafka/dist/src/rdfnv1a.c b/src/third_party/librdkafka/dist/src/rdfnv1a.c
index e951ec59f2e..c412348c2a7 100644
--- a/src/third_party/librdkafka/dist/src/rdfnv1a.c
+++ b/src/third_party/librdkafka/dist/src/rdfnv1a.c
@@ -1,7 +1,7 @@
 /*
  * librdkafka - Apache Kafka C library
  *
- * Copyright (c) 2012-2020, Magnus Edenhill
+ * Copyright (c) 2012-2022, Magnus Edenhill
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/src/third_party/librdkafka/dist/src/rdfnv1a.h b/src/third_party/librdkafka/dist/src/rdfnv1a.h
index 8df66b0d62e..8d956ab68cd 100644
--- a/src/third_party/librdkafka/dist/src/rdfnv1a.h
+++ b/src/third_party/librdkafka/dist/src/rdfnv1a.h
@@ -1,7 +1,7 @@
 /*
  * librdkafka - Apache Kafka C library
  *
- * Copyright (c) 2020 Magnus Edenhill
+ * Copyright (c) 2020-2022, Magnus Edenhill
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/src/third_party/librdkafka/dist/src/rdgz.c b/src/third_party/librdkafka/dist/src/rdgz.c
index 794bd9cc1c5..d820bcfcacc 100644
--- a/src/third_party/librdkafka/dist/src/rdgz.c
+++ b/src/third_party/librdkafka/dist/src/rdgz.c
@@ -1,7 +1,7 @@
 /*
  * librd - Rapid Development C library
  *
- * Copyright (c) 2012, Magnus Edenhill
+ * Copyright (c) 2012-2022, Magnus Edenhill
  * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
diff --git a/src/third_party/librdkafka/dist/src/rdgz.h b/src/third_party/librdkafka/dist/src/rdgz.h
index 10d661cb3b2..1161091f298 100644
--- a/src/third_party/librdkafka/dist/src/rdgz.h
+++ b/src/third_party/librdkafka/dist/src/rdgz.h
@@ -1,7 +1,7 @@
 /*
  * librd - Rapid Development C library
  *
- * Copyright (c) 2012, Magnus Edenhill
+ * Copyright (c) 2012-2022, Magnus Edenhill
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/src/third_party/librdkafka/dist/src/rdhdrhistogram.c b/src/third_party/librdkafka/dist/src/rdhdrhistogram.c
index 3f2b6758b53..08240ac7a3b 100644
--- a/src/third_party/librdkafka/dist/src/rdhdrhistogram.c
+++ b/src/third_party/librdkafka/dist/src/rdhdrhistogram.c
@@ -31,7 +31,7 @@
 /*
  * librdkafka - Apache Kafka C library
  *
- * Copyright (c) 2018, Magnus Edenhill
+ * Copyright (c) 2018-2022, Magnus Edenhill
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/src/third_party/librdkafka/dist/src/rdhdrhistogram.h b/src/third_party/librdkafka/dist/src/rdhdrhistogram.h
index 868614b7b0b..7bfae84f4b2 100644
--- a/src/third_party/librdkafka/dist/src/rdhdrhistogram.h
+++ b/src/third_party/librdkafka/dist/src/rdhdrhistogram.h
@@ -1,7 +1,7 @@
 /*
  * librdkafka - Apache Kafka C library
  *
- * Copyright (c) 2018, Magnus Edenhill
+ * Copyright (c) 2018-2022, Magnus Edenhill
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/src/third_party/librdkafka/dist/src/rdhttp.c b/src/third_party/librdkafka/dist/src/rdhttp.c
index 7457a7fbe4e..30dfd6f7050 100644
--- a/src/third_party/librdkafka/dist/src/rdhttp.c
+++ b/src/third_party/librdkafka/dist/src/rdhttp.c
@@ -1,7 +1,7 @@
 /*
  * librdkafka - The Apache Kafka C/C++ library
  *
- * Copyright (c) 2021 Magnus Edenhill
+ * Copyright (c) 2021-2022, Magnus Edenhill
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,10 @@
 #include <curl/curl.h>
 #include "rdhttp.h"
 
+#if WITH_SSL
+#include "rdkafka_ssl.h"
+#endif
+
 /** Maximum response size, increase as necessary. */
 #define RD_HTTP_RESPONSE_SIZE_MAX 1024 * 1024 * 500 /* 500 MB */
 
@@ -128,8 +132,145 @@ rd_http_req_write_cb(char *ptr, size_t size, size_t nmemb, void *userdata) {
         return nmemb;
 }
 
-rd_http_error_t *rd_http_req_init(rd_http_req_t *hreq, const char *url) {
+#if WITH_SSL
+/**
+ * @brief Callback function for setting up the SSL_CTX for HTTPS requests.
+ *
+ * This function sets the default CA paths for the SSL_CTX, and if that fails,
+ * it attempts to probe and set a default CA location. If `probe` is forced,
+ * it skips the default CA paths and directly probes for CA certificates.
+ *
+ * On Windows, it attempts to load CA root certificates from the
+ * configured Windows certificate stores before falling back to the default.
+ *
+ * @return `CURLE_OK` on success, or `CURLE_SSL_CACERT_BADFILE` on failure.
+ */
+static CURLcode
+rd_http_ssl_ctx_function(CURL *curl, void *sslctx, void *userptr) {
+        SSL_CTX *ctx = (SSL_CTX *)sslctx;
+        rd_kafka_t *rk = (rd_kafka_t *)userptr;
+        int r = -1;
+        rd_bool_t force_probe =
+            !rd_strcmp(rk->rk_conf.https.ca_location, "probe");
+        rd_bool_t use_probe = force_probe;
+#if WITH_STATIC_LIB_libcrypto
+        /* We fall back to `probe` when statically linked.
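+         * A statically linked libcrypto carries compile-time default CA
+         * paths from the build host that often do not exist on the
+         * deployment host, so probing well-known system locations at
+         * runtime is the safer default here.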
*/ + use_probe = rd_true; +#endif + +#ifdef _WIN32 + /* Attempt to load CA root certificates from the + * configured Windows certificate stores. */ + r = rd_kafka_ssl_win_load_cert_stores(rk, "https", ctx, + rk->rk_conf.ssl.ca_cert_stores); + if (r == 0) { + rd_kafka_log(rk, LOG_NOTICE, "CERTSTORE", + "No CA certificates loaded for `https` from " + "Windows certificate stores: " + "falling back to default OpenSSL CA paths"); + r = -1; + } else if (r == -1) + rd_kafka_log(rk, LOG_NOTICE, "CERTSTORE", + "Failed to load CA certificates for `https` from " + "Windows certificate stores: " + "falling back to default OpenSSL CA paths"); + + if (r != -1) { + rd_kafka_dbg(rk, SECURITY, "SSL", + "Successfully loaded CA certificates for `https` " + "from Windows certificate stores"); + return CURLE_OK; /* Success, CA certs loaded on Windows */ + } +#endif + + if (!force_probe) { + /* Previous default behavior: use predefined paths set when + * building OpenSSL. */ + char errstr[512]; + r = SSL_CTX_set_default_verify_paths(ctx); + if (r == 1) { + rd_kafka_dbg(rk, SECURITY, "SSL", + "SSL_CTX_set_default_verify_paths() " + "for `https` " + "succeeded"); + return CURLE_OK; /* Success */ + } + + /* Read error and clear the error stack. */ + rd_kafka_ssl_error0(rk, NULL, "https", errstr, sizeof(errstr)); + rd_kafka_dbg(rk, SECURITY, "SSL", + "SSL_CTX_set_default_verify_paths() " + "for `https` " + "failed: %s", + errstr); + } + + if (use_probe) { + /* We asked for probing or we're using + * a statically linked version of OpenSSL. */ + + r = rd_kafka_ssl_probe_and_set_default_ca_location(rk, "https", + ctx); + if (r == 0) + return CURLE_OK; + } + + return CURLE_SSL_CACERT_BADFILE; +} + +static void rd_http_ssl_configure(rd_kafka_t *rk, CURL *hreq_curl) { + rd_bool_t force_probe = + !rd_strcmp(rk->rk_conf.https.ca_location, "probe"); + + if (!force_probe && rk->rk_conf.https.ca_location) { + rd_bool_t is_dir; + rd_kafka_dbg(rk, SECURITY, "SSL", + "Setting `https` CA certs from " + "configured location: %s", + rk->rk_conf.https.ca_location); + if (rd_file_stat(rk->rk_conf.https.ca_location, &is_dir)) { + if (is_dir) { + curl_easy_setopt(hreq_curl, CURLOPT_CAPATH, + rk->rk_conf.https.ca_location); + curl_easy_setopt(hreq_curl, CURLOPT_CAINFO, + NULL); + } else { + curl_easy_setopt(hreq_curl, CURLOPT_CAPATH, + NULL); + curl_easy_setopt(hreq_curl, CURLOPT_CAINFO, + rk->rk_conf.https.ca_location); + } + } else { + /* Path doesn't exist, don't set any trusted + * certificate. */ + curl_easy_setopt(hreq_curl, CURLOPT_CAINFO, NULL); + curl_easy_setopt(hreq_curl, CURLOPT_CAPATH, NULL); + } + } else if (!force_probe && rk->rk_conf.https.ca_pem) { +#if CURL_AT_LEAST_VERSION(7, 77, 0) + struct curl_blob ca_blob = { + .data = rk->rk_conf.https.ca_pem, + .len = strlen(rk->rk_conf.https.ca_pem), + .flags = CURL_BLOB_COPY}; + rd_kafka_dbg(rk, SECURITY, "SSL", + "Setting `https` CA certs from " + "configured PEM string"); + curl_easy_setopt(hreq_curl, CURLOPT_CAINFO_BLOB, &ca_blob); +#endif + /* Only the blob should be set, no default paths. 
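+                 * (When the blob path above is taken, CURL_BLOB_COPY makes
+                 * libcurl keep its own copy of the PEM data, so the
+                 * stack-allocated ca_blob may safely go out of scope.)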
*/ + curl_easy_setopt(hreq_curl, CURLOPT_CAINFO, NULL); + curl_easy_setopt(hreq_curl, CURLOPT_CAPATH, NULL); + } else { + curl_easy_setopt(hreq_curl, CURLOPT_SSL_CTX_FUNCTION, + rd_http_ssl_ctx_function); + curl_easy_setopt(hreq_curl, CURLOPT_SSL_CTX_DATA, rk); + } +} +#endif + +rd_http_error_t * +rd_http_req_init(rd_kafka_t *rk, rd_http_req_t *hreq, const char *url) { memset(hreq, 0, sizeof(*hreq)); hreq->hreq_curl = curl_easy_init(); @@ -139,8 +280,15 @@ rd_http_error_t *rd_http_req_init(rd_http_req_t *hreq, const char *url) { hreq->hreq_buf = rd_buf_new(1, 1024); curl_easy_setopt(hreq->hreq_curl, CURLOPT_URL, url); +#if CURL_AT_LEAST_VERSION(7, 85, 0) + curl_easy_setopt(hreq->hreq_curl, CURLOPT_PROTOCOLS_STR, "http,https"); +#else + /* As of 06/10/2025 Debian 10 and CentOS Stream 9 ship with + * older CURL versions, remove this condition once they're not supported + * anymore. */ curl_easy_setopt(hreq->hreq_curl, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS); +#endif curl_easy_setopt(hreq->hreq_curl, CURLOPT_MAXREDIRS, 16); curl_easy_setopt(hreq->hreq_curl, CURLOPT_TIMEOUT, 30); curl_easy_setopt(hreq->hreq_curl, CURLOPT_ERRORBUFFER, @@ -150,6 +298,10 @@ rd_http_error_t *rd_http_req_init(rd_http_req_t *hreq, const char *url) { rd_http_req_write_cb); curl_easy_setopt(hreq->hreq_curl, CURLOPT_WRITEDATA, (void *)hreq); +#if WITH_SSL + rd_http_ssl_configure(rk, hreq->hreq_curl); +#endif + return NULL; } @@ -200,13 +352,14 @@ const char *rd_http_req_get_content_type(rd_http_req_t *hreq) { * by calling rd_http_error_destroy(). In case of HTTP error the \p *rbufp * may be filled with the error response. */ -rd_http_error_t *rd_http_get(const char *url, rd_buf_t **rbufp) { +rd_http_error_t * +rd_http_get(rd_kafka_t *rk, const char *url, rd_buf_t **rbufp) { rd_http_req_t hreq; rd_http_error_t *herr; *rbufp = NULL; - herr = rd_http_req_init(&hreq, url); + herr = rd_http_req_init(rk, &hreq, url); if (unlikely(herr != NULL)) return herr; @@ -269,6 +422,7 @@ static rd_bool_t rd_http_is_failure_temporary(int error_code) { switch (error_code) { case 408: /**< Request timeout */ case 425: /**< Too early */ + case 429: /**< Too many requests */ case 500: /**< Internal server error */ case 502: /**< Bad gateway */ case 503: /**< Service unavailable */ @@ -309,7 +463,7 @@ rd_http_error_t *rd_http_post_expect_json(rd_kafka_t *rk, size_t len; const char *content_type; - herr = rd_http_req_init(&hreq, url); + herr = rd_http_req_init(rk, &hreq, url); if (unlikely(herr != NULL)) return herr; @@ -374,7 +528,8 @@ rd_http_error_t *rd_http_post_expect_json(rd_kafka_t *rk, * * Same error semantics as rd_http_get(). 
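 *
 * Illustrative usage sketch (hypothetical caller, not upstream docs):
 *   cJSON *json = NULL;
 *   rd_http_error_t *herr = rd_http_get_json(rk, url, &json);
 *   if (herr)
 *           rd_http_error_destroy(herr);
 *   else if (json)
 *           cJSON_Delete(json);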
*/ -rd_http_error_t *rd_http_get_json(const char *url, cJSON **jsonp) { +rd_http_error_t * +rd_http_get_json(rd_kafka_t *rk, const char *url, cJSON **jsonp) { rd_http_req_t hreq; rd_http_error_t *herr; rd_slice_t slice; @@ -385,7 +540,7 @@ rd_http_error_t *rd_http_get_json(const char *url, cJSON **jsonp) { *jsonp = NULL; - herr = rd_http_req_init(&hreq, url); + herr = rd_http_req_init(rk, &hreq, url); if (unlikely(herr != NULL)) return herr; @@ -460,19 +615,21 @@ int unittest_http(void) { cJSON *json, *jval; rd_http_error_t *herr; rd_bool_t empty; + rd_kafka_t *rk; if (!base_url || !*base_url) RD_UT_SKIP("RD_UT_HTTP_URL environment variable not set"); RD_UT_BEGIN(); + rk = rd_calloc(1, sizeof(*rk)); error_url_size = strlen(base_url) + strlen("/error") + 1; error_url = rd_alloca(error_url_size); rd_snprintf(error_url, error_url_size, "%s/error", base_url); /* Try the base url first, parse its JSON and extract a key-value. */ json = NULL; - herr = rd_http_get_json(base_url, &json); + herr = rd_http_get_json(rk, base_url, &json); RD_UT_ASSERT(!herr, "Expected get_json(%s) to succeed, got: %s", base_url, herr->errstr); @@ -492,7 +649,7 @@ int unittest_http(void) { /* Try the error URL, verify error code. */ json = NULL; - herr = rd_http_get_json(error_url, &json); + herr = rd_http_get_json(rk, error_url, &json); RD_UT_ASSERT(herr != NULL, "Expected get_json(%s) to fail", error_url); RD_UT_ASSERT(herr->code >= 400, "Expected get_json(%s) error code >= " @@ -506,6 +663,7 @@ int unittest_http(void) { if (json) cJSON_Delete(json); rd_http_error_destroy(herr); + rd_free(rk); RD_UT_PASS(); } diff --git a/src/third_party/librdkafka/dist/src/rdhttp.h b/src/third_party/librdkafka/dist/src/rdhttp.h index 80512e5ac28..0af726dd01a 100644 --- a/src/third_party/librdkafka/dist/src/rdhttp.h +++ b/src/third_party/librdkafka/dist/src/rdhttp.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2021 Magnus Edenhill + * Copyright (c) 2021-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -42,8 +42,9 @@ typedef struct rd_http_error_s { void rd_http_error_destroy(rd_http_error_t *herr); -rd_http_error_t *rd_http_get(const char *url, rd_buf_t **rbufp); -rd_http_error_t *rd_http_get_json(const char *url, cJSON **jsonp); +rd_http_error_t *rd_http_get(rd_kafka_t *rk, const char *url, rd_buf_t **rbufp); +rd_http_error_t * +rd_http_get_json(rd_kafka_t *rk, const char *url, cJSON **jsonp); void rd_http_global_init(void); @@ -62,7 +63,8 @@ typedef struct rd_http_req_s { * write to. */ } rd_http_req_t; -rd_http_error_t *rd_http_req_init(rd_http_req_t *hreq, const char *url); +rd_http_error_t * +rd_http_req_init(rd_kafka_t *rk, rd_http_req_t *hreq, const char *url); rd_http_error_t *rd_http_req_perform_sync(rd_http_req_t *hreq); rd_http_error_t *rd_http_parse_json(rd_http_req_t *hreq, cJSON **jsonp); rd_http_error_t *rd_http_post_expect_json(rd_kafka_t *rk, diff --git a/src/third_party/librdkafka/dist/src/rdinterval.h b/src/third_party/librdkafka/dist/src/rdinterval.h index 4283376462f..95cdf3c2d7f 100644 --- a/src/third_party/librdkafka/dist/src/rdinterval.h +++ b/src/third_party/librdkafka/dist/src/rdinterval.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill + * 2023 Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -30,6 +31,7 @@ #define _RDINTERVAL_H_ #include "rd.h" +#include "rdrand.h" typedef struct rd_interval_s { rd_ts_t ri_ts_last; /* last interval timestamp */ @@ -109,6 +111,22 @@ static RD_INLINE RD_UNUSED void rd_interval_reset_to_now(rd_interval_t *ri, ri->ri_backoff = 0; } +/** + * Reset the interval to 'now' with the given backoff ms and max_jitter as + * percentage. The backoff is given just for absolute jitter calculation. If now + * is 0, the time will be gathered automatically. + */ +static RD_INLINE RD_UNUSED void +rd_interval_reset_to_now_with_jitter(rd_interval_t *ri, + rd_ts_t now, + int64_t backoff_ms, + int max_jitter) { + rd_interval_reset_to_now(ri, now); + /* We are multiplying by 10 as (backoff_ms * percent * 1000)/100 -> + * backoff_ms * jitter * 10 */ + ri->ri_backoff = backoff_ms * rd_jitter(-max_jitter, max_jitter) * 10; +} + /** * Back off the next interval by `backoff_us` microseconds. */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka.c b/src/third_party/librdkafka/dist/src/rdkafka.c index 8eedd9f94be..3723da7e86e 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka.c +++ b/src/third_party/librdkafka/dist/src/rdkafka.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2013, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -45,6 +46,7 @@ #include "rdkafka_topic.h" #include "rdkafka_partition.h" #include "rdkafka_offset.h" +#include "rdkafka_telemetry.h" #include "rdkafka_transport.h" #include "rdkafka_cgrp.h" #include "rdkafka_assignor.h" @@ -63,6 +65,7 @@ #endif #include "rdtime.h" +#include "rdmap.h" #include "crc32c.h" #include "rdunittest.h" @@ -393,14 +396,6 @@ void rd_kafka_set_log_level(rd_kafka_t *rk, int level) { -static const char *rd_kafka_type2str(rd_kafka_type_t type) { - static const char *types[] = { - [RD_KAFKA_PRODUCER] = "producer", - [RD_KAFKA_CONSUMER] = "consumer", - }; - return types[type]; -} - #define _ERR_DESC(ENUM, DESC) \ [ENUM - RD_KAFKA_RESP_ERR__BEGIN] = {ENUM, &(#ENUM)[18] /*pfx*/, DESC} @@ -409,7 +404,9 @@ static const struct rd_kafka_err_desc rd_kafka_err_descs[] = { _ERR_DESC(RD_KAFKA_RESP_ERR__BAD_MSG, "Local: Bad message format"), _ERR_DESC(RD_KAFKA_RESP_ERR__BAD_COMPRESSION, "Local: Invalid compressed data"), - _ERR_DESC(RD_KAFKA_RESP_ERR__DESTROY, "Local: Broker handle destroyed"), + _ERR_DESC(RD_KAFKA_RESP_ERR__DESTROY, + "Local: Broker handle destroyed " + "for termination"), _ERR_DESC( RD_KAFKA_RESP_ERR__FAIL, "Local: Communication failure with broker"), // FIXME: too specific @@ -489,6 +486,13 @@ static const struct rd_kafka_err_desc rd_kafka_err_descs[] = { _ERR_DESC(RD_KAFKA_RESP_ERR__NOOP, "Local: No operation performed"), _ERR_DESC(RD_KAFKA_RESP_ERR__AUTO_OFFSET_RESET, "Local: No offset to automatically reset to"), + _ERR_DESC(RD_KAFKA_RESP_ERR__LOG_TRUNCATION, + "Local: Partition log truncation detected"), + _ERR_DESC(RD_KAFKA_RESP_ERR__INVALID_DIFFERENT_RECORD, + "Local: an invalid record in the same batch caused " + "the failure of this message too"), + _ERR_DESC(RD_KAFKA_RESP_ERR__DESTROY_BROKER, + "Local: Broker handle destroyed without termination"), _ERR_DESC(RD_KAFKA_RESP_ERR_UNKNOWN, "Unknown broker error"), _ERR_DESC(RD_KAFKA_RESP_ERR_NO_ERROR, "Success"), @@ -697,7 +701,26 @@ static const struct rd_kafka_err_desc rd_kafka_err_descs[] = { 
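    /* A note on the _ERR_DESC macro above: &(#ENUM)[18] skips the
     * 18-character "RD_KAFKA_RESP_ERR_" prefix of the stringified enum,
     * so e.g. RD_KAFKA_RESP_ERR__BAD_MSG is stored as "_BAD_MSG". */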
_ERR_DESC(RD_KAFKA_RESP_ERR_PRINCIPAL_DESERIALIZATION_FAILURE, "Broker: Request principal deserialization failed during " "forwarding"), - + _ERR_DESC(RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_ID, "Broker: Unknown topic id"), + _ERR_DESC(RD_KAFKA_RESP_ERR_FENCED_MEMBER_EPOCH, + "Broker: The member epoch is fenced by the group coordinator"), + _ERR_DESC(RD_KAFKA_RESP_ERR_UNRELEASED_INSTANCE_ID, + "Broker: The instance ID is still used by another member in the " + "consumer group"), + _ERR_DESC(RD_KAFKA_RESP_ERR_UNSUPPORTED_ASSIGNOR, + "Broker: The assignor or its version range is not supported by " + "the consumer group"), + _ERR_DESC(RD_KAFKA_RESP_ERR_STALE_MEMBER_EPOCH, + "Broker: The member epoch is stale"), + _ERR_DESC(RD_KAFKA_RESP_ERR_UNKNOWN_SUBSCRIPTION_ID, + "Broker: Client sent a push telemetry request with an invalid or " + "outdated subscription ID"), + _ERR_DESC(RD_KAFKA_RESP_ERR_TELEMETRY_TOO_LARGE, + "Broker: Client sent a push telemetry request larger than the " + "maximum size the broker will accept"), + _ERR_DESC(RD_KAFKA_RESP_ERR_REBOOTSTRAP_REQUIRED, + "Broker: Client metadata is stale, " + "client should rebootstrap to obtain new metadata"), _ERR_DESC(RD_KAFKA_RESP_ERR__END, NULL)}; @@ -909,7 +932,29 @@ rd_kafka_resp_err_t rd_kafka_test_fatal_error(rd_kafka_t *rk, return RD_KAFKA_RESP_ERR_NO_ERROR; } - +/** + * @brief Called when a broker thread is decommissioned. + * on the main thread to join the corresponding thread + * and remove it from the wait lists. + * + * @locality main thread + */ +void rd_kafka_decommissioned_broker_thread_join(rd_kafka_t *rk, + void *rkb_decommissioned) { + thrd_t *thrd; + int i; + RD_LIST_FOREACH(thrd, &rk->wait_decommissioned_thrds, i) { + void *rkb = rd_list_elem(&rk->wait_decommissioned_brokers, i); + if (rkb == rkb_decommissioned) { + rd_list_remove_elem(&rk->wait_decommissioned_thrds, i); + rd_list_remove_elem(&rk->wait_decommissioned_brokers, + i); + thrd_join(*thrd, NULL); + rd_free(thrd); + i--; + } + } +} /** * @brief Final destructor for rd_kafka_t, must only be called with refcnt 0. @@ -924,6 +969,8 @@ void rd_kafka_destroy_final(rd_kafka_t *rk) { rd_kafka_wrlock(rk); rd_kafka_wrunlock(rk); + rd_kafka_telemetry_clear(rk, rd_true /*clear_control_flow_fields*/); + /* Terminate SASL provider */ if (rk->rk_conf.sasl.provider) rd_kafka_sasl_term(rk); @@ -946,6 +993,18 @@ void rd_kafka_destroy_final(rd_kafka_t *rk) { rd_kafka_assignment_destroy(rk); if (rk->rk_consumer.q) rd_kafka_q_destroy(rk->rk_consumer.q); + rd_avg_destroy( + &rk->rk_telemetry.rd_avg_current.rk_avg_poll_idle_ratio); + rd_avg_destroy( + &rk->rk_telemetry.rd_avg_current.rk_avg_rebalance_latency); + rd_avg_destroy( + &rk->rk_telemetry.rd_avg_current.rk_avg_commit_latency); + rd_avg_destroy( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_poll_idle_ratio); + rd_avg_destroy( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_rebalance_latency); + rd_avg_destroy( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_commit_latency); } /* Purge op-queues */ @@ -987,8 +1046,7 @@ void rd_kafka_destroy_final(rd_kafka_t *rk) { cnd_destroy(&rk->rk_init_cnd); mtx_destroy(&rk->rk_init_lock); - if (rk->rk_full_metadata) - rd_kafka_metadata_destroy(rk->rk_full_metadata); + rd_kafkap_str_destroy(rk->rk_client_id); rd_kafkap_str_destroy(rk->rk_group_id); rd_kafkap_str_destroy(rk->rk_eos.transactional_id); @@ -1074,7 +1132,13 @@ static void rd_kafka_destroy_app(rd_kafka_t *rk, int flags) { rd_kafka_consumer_close(rk); } - /* With the consumer closed, terminate the rest of librdkafka. 
*/ + /* Await telemetry termination. This method blocks until the last + * PushTelemetry request is sent (if possible). */ + if (!(flags & RD_KAFKA_DESTROY_F_IMMEDIATE)) + rd_kafka_telemetry_await_termination(rk); + + /* With the consumer and telemetry closed, terminate the rest of + * librdkafka. */ rd_atomic32_set(&rk->rk_terminate, flags | RD_KAFKA_DESTROY_F_TERMINATE); @@ -1132,8 +1196,8 @@ void rd_kafka_destroy_flags(rd_kafka_t *rk, int flags) { */ static void rd_kafka_destroy_internal(rd_kafka_t *rk) { rd_kafka_topic_t *rkt, *rkt_tmp; - rd_kafka_broker_t *rkb, *rkb_tmp; - rd_list_t wait_thrds; + rd_kafka_broker_t *rkb; + rd_list_t wait_thrds, brokers_to_decommission; thrd_t *thrd; int i; @@ -1176,33 +1240,25 @@ static void rd_kafka_destroy_internal(rd_kafka_t *rk) { } /* Decommission brokers. - * Broker thread holds a refcount and detects when broker refcounts - * reaches 1 and then decommissions itself. */ - TAILQ_FOREACH_SAFE(rkb, &rk->rk_brokers, rkb_link, rkb_tmp) { - /* Add broker's thread to wait_thrds list for later joining */ - thrd = rd_malloc(sizeof(*thrd)); - *thrd = rkb->rkb_thread; - rd_list_add(&wait_thrds, thrd); - rd_kafka_wrunlock(rk); - - rd_kafka_dbg(rk, BROKER, "DESTROY", "Sending TERMINATE to %s", - rd_kafka_broker_name(rkb)); - /* Send op to trigger queue/io wake-up. - * The op itself is (likely) ignored by the broker thread. */ - rd_kafka_q_enq(rkb->rkb_ops, - rd_kafka_op_new(RD_KAFKA_OP_TERMINATE)); - -#ifndef _WIN32 - /* Interrupt IO threads to speed up termination. */ - if (rk->rk_conf.term_sig) - pthread_kill(rkb->rkb_thread, rk->rk_conf.term_sig); -#endif - - rd_kafka_broker_destroy(rkb); - - rd_kafka_wrlock(rk); + * `rd_kafka_broker_decommission` releases and reacquires + * the lock so there could be destroyed brokers in + * `rk->rk_brokers` */ + rd_list_init(&brokers_to_decommission, + rd_atomic32_get(&rk->rk_broker_cnt), NULL); + TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { + /* Don't try to decommission already decommissioning brokers + * otherwise they could be already destroyed when + * `rd_kafka_broker_decommission` is called below. */ + if (rd_list_find(&rk->wait_decommissioned_brokers, rkb, + rd_list_cmp_ptr) == NULL) + rd_list_add(&brokers_to_decommission, rkb); } + RD_LIST_FOREACH(rkb, &brokers_to_decommission, i) { + rd_kafka_broker_decommission(rk, rkb, &wait_thrds); + } + rd_list_destroy(&brokers_to_decommission); + if (rk->rk_clusterid) { rd_free(rk->rk_clusterid); rk->rk_clusterid = NULL; @@ -1243,22 +1299,23 @@ static void rd_kafka_destroy_internal(rd_kafka_t *rk) { /* Loose our special reference to the internal broker. */ mtx_lock(&rk->rk_internal_rkb_lock); - if ((rkb = rk->rk_internal_rkb)) { + if (rk->rk_internal_rkb) { rd_kafka_dbg(rk, GENERIC, "TERMINATE", "Decommissioning internal broker"); - /* Send op to trigger queue wake-up. */ - rd_kafka_q_enq(rkb->rkb_ops, + thrd = rd_malloc(sizeof(*thrd)); + *thrd = rk->rk_internal_rkb->rkb_thread; + + /* Send op to trigger queue wake-up. + * WARNING: This is last time we can read + * from rk_internal_rkb in this thread! 
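+         * Once the TERMINATE op is enqueued, the broker thread may
+         * destroy the rkb object at any time, so a later read would
+         * race with termination.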
*/ + rd_kafka_q_enq(rk->rk_internal_rkb->rkb_ops, rd_kafka_op_new(RD_KAFKA_OP_TERMINATE)); rk->rk_internal_rkb = NULL; - thrd = rd_malloc(sizeof(*thrd)); - *thrd = rkb->rkb_thread; rd_list_add(&wait_thrds, thrd); } mtx_unlock(&rk->rk_internal_rkb_lock); - if (rkb) - rd_kafka_broker_destroy(rkb); rd_kafka_dbg(rk, GENERIC, "TERMINATE", "Join %d broker thread(s)", @@ -1274,6 +1331,17 @@ static void rd_kafka_destroy_internal(rd_kafka_t *rk) { rd_list_destroy(&wait_thrds); + /* Join previously decommissioned broker threads */ + RD_LIST_FOREACH(thrd, &rk->wait_decommissioned_thrds, i) { + int res; + if (thrd_join(*thrd, &res) != thrd_success) + ; + rd_free(thrd); + } + rd_list_destroy(&rk->additional_brokerlists); + rd_list_destroy(&rk->wait_decommissioned_brokers); + rd_list_destroy(&rk->wait_decommissioned_thrds); + /* Destroy mock cluster */ if (rk->rk_mock.cluster) rd_kafka_mock_cluster_destroy(rk->rk_mock.cluster); @@ -1398,9 +1466,7 @@ static RD_INLINE void rd_kafka_stats_emit_toppar(struct _stats_emit *st, rd_kafka_toppar_lock(rktp); if (rktp->rktp_broker) { - rd_kafka_broker_lock(rktp->rktp_broker); broker_id = rktp->rktp_broker->rkb_nodeid; - rd_kafka_broker_unlock(rktp->rktp_broker); } /* Grab a copy of the latest finalized offset stats */ @@ -1419,13 +1485,14 @@ static RD_INLINE void rd_kafka_stats_emit_toppar(struct _stats_emit *st, * offsets are not (yet) committed. */ if (end_offset != RD_KAFKA_OFFSET_INVALID) { - if (rktp->rktp_stored_offset >= 0 && - rktp->rktp_stored_offset <= end_offset) + if (rktp->rktp_stored_pos.offset >= 0 && + rktp->rktp_stored_pos.offset <= end_offset) consumer_lag_stored = - end_offset - rktp->rktp_stored_offset; - if (rktp->rktp_committed_offset >= 0 && - rktp->rktp_committed_offset <= end_offset) - consumer_lag = end_offset - rktp->rktp_committed_offset; + end_offset - rktp->rktp_stored_pos.offset; + if (rktp->rktp_committed_pos.offset >= 0 && + rktp->rktp_committed_pos.offset <= end_offset) + consumer_lag = + end_offset - rktp->rktp_committed_pos.offset; } _st_printf( @@ -1457,10 +1524,14 @@ static RD_INLINE void rd_kafka_stats_emit_toppar(struct _stats_emit *st, ", " "\"stored_offset\":%" PRId64 ", " + "\"stored_leader_epoch\":%" PRId32 + ", " "\"commited_offset\":%" PRId64 ", " /*FIXME: issue #80 */ "\"committed_offset\":%" PRId64 ", " + "\"committed_leader_epoch\":%" PRId32 + ", " "\"eof_offset\":%" PRId64 ", " "\"lo_offset\":%" PRId64 @@ -1473,6 +1544,8 @@ static RD_INLINE void rd_kafka_stats_emit_toppar(struct _stats_emit *st, ", " "\"consumer_lag_stored\":%" PRId64 ", " + "\"leader_epoch\":%" PRId32 + ", " "\"txmsgs\":%" PRIu64 ", " "\"txbytes\":%" PRIu64 @@ -1502,12 +1575,15 @@ static RD_INLINE void rd_kafka_stats_emit_toppar(struct _stats_emit *st, 0, (size_t)0, rd_kafka_q_len(rktp->rktp_fetchq), rd_kafka_q_size(rktp->rktp_fetchq), rd_kafka_fetch_states[rktp->rktp_fetch_state], - rktp->rktp_query_offset, offs.fetch_offset, rktp->rktp_app_offset, - rktp->rktp_stored_offset, - rktp->rktp_committed_offset, /* FIXME: issue #80 */ - rktp->rktp_committed_offset, offs.eof_offset, rktp->rktp_lo_offset, - rktp->rktp_hi_offset, rktp->rktp_ls_offset, consumer_lag, - consumer_lag_stored, rd_atomic64_get(&rktp->rktp_c.tx_msgs), + rktp->rktp_query_pos.offset, offs.fetch_pos.offset, + rktp->rktp_app_pos.offset, rktp->rktp_stored_pos.offset, + rktp->rktp_stored_pos.leader_epoch, + rktp->rktp_committed_pos.offset, /* FIXME: issue #80 */ + rktp->rktp_committed_pos.offset, + rktp->rktp_committed_pos.leader_epoch, offs.eof_offset, + rktp->rktp_lo_offset, 
rktp->rktp_hi_offset, rktp->rktp_ls_offset, + consumer_lag, consumer_lag_stored, rktp->rktp_leader_epoch, + rd_atomic64_get(&rktp->rktp_c.tx_msgs), rd_atomic64_get(&rktp->rktp_c.tx_msg_bytes), rd_atomic64_get(&rktp->rktp_c.rx_msgs), rd_atomic64_get(&rktp->rktp_c.rx_msg_bytes), @@ -1569,8 +1645,6 @@ static void rd_kafka_stats_emit_broker_reqs(struct _stats_emit *st, [RD_KAFKAP_AlterReplicaLogDirs] = rd_true, [RD_KAFKAP_DescribeLogDirs] = rd_true, - [RD_KAFKAP_SaslAuthenticate] = rd_false, - [RD_KAFKAP_CreateDelegationToken] = rd_true, [RD_KAFKAP_RenewDelegationToken] = rd_true, [RD_KAFKAP_ExpireDelegationToken] = rd_true, @@ -1587,21 +1661,37 @@ static void rd_kafka_stats_emit_broker_reqs(struct _stats_emit *st, [RD_KAFKAP_AlterIsr] = rd_true, [RD_KAFKAP_UpdateFeatures] = rd_true, [RD_KAFKAP_Envelope] = rd_true, + [RD_KAFKAP_FetchSnapshot] = rd_true, + [RD_KAFKAP_BrokerHeartbeat] = rd_true, + [RD_KAFKAP_UnregisterBroker] = rd_true, + [RD_KAFKAP_AllocateProducerIds] = rd_true, + [RD_KAFKAP_ConsumerGroupHeartbeat] = rd_true, }, [3 /*hide-unless-non-zero*/] = { /* Hide Admin requests unless they've been used */ - [RD_KAFKAP_CreateTopics] = rd_true, - [RD_KAFKAP_DeleteTopics] = rd_true, - [RD_KAFKAP_DeleteRecords] = rd_true, - [RD_KAFKAP_CreatePartitions] = rd_true, - [RD_KAFKAP_DescribeAcls] = rd_true, - [RD_KAFKAP_CreateAcls] = rd_true, - [RD_KAFKAP_DeleteAcls] = rd_true, - [RD_KAFKAP_DescribeConfigs] = rd_true, - [RD_KAFKAP_AlterConfigs] = rd_true, - [RD_KAFKAP_DeleteGroups] = rd_true, - [RD_KAFKAP_ListGroups] = rd_true, - [RD_KAFKAP_DescribeGroups] = rd_true}}; + [RD_KAFKAP_CreateTopics] = rd_true, + [RD_KAFKAP_DeleteTopics] = rd_true, + [RD_KAFKAP_DeleteRecords] = rd_true, + [RD_KAFKAP_CreatePartitions] = rd_true, + [RD_KAFKAP_DescribeAcls] = rd_true, + [RD_KAFKAP_CreateAcls] = rd_true, + [RD_KAFKAP_DeleteAcls] = rd_true, + [RD_KAFKAP_DescribeConfigs] = rd_true, + [RD_KAFKAP_AlterConfigs] = rd_true, + [RD_KAFKAP_DeleteGroups] = rd_true, + [RD_KAFKAP_ListGroups] = rd_true, + [RD_KAFKAP_DescribeGroups] = rd_true, + [RD_KAFKAP_DescribeLogDirs] = rd_true, + [RD_KAFKAP_IncrementalAlterConfigs] = rd_true, + [RD_KAFKAP_AlterPartitionReassignments] = rd_true, + [RD_KAFKAP_ListPartitionReassignments] = rd_true, + [RD_KAFKAP_OffsetDelete] = rd_true, + [RD_KAFKAP_DescribeClientQuotas] = rd_true, + [RD_KAFKAP_AlterClientQuotas] = rd_true, + [RD_KAFKAP_DescribeUserScramCredentials] = rd_true, + [RD_KAFKAP_AlterUserScramCredentials] = rd_true, + [RD_KAFKAP_ConsumerGroupDescribe] = rd_true, + }}; int i; int cnt = 0; @@ -1970,6 +2060,61 @@ static void rd_kafka_1s_tmr_cb(rd_kafka_timers_t *rkts, void *arg) { rd_kafka_coord_cache_expire(&rk->rk_coord_cache); } +/** + * @brief Re-bootstrap timer callback. + * + * @locality rdkafka main thread + * @locks none + */ +static void rd_kafka_rebootstrap_tmr_cb(rd_kafka_timers_t *rkts, void *arg) { + int i; + char *brokerlist; + rd_kafka_t *rk = rkts->rkts_rk; + rd_list_t additional_brokerlists; + + rd_dassert(thrd_is_current(rk->rk_thread)); + if (rd_kafka_terminating(rk)) + /* Avoid re-bootstrapping while terminating */ + return; + + rd_dassert(rk->rk_conf.metadata_recovery_strategy != + RD_KAFKA_METADATA_RECOVERY_STRATEGY_NONE); + if (rk->rk_conf.metadata_recovery_strategy == + RD_KAFKA_METADATA_RECOVERY_STRATEGY_NONE) + /* This function should not be called in this case. + * this is just a fail-safe. 
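+         * (Callers arm this timer only when a recovery strategy is
+         * configured, so this branch is expected to be unreachable.)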
*/ + return; + + rd_kafka_dbg(rk, ALL, "REBOOTSTRAP", "Starting re-bootstrap sequence"); + + if (rk->rk_conf.brokerlist) { + rd_kafka_brokers_add0( + rk, + rk->rk_conf.brokerlist, rd_true + /* resolve canonical bootstrap server + * list names if requested*/); + } + + rd_kafka_rdlock(rk); + if (rd_list_cnt(&rk->additional_brokerlists) == 0) { + rd_kafka_rdunlock(rk); + return; + } + + rd_list_init_copy(&additional_brokerlists, &rk->additional_brokerlists); + rd_list_copy_to(&additional_brokerlists, &rk->additional_brokerlists, + rd_list_string_copy, NULL); + rd_kafka_rdunlock(rk); + + RD_LIST_FOREACH(brokerlist, &additional_brokerlists, i) { + rd_kafka_brokers_add0(rk, brokerlist, + rd_false + /* don't resolve canonical bootstrap server list + * names even if requested */); + } + rd_list_destroy(&additional_brokerlists); +} + static void rd_kafka_stats_emit_tmr_cb(rd_kafka_timers_t *rkts, void *arg) { rd_kafka_t *rk = rkts->rkts_rk; rd_kafka_stats_emit_all(rk); @@ -2023,15 +2168,15 @@ static void rd_kafka_metadata_refresh_cb(rd_kafka_timers_t *rkts, void *arg) { * @locks none */ static int rd_kafka_init_wait(rd_kafka_t *rk, int timeout_ms) { - struct timespec tspec; int ret; + rd_ts_t abs_timeout; - rd_timeout_init_timespec(&tspec, timeout_ms); + abs_timeout = rd_timeout_init(timeout_ms); mtx_lock(&rk->rk_init_lock); while (rk->rk_init_wait_cnt > 0 && - cnd_timedwait_abs(&rk->rk_init_cnd, &rk->rk_init_lock, &tspec) == - thrd_success) + cnd_timedwait_abs(&rk->rk_init_cnd, &rk->rk_init_lock, + abs_timeout) == thrd_success) ; ret = rk->rk_init_wait_cnt; mtx_unlock(&rk->rk_init_lock); @@ -2044,10 +2189,8 @@ static int rd_kafka_init_wait(rd_kafka_t *rk, int timeout_ms) { * Main loop for Kafka handler thread. */ static int rd_kafka_thread_main(void *arg) { - rd_kafka_t *rk = arg; - rd_kafka_timer_t tmr_1s = RD_ZERO_INIT; - rd_kafka_timer_t tmr_stats_emit = RD_ZERO_INIT; - rd_kafka_timer_t tmr_metadata_refresh = RD_ZERO_INIT; + rd_kafka_t *rk = arg; + rd_kafka_timer_t tmr_stats_emit = RD_ZERO_INIT; rd_kafka_set_thread_name("main"); rd_kafka_set_thread_sysname("rdk:main"); @@ -2062,14 +2205,14 @@ static int rd_kafka_thread_main(void *arg) { rd_kafka_wrunlock(rk); /* 1 second timer for topic scan and connection checking. */ - rd_kafka_timer_start(&rk->rk_timers, &tmr_1s, 1000000, + rd_kafka_timer_start(&rk->rk_timers, &rk->one_s_tmr, 1000000, rd_kafka_1s_tmr_cb, NULL); if (rk->rk_conf.stats_interval_ms) rd_kafka_timer_start(&rk->rk_timers, &tmr_stats_emit, rk->rk_conf.stats_interval_ms * 1000ll, rd_kafka_stats_emit_tmr_cb, NULL); if (rk->rk_conf.metadata_refresh_interval_ms > 0) - rd_kafka_timer_start(&rk->rk_timers, &tmr_metadata_refresh, + rd_kafka_timer_start(&rk->rk_timers, &rk->metadata_refresh_tmr, rk->rk_conf.metadata_refresh_interval_ms * 1000ll, rd_kafka_metadata_refresh_cb, NULL); @@ -2090,7 +2233,10 @@ static int rd_kafka_thread_main(void *arg) { RD_KAFKA_CGRP_STATE_TERM)))) { rd_ts_t sleeptime = rd_kafka_timers_next( &rk->rk_timers, 1000 * 1000 /*1s*/, 1 /*lock*/); - rd_kafka_q_serve(rk->rk_ops, (int)(sleeptime / 1000), 0, + /* Use ceiling division to avoid calling serve with a 0 ms + * timeout in a tight loop until 1 ms has passed. 
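+                 * E.g. a sleeptime of 500 us gives (500 + 999) / 1000 = 1 ms
+                 * rather than 0 ms.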
*/ + int timeout_ms = (sleeptime + 999) / 1000; + rd_kafka_q_serve(rk->rk_ops, timeout_ms, 0, RD_KAFKA_Q_CB_CALLBACK, NULL, NULL); if (rk->rk_cgrp) /* FIXME: move to timer-triggered */ rd_kafka_cgrp_serve(rk->rk_cgrp); @@ -2106,10 +2252,10 @@ static int rd_kafka_thread_main(void *arg) { rd_kafka_q_disable(rk->rk_ops); rd_kafka_q_purge(rk->rk_ops); - rd_kafka_timer_stop(&rk->rk_timers, &tmr_1s, 1); + rd_kafka_timer_stop(&rk->rk_timers, &rk->one_s_tmr, 1); if (rk->rk_conf.stats_interval_ms) rd_kafka_timer_stop(&rk->rk_timers, &tmr_stats_emit, 1); - rd_kafka_timer_stop(&rk->rk_timers, &tmr_metadata_refresh, 1); + rd_kafka_timer_stop(&rk->rk_timers, &rk->metadata_refresh_tmr, 1); /* Synchronise state */ rd_kafka_wrlock(rk); @@ -2222,8 +2368,15 @@ rd_kafka_t *rd_kafka_new(rd_kafka_type_t type, rd_interval_init(&rk->rk_suppress.sparse_connect_random); mtx_init(&rk->rk_suppress.sparse_connect_lock, mtx_plain); + mtx_init(&rk->rk_telemetry.lock, mtx_plain); + cnd_init(&rk->rk_telemetry.termination_cnd); + rd_atomic64_init(&rk->rk_ts_last_poll, rk->rk_ts_created); rd_atomic32_init(&rk->rk_flushing, 0); + rd_atomic32_init(&rk->rk_broker_cnt, 0); + rd_atomic32_init(&rk->rk_logical_broker_cnt, 0); + rd_atomic32_init(&rk->rk_broker_up_cnt, 0); + rd_atomic32_init(&rk->rk_broker_down_cnt, 0); rk->rk_rep = rd_kafka_q_new(rk); rk->rk_ops = rd_kafka_q_new(rk); @@ -2236,6 +2389,7 @@ rd_kafka_t *rd_kafka_new(rd_kafka_type_t type, rk->rk_logq->rkq_opaque = rk; } + rd_list_init(&rk->additional_brokerlists, 0, rd_free); TAILQ_INIT(&rk->rk_brokers); TAILQ_INIT(&rk->rk_topics); rd_kafka_timers_init(&rk->rk_timers, rk, rk->rk_ops); @@ -2243,6 +2397,8 @@ rd_kafka_t *rd_kafka_new(rd_kafka_type_t type, rd_kafka_coord_cache_init(&rk->rk_coord_cache, rk->rk_conf.metadata_max_age_ms); rd_kafka_coord_reqs_init(rk); + rd_list_init(&rk->wait_decommissioned_thrds, 0, NULL); + rd_list_init(&rk->wait_decommissioned_brokers, 0, NULL); if (rk->rk_conf.dr_cb || rk->rk_conf.dr_msg_cb) rk->rk_drmode = RD_KAFKA_DR_MODE_CB; @@ -2275,11 +2431,22 @@ rd_kafka_t *rd_kafka_new(rd_kafka_type_t type, #if WITH_OAUTHBEARER_OIDC if (rk->rk_conf.sasl.oauthbearer.method == RD_KAFKA_SASL_OAUTHBEARER_METHOD_OIDC && - !rk->rk_conf.sasl.oauthbearer.token_refresh_cb) - rd_kafka_conf_set_oauthbearer_token_refresh_cb( - &rk->rk_conf, rd_kafka_oidc_token_refresh_cb); + !rk->rk_conf.sasl.oauthbearer.token_refresh_cb) { + /* Use JWT bearer */ + if (rk->rk_conf.sasl.oauthbearer.grant_type == + RD_KAFKA_SASL_OAUTHBEARER_GRANT_TYPE_CLIENT_CREDENTIALS) { + rd_kafka_conf_set_oauthbearer_token_refresh_cb( + &rk->rk_conf, + rd_kafka_oidc_token_client_credentials_refresh_cb); + } else { + rd_kafka_conf_set_oauthbearer_token_refresh_cb( + &rk->rk_conf, + rd_kafka_oidc_token_jwt_bearer_refresh_cb); + } + } #endif + rk->rk_controllerid = -1; /* Admin client defaults */ @@ -2412,8 +2579,9 @@ rd_kafka_t *rd_kafka_new(rd_kafka_type_t type, if (RD_KAFKAP_STR_LEN(rk->rk_group_id) > 0) { /* Create consumer group handle */ - rk->rk_cgrp = rd_kafka_cgrp_new(rk, rk->rk_group_id, - rk->rk_client_id); + rk->rk_cgrp = rd_kafka_cgrp_new( + rk, rk->rk_conf.group_protocol, rk->rk_group_id, + rk->rk_client_id); rk->rk_consumer.q = rd_kafka_q_keep(rk->rk_cgrp->rkcg_q); } else { @@ -2421,6 +2589,29 @@ rd_kafka_t *rd_kafka_new(rd_kafka_type_t type, rk->rk_consumer.q = rd_kafka_q_keep(rk->rk_rep); } + rd_avg_init( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_poll_idle_ratio, + RD_AVG_GAUGE, 0, 1, 2, rk->rk_conf.enable_metrics_push); + rd_avg_init( + 
&rk->rk_telemetry.rd_avg_current.rk_avg_poll_idle_ratio, + RD_AVG_GAUGE, 0, 1, 2, rk->rk_conf.enable_metrics_push); + rd_avg_init( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_rebalance_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, + rk->rk_conf.enable_metrics_push); + rd_avg_init( + &rk->rk_telemetry.rd_avg_current.rk_avg_rebalance_latency, + RD_AVG_GAUGE, 0, 900000 * 1000, 2, + rk->rk_conf.enable_metrics_push); + rd_avg_init( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_commit_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, + rk->rk_conf.enable_metrics_push); + rd_avg_init( + &rk->rk_telemetry.rd_avg_current.rk_avg_commit_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, + rk->rk_conf.enable_metrics_push); + } else if (type == RD_KAFKA_PRODUCER) { rk->rk_eos.transactional_id = rd_kafkap_str_new(rk->rk_conf.eos.transactional_id, -1); @@ -2499,7 +2690,8 @@ rd_kafka_t *rd_kafka_new(rd_kafka_type_t type, /* Add initial list of brokers from configuration */ if (rk->rk_conf.brokerlist) { - if (rd_kafka_brokers_add0(rk, rk->rk_conf.brokerlist) == 0) + if (rd_kafka_brokers_add0(rk, rk->rk_conf.brokerlist, + rd_true) == 0) rd_kafka_op_err(rk, RD_KAFKA_RESP_ERR__ALL_BROKERS_DOWN, "No brokers configured"); } @@ -2614,7 +2806,35 @@ fail: return NULL; } +/** + * Schedules a rebootstrap of the cluster immediately. + */ +void rd_kafka_rebootstrap(rd_kafka_t *rk) { + if (rk->rk_conf.metadata_recovery_strategy == + RD_KAFKA_METADATA_RECOVERY_STRATEGY_NONE) + return; + rd_kafka_timer_start_oneshot(&rk->rk_timers, &rk->rebootstrap_tmr, + rd_true /*restart*/, 0, + rd_kafka_rebootstrap_tmr_cb, NULL); +} + +/** + * Restarts rebootstrap timer with the configured interval. + * + * @locks none + * @locality any + */ +void rd_kafka_rebootstrap_tmr_restart(rd_kafka_t *rk) { + if (rk->rk_conf.metadata_recovery_strategy == + RD_KAFKA_METADATA_RECOVERY_STRATEGY_NONE) + return; + + rd_kafka_timer_start_oneshot( + &rk->rk_timers, &rk->rebootstrap_tmr, rd_true /*restart*/, + rk->rk_conf.metadata_recovery_rebootstrap_trigger_ms * 1000LL, + rd_kafka_rebootstrap_tmr_cb, NULL); +} /** * Counts usage of the legacy/simple consumer (rd_kafka_consume_start() with @@ -2701,7 +2921,8 @@ static RD_UNUSED int rd_kafka_consume_start0(rd_kafka_topic_t *rkt, return -1; } - rd_kafka_toppar_op_fetch_start(rktp, offset, rkq, RD_KAFKA_NO_REPLYQ); + rd_kafka_toppar_op_fetch_start(rktp, RD_KAFKA_FETCH_POS(offset, -1), + rkq, RD_KAFKA_NO_REPLYQ); rd_kafka_toppar_destroy(rktp); @@ -2813,7 +3034,8 @@ rd_kafka_resp_err_t rd_kafka_seek(rd_kafka_topic_t *app_rkt, replyq = RD_KAFKA_REPLYQ(tmpq, 0); } - if ((err = rd_kafka_toppar_op_seek(rktp, offset, replyq))) { + if ((err = rd_kafka_toppar_op_seek(rktp, RD_KAFKA_FETCH_POS(offset, -1), + replyq))) { if (tmpq) rd_kafka_q_destroy_owner(tmpq); rd_kafka_toppar_destroy(rktp); @@ -2865,8 +3087,9 @@ rd_kafka_seek_partitions(rd_kafka_t *rk, continue; } - err = rd_kafka_toppar_op_seek(rktp, rktpar->offset, - RD_KAFKA_REPLYQ(tmpq, 0)); + err = rd_kafka_toppar_op_seek( + rktp, rd_kafka_topic_partition_get_fetch_pos(rktpar), + RD_KAFKA_REPLYQ(tmpq, 0)); if (err) { rktpar->err = err; } else { @@ -2884,7 +3107,8 @@ rd_kafka_seek_partitions(rd_kafka_t *rk, while (cnt > 0) { rd_kafka_op_t *rko; - rko = rd_kafka_q_pop(tmpq, rd_timeout_remains(abs_timeout), 0); + rko = + rd_kafka_q_pop(tmpq, rd_timeout_remains_us(abs_timeout), 0); if (!rko) { rd_kafka_q_destroy_owner(tmpq); @@ -3023,13 +3247,12 @@ static rd_kafka_op_res_t rd_kafka_consume_callback0( struct consume_ctx ctx = {.consume_cb = consume_cb, .opaque = opaque}; 
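        /* ctx forwards the application's consume_cb and opaque pointer
         * through rd_kafka_q_serve() into rd_kafka_consume_cb(). */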
rd_kafka_op_res_t res; - if (timeout_ms) - rd_kafka_app_poll_blocking(rkq->rkq_rk); + rd_kafka_app_poll_start(rkq->rkq_rk, rkq, 0, timeout_ms); res = rd_kafka_q_serve(rkq, timeout_ms, max_cnt, RD_KAFKA_Q_CB_RETURN, rd_kafka_consume_cb, &ctx); - rd_kafka_app_polled(rkq->rkq_rk); + rd_kafka_app_polled(rkq->rkq_rk, rkq); return res; } @@ -3092,16 +3315,15 @@ static rd_kafka_message_t * rd_kafka_consume0(rd_kafka_t *rk, rd_kafka_q_t *rkq, int timeout_ms) { rd_kafka_op_t *rko; rd_kafka_message_t *rkmessage = NULL; - rd_ts_t abs_timeout = rd_timeout_init(timeout_ms); + rd_ts_t now = rd_clock(); + rd_ts_t abs_timeout = rd_timeout_init0(now, timeout_ms); - if (timeout_ms) - rd_kafka_app_poll_blocking(rk); + rd_kafka_app_poll_start(rk, rkq, now, timeout_ms); rd_kafka_yield_thread = 0; while (( rko = rd_kafka_q_pop(rkq, rd_timeout_remains_us(abs_timeout), 0))) { rd_kafka_op_res_t res; - res = rd_kafka_poll_cb(rk, rkq, rko, RD_KAFKA_Q_CB_RETURN, NULL); @@ -3113,7 +3335,7 @@ rd_kafka_consume0(rd_kafka_t *rk, rd_kafka_q_t *rkq, int timeout_ms) { /* Callback called rd_kafka_yield(), we must * stop dispatching the queue and return. */ rd_kafka_set_last_error(RD_KAFKA_RESP_ERR__INTR, EINTR); - rd_kafka_app_polled(rk); + rd_kafka_app_polled(rk, rkq); return NULL; } @@ -3125,7 +3347,7 @@ rd_kafka_consume0(rd_kafka_t *rk, rd_kafka_q_t *rkq, int timeout_ms) { /* Timeout reached with no op returned. */ rd_kafka_set_last_error(RD_KAFKA_RESP_ERR__TIMED_OUT, ETIMEDOUT); - rd_kafka_app_polled(rk); + rd_kafka_app_polled(rk, rkq); return NULL; } @@ -3140,7 +3362,7 @@ rd_kafka_consume0(rd_kafka_t *rk, rd_kafka_q_t *rkq, int timeout_ms) { rd_kafka_set_last_error(0, 0); - rd_kafka_app_polled(rk); + rd_kafka_app_polled(rk, rkq); return rkmessage; } @@ -3419,10 +3641,12 @@ rd_kafka_position(rd_kafka_t *rk, rd_kafka_topic_partition_list_t *partitions) { } rd_kafka_toppar_lock(rktp); - rktpar->offset = rktp->rktp_app_offset; - rktpar->err = RD_KAFKA_RESP_ERR_NO_ERROR; + rd_kafka_topic_partition_set_from_fetch_pos(rktpar, + rktp->rktp_app_pos); rd_kafka_toppar_unlock(rktp); rd_kafka_toppar_destroy(rktp); + + rktpar->err = RD_KAFKA_RESP_ERR_NO_ERROR; } return RD_KAFKA_RESP_ERR_NO_ERROR; @@ -3449,6 +3673,7 @@ static void rd_kafka_query_wmark_offsets_resp_cb(rd_kafka_t *rk, struct _query_wmark_offsets_state *state; rd_kafka_topic_partition_list_t *offsets; rd_kafka_topic_partition_t *rktpar; + int actions = 0; if (err == RD_KAFKA_RESP_ERR__DESTROY) { /* 'state' has gone out of scope when query_watermark..() @@ -3460,7 +3685,15 @@ static void rd_kafka_query_wmark_offsets_resp_cb(rd_kafka_t *rk, offsets = rd_kafka_topic_partition_list_new(1); err = rd_kafka_handle_ListOffsets(rk, rkb, err, rkbuf, request, offsets, - NULL); + &actions); + + if (actions & RD_KAFKA_ERR_ACTION_REFRESH) { + /* Remove its cache in case the topic isn't a known topic. */ + rd_kafka_wrlock(rk); + rd_kafka_metadata_cache_delete_by_name(rk, state->topic); + rd_kafka_wrunlock(rk); + } + if (err == RD_KAFKA_RESP_ERR__IN_PROGRESS) { rd_kafka_topic_partition_list_destroy(offsets); return; /* Retrying */ @@ -3481,14 +3714,18 @@ static void rd_kafka_query_wmark_offsets_resp_cb(rd_kafka_t *rk, /* FALLTHRU */ } - /* Partition not seen in response. */ - if (!(rktpar = rd_kafka_topic_partition_list_find(offsets, state->topic, - state->partition))) + rktpar = rd_kafka_topic_partition_list_find(offsets, state->topic, + state->partition); + if (!rktpar && err > RD_KAFKA_RESP_ERR__END) { + /* Partition not seen in response, + * not a local error. 
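+                 * (Local error codes, err <= RD_KAFKA_RESP_ERR__END,
+                 * already describe the failure and are left untouched.)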
*/ err = RD_KAFKA_RESP_ERR__BAD_MSG; - else if (rktpar->err) - err = rktpar->err; - else - state->offsets[state->offidx] = rktpar->offset; + } else if (rktpar) { + if (rktpar->err) + err = rktpar->err; + else + state->offsets[state->offidx] = rktpar->offset; + } state->offidx++; @@ -3544,26 +3781,25 @@ rd_kafka_resp_err_t rd_kafka_query_watermark_offsets(rd_kafka_t *rk, state.ts_end = ts_end; state.state_version = rd_kafka_brokers_get_state_version(rk); - rktpar->offset = RD_KAFKA_OFFSET_BEGINNING; rd_kafka_ListOffsetsRequest( leader->rkb, partitions, RD_KAFKA_REPLYQ(rkq, 0), - rd_kafka_query_wmark_offsets_resp_cb, &state); + rd_kafka_query_wmark_offsets_resp_cb, timeout_ms, &state); rktpar->offset = RD_KAFKA_OFFSET_END; rd_kafka_ListOffsetsRequest( leader->rkb, partitions, RD_KAFKA_REPLYQ(rkq, 0), - rd_kafka_query_wmark_offsets_resp_cb, &state); + rd_kafka_query_wmark_offsets_resp_cb, timeout_ms, &state); rd_kafka_topic_partition_list_destroy(partitions); rd_list_destroy(&leaders); /* Wait for reply (or timeout) */ - while (state.err == RD_KAFKA_RESP_ERR__IN_PROGRESS && - rd_kafka_q_serve(rkq, 100, 0, RD_KAFKA_Q_CB_CALLBACK, - rd_kafka_poll_cb, - NULL) != RD_KAFKA_OP_RES_YIELD) - ; + while (state.err == RD_KAFKA_RESP_ERR__IN_PROGRESS) { + rd_kafka_q_serve(rkq, RD_POLL_INFINITE, 0, + RD_KAFKA_Q_CB_CALLBACK, rd_kafka_poll_cb, + NULL); + } rd_kafka_q_destroy_owner(rkq); @@ -3632,6 +3868,7 @@ static void rd_kafka_get_offsets_for_times_resp_cb(rd_kafka_t *rk, rd_kafka_buf_t *request, void *opaque) { struct _get_offsets_for_times *state; + int actions = 0; if (err == RD_KAFKA_RESP_ERR__DESTROY) { /* 'state' has gone out of scope when offsets_for_times() @@ -3642,10 +3879,22 @@ static void rd_kafka_get_offsets_for_times_resp_cb(rd_kafka_t *rk, state = opaque; err = rd_kafka_handle_ListOffsets(rk, rkb, err, rkbuf, request, - state->results, NULL); + state->results, &actions); if (err == RD_KAFKA_RESP_ERR__IN_PROGRESS) return; /* Retrying */ + if (actions & RD_KAFKA_ERR_ACTION_REFRESH) { + rd_kafka_topic_partition_t *rktpar; + /* Remove its cache in case the topic isn't a known topic. */ + rd_kafka_wrlock(rk); + RD_KAFKA_TPLIST_FOREACH(rktpar, state->results) { + if (rktpar->err) + rd_kafka_metadata_cache_delete_by_name( + rk, rktpar->topic); + } + rd_kafka_wrunlock(rk); + } + /* Retry if no broker connection is available yet. 
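         * (A state change may mean a broker has come up, so the request
         * is re-sent instead of being failed outright.)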
*/ if (err == RD_KAFKA_RESP_ERR__TRANSPORT && rkb && rd_kafka_brokers_wait_state_change( @@ -3703,7 +3952,7 @@ rd_kafka_offsets_for_times(rd_kafka_t *rk, state.wait_reply++; rd_kafka_ListOffsetsRequest( leader->rkb, leader->partitions, RD_KAFKA_REPLYQ(rkq, 0), - rd_kafka_get_offsets_for_times_resp_cb, &state); + rd_kafka_get_offsets_for_times_resp_cb, timeout_ms, &state); } rd_list_destroy(&leaders); @@ -3760,9 +4009,10 @@ rd_kafka_op_res_t rd_kafka_poll_cb(rd_kafka_t *rk, cb_type == RD_KAFKA_Q_CB_FORCE_RETURN) return RD_KAFKA_OP_RES_PASS; /* Dont handle here */ else { - struct consume_ctx ctx = {.consume_cb = - rk->rk_conf.consume_cb, - .opaque = rk->rk_conf.opaque}; + rkq->rkq_ts_last_poll_end = rd_clock(); + struct consume_ctx ctx = {.consume_cb = + rk->rk_conf.consume_cb, + .opaque = rk->rk_conf.opaque}; return rd_kafka_consume_cb(rk, rkq, rko, cb_type, &ctx); } @@ -3912,6 +4162,9 @@ rd_kafka_op_res_t rd_kafka_poll_cb(rd_kafka_t *rk, case RD_KAFKA_OP_TERMINATE: /* nop: just a wake-up */ res = RD_KAFKA_OP_RES_YIELD; + if (rko->rko_u.terminated.cb) { + rko->rko_u.terminated.cb(rk, rko->rko_u.terminated.rkb); + } rd_kafka_op_destroy(rko); break; @@ -3919,6 +4172,7 @@ rd_kafka_op_res_t rd_kafka_poll_cb(rd_kafka_t *rk, case RD_KAFKA_OP_DELETETOPICS: case RD_KAFKA_OP_CREATEPARTITIONS: case RD_KAFKA_OP_ALTERCONFIGS: + case RD_KAFKA_OP_INCREMENTALALTERCONFIGS: case RD_KAFKA_OP_DESCRIBECONFIGS: case RD_KAFKA_OP_DELETERECORDS: case RD_KAFKA_OP_DELETEGROUPS: @@ -3926,6 +4180,7 @@ rd_kafka_op_res_t rd_kafka_poll_cb(rd_kafka_t *rk, case RD_KAFKA_OP_CREATEACLS: case RD_KAFKA_OP_DESCRIBEACLS: case RD_KAFKA_OP_DELETEACLS: + case RD_KAFKA_OP_LISTOFFSETS: /* Calls op_destroy() from worker callback, * when the time comes. */ res = rd_kafka_op_call(rk, rkq, rko); @@ -3952,6 +4207,19 @@ rd_kafka_op_res_t rd_kafka_poll_cb(rd_kafka_t *rk, rd_kafka_purge(rk, rko->rko_u.purge.flags); break; + case RD_KAFKA_OP_SET_TELEMETRY_BROKER: + rd_kafka_set_telemetry_broker_maybe( + rk, rko->rko_u.telemetry_broker.rkb); + break; + + case RD_KAFKA_OP_TERMINATE_TELEMETRY: + rd_kafka_telemetry_schedule_termination(rko->rko_rk); + break; + + case RD_KAFKA_OP_METADATA_UPDATE: + res = rd_kafka_metadata_update_op(rk, rko->rko_u.metadata.mdi); + break; + default: /* If op has a callback set (e.g., OAUTHBEARER_REFRESH), * call it. 
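         * (Such ops carry their handler within the op itself, so no
         * dedicated switch case is needed for each of them.)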
*/ @@ -3974,14 +4242,9 @@ rd_kafka_op_res_t rd_kafka_poll_cb(rd_kafka_t *rk, int rd_kafka_poll(rd_kafka_t *rk, int timeout_ms) { int r; - if (timeout_ms) - rd_kafka_app_poll_blocking(rk); - - r = rd_kafka_q_serve(rk->rk_rep, timeout_ms, 0, RD_KAFKA_Q_CB_CALLBACK, - rd_kafka_poll_cb, NULL); - - rd_kafka_app_polled(rk); - + r = rd_kafka_q_serve_maybe_consume(rk->rk_rep, timeout_ms, 0, + RD_KAFKA_Q_CB_CALLBACK, + rd_kafka_poll_cb, NULL); return r; } @@ -3989,13 +4252,10 @@ int rd_kafka_poll(rd_kafka_t *rk, int timeout_ms) { rd_kafka_event_t *rd_kafka_queue_poll(rd_kafka_queue_t *rkqu, int timeout_ms) { rd_kafka_op_t *rko; - if (timeout_ms) - rd_kafka_app_poll_blocking(rkqu->rkqu_rk); + rko = rd_kafka_q_pop_serve_maybe_consume( + rkqu->rkqu_q, rd_timeout_us(timeout_ms), 0, RD_KAFKA_Q_CB_EVENT, + rd_kafka_poll_cb, NULL); - rko = rd_kafka_q_pop_serve(rkqu->rkqu_q, rd_timeout_us(timeout_ms), 0, - RD_KAFKA_Q_CB_EVENT, rd_kafka_poll_cb, NULL); - - rd_kafka_app_polled(rkqu->rkqu_rk); if (!rko) return NULL; @@ -4006,14 +4266,9 @@ rd_kafka_event_t *rd_kafka_queue_poll(rd_kafka_queue_t *rkqu, int timeout_ms) { int rd_kafka_queue_poll_callback(rd_kafka_queue_t *rkqu, int timeout_ms) { int r; - if (timeout_ms) - rd_kafka_app_poll_blocking(rkqu->rkqu_rk); - - r = rd_kafka_q_serve(rkqu->rkqu_q, timeout_ms, 0, - RD_KAFKA_Q_CB_CALLBACK, rd_kafka_poll_cb, NULL); - - rd_kafka_app_polled(rkqu->rkqu_rk); - + r = rd_kafka_q_serve_maybe_consume(rkqu->rkqu_q, timeout_ms, 0, + RD_KAFKA_Q_CB_CALLBACK, + rd_kafka_poll_cb, NULL); return r; } @@ -4570,6 +4825,26 @@ rd_kafka_consumer_group_state_code(const char *name) { return RD_KAFKA_CONSUMER_GROUP_STATE_UNKNOWN; } +static const char *rd_kafka_consumer_group_type_names[] = { + "Unknown", "Consumer", "Classic"}; + +const char * +rd_kafka_consumer_group_type_name(rd_kafka_consumer_group_type_t type) { + if (type < 0 || type >= RD_KAFKA_CONSUMER_GROUP_TYPE__CNT) + return NULL; + return rd_kafka_consumer_group_type_names[type]; +} + +rd_kafka_consumer_group_type_t +rd_kafka_consumer_group_type_code(const char *name) { + size_t i; + for (i = 0; i < RD_KAFKA_CONSUMER_GROUP_TYPE__CNT; i++) { + if (!rd_strcasecmp(rd_kafka_consumer_group_type_names[i], name)) + return i; + } + return RD_KAFKA_CONSUMER_GROUP_TYPE_UNKNOWN; +} + static void rd_kafka_DescribeGroups_resp_cb(rd_kafka_t *rk, rd_kafka_broker_t *rkb, rd_kafka_resp_err_t err, @@ -4624,8 +4899,8 @@ static void rd_kafka_DescribeGroups_resp_cb(rd_kafka_t *rk, goto err; } + gi->broker.id = rkb->rkb_nodeid; rd_kafka_broker_lock(rkb); - gi->broker.id = rkb->rkb_nodeid; gi->broker.host = rd_strdup(rkb->rkb_origname); gi->broker.port = rkb->rkb_port; rd_kafka_broker_unlock(rkb); @@ -4651,8 +4926,8 @@ static void rd_kafka_DescribeGroups_resp_cb(rd_kafka_t *rk, rd_kafka_buf_read_str(reply, &MemberId); rd_kafka_buf_read_str(reply, &ClientId); rd_kafka_buf_read_str(reply, &ClientHost); - rd_kafka_buf_read_bytes(reply, &Meta); - rd_kafka_buf_read_bytes(reply, &Assignment); + rd_kafka_buf_read_kbytes(reply, &Meta); + rd_kafka_buf_read_kbytes(reply, &Assignment); mi->member_id = RD_KAFKAP_STR_DUP(&MemberId); mi->client_id = RD_KAFKAP_STR_DUP(&ClientId); @@ -4754,7 +5029,9 @@ static void rd_kafka_ListGroups_resp_cb(rd_kafka_t *rk, state->wait_cnt++; error = rd_kafka_DescribeGroupsRequest( - rkb, 0, grps, i, RD_KAFKA_REPLYQ(state->q, 0), + rkb, 0, grps, i, + rd_false /* don't include authorized operations */, + RD_KAFKA_REPLYQ(state->q, 0), rd_kafka_DescribeGroups_resp_cb, state); if (error) { rd_kafka_DescribeGroups_resp_cb( @@ -4822,6 
+5099,9 @@ rd_kafka_list_groups(rd_kafka_t *rk, /* Query each broker for its list of groups */ rd_kafka_rdlock(rk); TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { + if (rd_kafka_broker_or_instance_terminating(rkb)) + continue; + rd_kafka_error_t *error; rd_kafka_broker_lock(rkb); if (rkb->rkb_nodeid == -1 || RD_KAFKA_BROKER_IS_LOGICAL(rkb)) { @@ -4833,7 +5113,7 @@ rd_kafka_list_groups(rd_kafka_t *rk, state.wait_cnt++; rkb_cnt++; error = rd_kafka_ListGroupsRequest( - rkb, 0, NULL, 0, RD_KAFKA_REPLYQ(state.q, 0), + rkb, 0, NULL, 0, NULL, 0, RD_KAFKA_REPLYQ(state.q, 0), rd_kafka_ListGroups_resp_cb, &state); if (error) { rd_kafka_ListGroups_resp_cb(rk, rkb, @@ -4933,13 +5213,8 @@ const char *rd_kafka_get_debug_contexts(void) { int rd_kafka_path_is_dir(const char *path) { -#ifdef _WIN32 - struct _stat st; - return (_stat(path, &st) == 0 && st.st_mode & S_IFDIR); -#else - struct stat st; - return (stat(path, &st) == 0 && S_ISDIR(st.st_mode)); -#endif + rd_bool_t is_dir; + return rd_file_stat(path, &is_dir) && is_dir; } @@ -5009,3 +5284,154 @@ int rd_kafka_errno(void) { int rd_kafka_unittest(void) { return rd_unittest(); } + + +/** + * Creates a new UUID. + * + * @return A newly allocated UUID. + */ +rd_kafka_Uuid_t *rd_kafka_Uuid_new(int64_t most_significant_bits, + int64_t least_significant_bits) { + rd_kafka_Uuid_t *uuid = rd_calloc(1, sizeof(rd_kafka_Uuid_t)); + uuid->most_significant_bits = most_significant_bits; + uuid->least_significant_bits = least_significant_bits; + return uuid; +} + +/** + * Returns a newly allocated copy of the given UUID. + * + * @param uuid UUID to copy. + * @return Copy of the provided UUID. + * + * @remark Dynamically allocated. Deallocate (free) after use. + */ +rd_kafka_Uuid_t *rd_kafka_Uuid_copy(const rd_kafka_Uuid_t *uuid) { + rd_kafka_Uuid_t *copy_uuid = rd_kafka_Uuid_new( + uuid->most_significant_bits, uuid->least_significant_bits); + if (*uuid->base64str) + memcpy(copy_uuid->base64str, uuid->base64str, 23); + return copy_uuid; +} + +/** + * Returns a new non-cryptographically-secure UUIDv4 (random). + * + * @return A UUIDv4. + * + * @remark Returned by value, so unlike rd_kafka_Uuid_new() it does not need + * to be destroyed. + */ +rd_kafka_Uuid_t rd_kafka_Uuid_random() { + int i; + unsigned char rand_values_bytes[16] = {0}; + uint64_t *rand_values_uint64 = (uint64_t *)rand_values_bytes; + unsigned char *rand_values_app; + rd_kafka_Uuid_t ret = RD_KAFKA_UUID_ZERO; + for (i = 0; i < 16; i += 2) { + uint16_t rand_uint16 = (uint16_t)rd_jitter(0, INT16_MAX - 1); + /* No need to convert endianness here because it's still only + * a random value. */ + rand_values_app = (unsigned char *)&rand_uint16; + rand_values_bytes[i] |= rand_values_app[0]; + rand_values_bytes[i + 1] |= rand_values_app[1]; + } + + rand_values_bytes[6] &= 0x0f; /* clear version */ + rand_values_bytes[6] |= 0x40; /* version 4 */ + rand_values_bytes[8] &= 0x3f; /* clear variant */ + rand_values_bytes[8] |= 0x80; /* IETF variant */ + + ret.most_significant_bits = be64toh(rand_values_uint64[0]); + ret.least_significant_bits = be64toh(rand_values_uint64[1]); + return ret; } + +/** + * @brief Destroy the provided uuid. + * + * @param uuid UUID + */ +void rd_kafka_Uuid_destroy(rd_kafka_Uuid_t *uuid) { + rd_free(uuid); +} + +/** + * @brief Computes the canonical string encoding for the given UUID. + * Mainly useful for testing. + * + * @param uuid UUID for which canonical encoding is required. + * + * @return Canonically encoded string for the given UUID. + * + * @remark Must be freed after use.
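+ *
+ * Editor's sketch (not upstream documentation); this is an internal
+ * helper, and releasing the string with rd_free() is an assumption
+ * based on the rd_calloc() allocation in the implementation:
+ * @code
+ * rd_kafka_Uuid_t uuid = rd_kafka_Uuid_random();
+ * char *str = rd_kafka_Uuid_str(&uuid);
+ * printf("uuid: %s\n", str);
+ * rd_free(str);
+ * @endcode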
+ */ +char *rd_kafka_Uuid_str(const rd_kafka_Uuid_t *uuid) { + int i, j; + unsigned char bytes[16]; + char *ret = rd_calloc(37, sizeof(*ret)); + + for (i = 0; i < 8; i++) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + j = 7 - i; +#elif __BYTE_ORDER == __BIG_ENDIAN + j = i; +#endif + bytes[i] = (uuid->most_significant_bits >> (8 * j)) & 0xFF; + bytes[8 + i] = (uuid->least_significant_bits >> (8 * j)) & 0xFF; + } + + rd_snprintf(ret, 37, + "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%" + "02x%02x%02x", + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], + bytes[6], bytes[7], bytes[8], bytes[9], bytes[10], + bytes[11], bytes[12], bytes[13], bytes[14], bytes[15]); + return ret; +} + +const char *rd_kafka_Uuid_base64str(const rd_kafka_Uuid_t *uuid) { + if (*uuid->base64str) + return uuid->base64str; + + rd_chariov_t in_base64; + char *out_base64_str; + char *uuid_bytes; + uint64_t input_uuid[2]; + + input_uuid[0] = htobe64(uuid->most_significant_bits); + input_uuid[1] = htobe64(uuid->least_significant_bits); + uuid_bytes = (char *)input_uuid; + in_base64.ptr = uuid_bytes; + in_base64.size = sizeof(uuid->most_significant_bits) + + sizeof(uuid->least_significant_bits); + + out_base64_str = rd_base64_encode_str(&in_base64); + if (!out_base64_str) + return NULL; + + rd_strlcpy((char *)uuid->base64str, out_base64_str, + 23 /* Removing extra ('=') padding */); + rd_free(out_base64_str); + return uuid->base64str; +} + +unsigned int rd_kafka_Uuid_hash(const rd_kafka_Uuid_t *uuid) { + unsigned char bytes[16]; + memcpy(bytes, &uuid->most_significant_bits, 8); + memcpy(&bytes[8], &uuid->least_significant_bits, 8); + return rd_bytes_hash(bytes, 16); +} + +unsigned int rd_kafka_Uuid_map_hash(const void *key) { + return rd_kafka_Uuid_hash(key); +} + +int64_t rd_kafka_Uuid_least_significant_bits(const rd_kafka_Uuid_t *uuid) { + return uuid->least_significant_bits; +} + + +int64_t rd_kafka_Uuid_most_significant_bits(const rd_kafka_Uuid_t *uuid) { + return uuid->most_significant_bits; +} diff --git a/src/third_party/librdkafka/dist/src/rdkafka.h b/src/third_party/librdkafka/dist/src/rdkafka.h index d77216f0e83..583e57b7d61 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka.h +++ b/src/third_party/librdkafka/dist/src/rdkafka.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2022 Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -92,6 +93,7 @@ typedef SSIZE_T ssize_t; #define RD_DEPRECATED __attribute__((deprecated)) #if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) +#define RD_HAS_STATEMENT_EXPRESSIONS #define RD_FORMAT(...) __attribute__((format(__VA_ARGS__))) #else #define RD_FORMAT(...) @@ -165,7 +167,7 @@ typedef SSIZE_T ssize_t; * @remark This value should only be used during compile time, * for runtime checks of version use rd_kafka_version() */ -#define RD_KAFKA_VERSION 0x020002ff +#define RD_KAFKA_VERSION 0x020b00ff /** * @brief Returns the librdkafka version as integer. 
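Editor's note (not part of the upstream patch): RD_KAFKA_VERSION is interpreted as hex MM.mm.rr.xx (major, minor, revision, pre-release id, where 0xff denotes a final release), so the change above is the v2.0.2 -> v2.11.0 bump. A minimal sketch, using only public API, that compares the compile-time header version with the runtime library version:

    #include <stdio.h>
    #include <librdkafka/rdkafka.h>

    static void check_librdkafka_version(void) {
            int runtime = rd_kafka_version(); /* e.g. 0x020b00ff */
            if (runtime != RD_KAFKA_VERSION)
                    fprintf(stderr,
                            "librdkafka version skew: built against 0x%08x, "
                            "running 0x%08x (%s)\n",
                            (unsigned)RD_KAFKA_VERSION, (unsigned)runtime,
                            rd_kafka_version_str());
    }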
@@ -260,6 +262,9 @@ typedef struct rd_kafka_error_s rd_kafka_error_t; typedef struct rd_kafka_headers_s rd_kafka_headers_t; typedef struct rd_kafka_group_result_s rd_kafka_group_result_t; typedef struct rd_kafka_acl_result_s rd_kafka_acl_result_t; +typedef struct rd_kafka_Uuid_s rd_kafka_Uuid_t; +typedef struct rd_kafka_topic_partition_result_s + rd_kafka_topic_partition_result_t; /* @endcond */ @@ -283,7 +288,7 @@ typedef enum { RD_KAFKA_RESP_ERR__BAD_MSG = -199, /** Bad/unknown compression */ RD_KAFKA_RESP_ERR__BAD_COMPRESSION = -198, - /** Broker is going away */ + /** Broker is going away, together with client instance */ RD_KAFKA_RESP_ERR__DESTROY = -197, /** Generic failure */ RD_KAFKA_RESP_ERR__FAIL = -196, @@ -402,6 +407,13 @@ typedef enum { RD_KAFKA_RESP_ERR__NOOP = -141, /** No offset to automatically reset to */ RD_KAFKA_RESP_ERR__AUTO_OFFSET_RESET = -140, + /** Partition log truncation detected */ + RD_KAFKA_RESP_ERR__LOG_TRUNCATION = -139, + /** A different record in the batch was invalid + * and this message failed persisting. */ + RD_KAFKA_RESP_ERR__INVALID_DIFFERENT_RECORD = -138, + /** Broker is going away but client isn't terminating */ + RD_KAFKA_RESP_ERR__DESTROY_BROKER = -137, /** End internal error codes */ RD_KAFKA_RESP_ERR__END = -100, @@ -421,7 +433,9 @@ typedef enum { RD_KAFKA_RESP_ERR_INVALID_MSG_SIZE = 4, /** Leader not available */ RD_KAFKA_RESP_ERR_LEADER_NOT_AVAILABLE = 5, - /** Not leader for partition */ +/** Not leader for partition */ +#define RD_KAFKA_RESP_ERR_NOT_LEADER_OR_FOLLOWER \ + RD_KAFKA_RESP_ERR_NOT_LEADER_FOR_PARTITION RD_KAFKA_RESP_ERR_NOT_LEADER_FOR_PARTITION = 6, /** Request timed out */ RD_KAFKA_RESP_ERR_REQUEST_TIMED_OUT = 7, @@ -624,7 +638,27 @@ typedef enum { RD_KAFKA_RESP_ERR_FEATURE_UPDATE_FAILED = 96, /** Request principal deserialization failed during forwarding */ RD_KAFKA_RESP_ERR_PRINCIPAL_DESERIALIZATION_FAILURE = 97, - + /** Unknown Topic Id */ + RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_ID = 100, + /** The member epoch is fenced by the group coordinator */ + RD_KAFKA_RESP_ERR_FENCED_MEMBER_EPOCH = 110, + /** The instance ID is still used by another member in the + * consumer group */ + RD_KAFKA_RESP_ERR_UNRELEASED_INSTANCE_ID = 111, + /** The assignor or its version range is not supported by the consumer + * group */ + RD_KAFKA_RESP_ERR_UNSUPPORTED_ASSIGNOR = 112, + /** The member epoch is stale */ + RD_KAFKA_RESP_ERR_STALE_MEMBER_EPOCH = 113, + /** Client sent a push telemetry request with an invalid or outdated + * subscription ID. */ + RD_KAFKA_RESP_ERR_UNKNOWN_SUBSCRIPTION_ID = 117, + /** Client sent a push telemetry request larger than the maximum size + * the broker will accept. */ + RD_KAFKA_RESP_ERR_TELEMETRY_TOO_LARGE = 118, + /** Client metadata is stale, + * client should rebootstrap to obtain new metadata. */ + RD_KAFKA_RESP_ERR_REBOOTSTRAP_REQUIRED = 129, RD_KAFKA_RESP_ERR_END_ALL, } rd_kafka_resp_err_t; @@ -898,10 +932,11 @@ typedef struct rd_kafka_topic_partition_s { void *opaque; /**< Opaque value for application use */ rd_kafka_resp_err_t err; /**< Error code, depending on use. */ void *_private; /**< INTERNAL USE ONLY, - * INITIALIZE TO ZERO, DO NOT TOUCH */ + * INITIALIZE TO ZERO, DO NOT TOUCH, + * DO NOT COPY, DO NOT SHARE WITH OTHER + * rd_kafka_t INSTANCES. */ } rd_kafka_topic_partition_t; - /** * @brief Destroy a rd_kafka_topic_partition_t. * @remark This must not be called for elements in a topic partition list. 
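Editor's note (not part of the upstream patch): a hedged sketch of how the new RD_KAFKA_RESP_ERR__LOG_TRUNCATION code introduced above can surface on a consumer; the handling shown (log and continue) is illustrative only, and the application decides whether to seek or fail:

    #include <stdio.h>
    #include <librdkafka/rdkafka.h>

    static void poll_one(rd_kafka_t *rk) {
            rd_kafka_message_t *rkm = rd_kafka_consumer_poll(rk, 100);
            if (!rkm)
                    return; /* poll timeout */
            if (rkm->err == RD_KAFKA_RESP_ERR__LOG_TRUNCATION)
                    /* Partition log truncation detected (KIP-320). */
                    fprintf(stderr, "log truncation on %s [%d]: %s\n",
                            rd_kafka_topic_name(rkm->rkt),
                            (int)rkm->partition,
                            rd_kafka_message_errstr(rkm));
            rd_kafka_message_destroy(rkm);
    }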
@@ -910,6 +945,31 @@ RD_EXPORT void rd_kafka_topic_partition_destroy(rd_kafka_topic_partition_t *rktpar); +/** + * @brief Sets the offset leader epoch (use -1 to clear). + * + * @param rktpar Partition object. + * @param leader_epoch Offset leader epoch, use -1 to reset. + * + * @remark See KIP-320 for more information. + */ +RD_EXPORT +void rd_kafka_topic_partition_set_leader_epoch( + rd_kafka_topic_partition_t *rktpar, + int32_t leader_epoch); + +/** + * @returns the offset leader epoch, if relevant and known, + * else -1. + * + * @param rktpar Partition object. + * + * @remark See KIP-320 for more information. + */ +RD_EXPORT +int32_t rd_kafka_topic_partition_get_leader_epoch( + const rd_kafka_topic_partition_t *rktpar); + /** * @brief A growable list of Topic+Partitions. * @@ -920,7 +980,6 @@ typedef struct rd_kafka_topic_partition_list_s { rd_kafka_topic_partition_t *elems; /**< Element array[] */ } rd_kafka_topic_partition_list_t; - /** * @brief Create a new list/vector Topic+Partition container. * @@ -938,7 +997,6 @@ typedef struct rd_kafka_topic_partition_list_s { RD_EXPORT rd_kafka_topic_partition_list_t *rd_kafka_topic_partition_list_new(int size); - /** * @brief Free all resources used by the list and the list itself. */ @@ -1429,7 +1487,8 @@ typedef struct rd_kafka_message_s { * for retried messages when * idempotence is enabled. */ void *_private; /**< Consumer: - * - rdkafka private pointer: DO NOT MODIFY + * - rdkafka private pointer: + * DO NOT MODIFY, DO NOT COPY. * Producer: * - dr_msg_cb: * msg_opaque from produce() call or @@ -1454,6 +1513,16 @@ void rd_kafka_message_destroy(rd_kafka_message_t *rkmessage); RD_EXPORT const char *rd_kafka_message_errstr(const rd_kafka_message_t *rkmessage); +/** + * @brief Returns the error string for an errored produced rd_kafka_message_t, + * or NULL if there was no error. + * + * @remark This function MUST be used with the producer. + */ +RD_EXPORT +const char * +rd_kafka_message_produce_errstr(const rd_kafka_message_t *rkmessage); + /** * @brief Returns the message timestamp for a consumed message. @@ -1585,6 +1654,87 @@ typedef enum { RD_EXPORT rd_kafka_msg_status_t rd_kafka_message_status(const rd_kafka_message_t *rkmessage); + +/** + * @returns the message's partition leader epoch at the time the message was + * fetched and if known, else -1. + * + * @remark This API must only be used on consumed messages without error. + * @remark Requires broker version >= 2.1.0 (KIP-320). + */ +RD_EXPORT int32_t +rd_kafka_message_leader_epoch(const rd_kafka_message_t *rkmessage); + + +/**@}*/ + + +/** + * @name UUID + * @{ + * + */ + +/** + * @brief Computes the base64 encoding for the given UUID. + * @param uuid UUID for which base64 encoding is required. + * + * @return base64-encoded string for the given UUID, or NULL if the + * conversion failed or is not supported. + */ +RD_EXPORT const char *rd_kafka_Uuid_base64str(const rd_kafka_Uuid_t *uuid); + +/** + * @brief Gets least significant 64 bits for the given UUID. + * + * @param uuid UUID + * + * @return least significant 64 bits for the given UUID. + */ +RD_EXPORT int64_t +rd_kafka_Uuid_least_significant_bits(const rd_kafka_Uuid_t *uuid); + + +/** + * @brief Gets most significant 64 bits for the given UUID. + * + * @param uuid UUID + * + * @return most significant 64 bits for the given UUID. + */ +RD_EXPORT int64_t +rd_kafka_Uuid_most_significant_bits(const rd_kafka_Uuid_t *uuid); + + +/** + * @brief Creates a new UUID.
+ * + * @param most_significant_bits most significant 64 bits of the 128-bit UUID. + * @param least_significant_bits least significant 64 bits of the 128-bit UUID. + * + * @return A newly allocated UUID. + * @remark Must be freed after use using rd_kafka_Uuid_destroy() + */ +RD_EXPORT rd_kafka_Uuid_t *rd_kafka_Uuid_new(int64_t most_significant_bits, + int64_t least_significant_bits); + +/** + * @brief Copies the given UUID. + * + * @param uuid UUID to be copied. + * + * @return A newly allocated copy of the provided UUID. + * @remark Must be freed after use using rd_kafka_Uuid_destroy() + */ +RD_EXPORT rd_kafka_Uuid_t *rd_kafka_Uuid_copy(const rd_kafka_Uuid_t *uuid); + +/** + * @brief Destroy the provided uuid. + * + * @param uuid UUID + */ +RD_EXPORT void rd_kafka_Uuid_destroy(rd_kafka_Uuid_t *uuid); + /**@}*/ @@ -2056,7 +2206,7 @@ void rd_kafka_conf_set_log_cb(rd_kafka_conf_t *conf, * rd_kafka_conf_set_opaque(). * * For more information on the format of \p json, see - * https://github.com/edenhill/librdkafka/wiki/Statistics + * https://github.com/confluentinc/librdkafka/wiki/Statistics * * If the application wishes to hold on to the \p json pointer and free * it at a later time it must return 1 from the \p stats_cb. @@ -3081,7 +3231,7 @@ void *rd_kafka_topic_opaque(const rd_kafka_topic_t *rkt); * The unassigned partition is used by the producer API for messages * that should be partitioned using the configured or default partitioner. */ -#define RD_KAFKA_PARTITION_UA ((int32_t)-1) +#define RD_KAFKA_PARTITION_UA ((int32_t) - 1) /** @@ -3386,6 +3536,12 @@ rd_kafka_error_t *rd_kafka_sasl_set_credentials(rd_kafka_t *rk, * * @remark rd_kafka_queue_destroy() MUST be called on this queue * prior to calling rd_kafka_consumer_close(). + * @remark Polling the returned queue counts as a consumer poll, and will reset + * the timer for max.poll.interval.ms. If this queue is forwarded to a + * "destq", polling destq also counts as a consumer poll (this works + * for any number of forwards). However, even if this queue is + * unforwarded or forwarded elsewhere, polling destq will continue + * to count as a consumer poll. */ RD_EXPORT rd_kafka_queue_t *rd_kafka_queue_get_consumer(rd_kafka_t *rk); @@ -3889,6 +4045,11 @@ int rd_kafka_consume_callback_queue( * The \c offset + 1 will be committed (written) to broker (or file) according * to \c `auto.commit.interval.ms` or manual offset-less commit() * + * @deprecated This API lacks support for partition leader epochs, which leaves + * it susceptible to log truncation issues after an unclean leader + * election. + * Use rd_kafka_offsets_store() and rd_kafka_offset_store_message() + * instead. + * * @warning This method may only be called for partitions that are currently * assigned. * Non-assigned partitions will fail with RD_KAFKA_RESP_ERR__STATE. @@ -3931,6 +4092,9 @@ rd_kafka_offset_store(rd_kafka_topic_t *rkt, int32_t partition, int64_t offset); * @remark \c `enable.auto.offset.store` must be set to "false" when using * this API. * + * @remark The leader epoch, if set, will be used to fence outdated partition + * leaders. See rd_kafka_topic_partition_set_leader_epoch().
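+ *
+ * Editor's sketch (not upstream documentation), storing offset+1 of a
+ * consumed message \c rkm together with its leader epoch; all names are
+ * illustrative (rd_kafka_offset_store_message() below wraps the same
+ * steps in a single call):
+ * @code
+ * rd_kafka_topic_partition_list_t *offs =
+ *         rd_kafka_topic_partition_list_new(1);
+ * rd_kafka_topic_partition_t *p = rd_kafka_topic_partition_list_add(
+ *         offs, rd_kafka_topic_name(rkm->rkt), rkm->partition);
+ * p->offset = rkm->offset + 1;
+ * rd_kafka_topic_partition_set_leader_epoch(
+ *         p, rd_kafka_message_leader_epoch(rkm));
+ * rd_kafka_offsets_store(rk, offs);
+ * rd_kafka_topic_partition_list_destroy(offs);
+ * @endcode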
+ * * @returns RD_KAFKA_RESP_ERR_NO_ERROR on (partial) success, or * RD_KAFKA_RESP_ERR__INVALID_ARG if \c enable.auto.offset.store * is true, or @@ -3940,6 +4104,31 @@ rd_kafka_offset_store(rd_kafka_topic_t *rkt, int32_t partition, int64_t offset); RD_EXPORT rd_kafka_resp_err_t rd_kafka_offsets_store(rd_kafka_t *rk, rd_kafka_topic_partition_list_t *offsets); + + +/** + * @brief Store offset +1 for the consumed message. + * + * The message offset + 1 will be committed to the broker according + * to \c `auto.commit.interval.ms` or manual offset-less commit() + * + * @warning This method may only be called for partitions that are currently + * assigned. + * Non-assigned partitions will fail with RD_KAFKA_RESP_ERR__STATE. + * Since v1.9.0. + * + * @warning Avoid storing offsets after calling rd_kafka_seek() (et al.) as + * this may later interfere with resuming a paused partition, instead + * store offsets prior to calling seek. + * + * @remark \c `enable.auto.offset.store` must be set to "false" when using + * this API. + * + * @returns NULL on success or an error object on failure. + */ +RD_EXPORT +rd_kafka_error_t *rd_kafka_offset_store_message(rd_kafka_message_t *rkmessage); + /**@}*/ @@ -4250,6 +4439,21 @@ RD_EXPORT int rd_kafka_assignment_lost(rd_kafka_t *rk); * or successfully scheduled if asynchronous, or failed. * RD_KAFKA_RESP_ERR__FATAL is returned if the consumer has raised * a fatal error. + * + * FIXME: Update below documentation. + * + * RD_KAFKA_RESP_ERR_STALE_MEMBER_EPOCH is returned, when + * using `group.protocol=consumer`, if the commit failed because the + * member has switched to a new member epoch. + * This error code can be retried. + * Partition level error is also set in the \p offsets. + * + * RD_KAFKA_RESP_ERR_UNKNOWN_MEMBER_ID is returned, when + * using `group.protocol=consumer`, if the member has been + * removed from the consumer group. + * This error code is permanent; uncommitted messages will be + * reprocessed by this or a different member and committed there. + * Partition level error is also set in the \p offsets. */ RD_EXPORT rd_kafka_resp_err_t rd_kafka_commit(rd_kafka_t *rk, @@ -4393,6 +4597,58 @@ rd_kafka_consumer_group_metadata_new_with_genid(const char *group_id, const char *member_id, const char *group_instance_id); +/** + * @brief Get group id of a group metadata. + * + * @param group_metadata The group metadata. + * + * @returns The group id contained in the passed \p group_metadata. + * + * @remark The returned pointer has the same lifetime as \p group_metadata. + */ +RD_EXPORT +const char *rd_kafka_consumer_group_metadata_group_id( + const rd_kafka_consumer_group_metadata_t *group_metadata); + +/** + * @brief Get group instance id of a group metadata. + * + * @param group_metadata The group metadata. + * + * @returns The group instance id contained in the passed \p group_metadata + * or NULL. + * + * @remark The returned pointer has the same lifetime as \p group_metadata. + */ +RD_EXPORT +const char *rd_kafka_consumer_group_metadata_group_instance_id( + const rd_kafka_consumer_group_metadata_t *group_metadata); + +/** + * @brief Get member id of a group metadata. + * + * @param group_metadata The group metadata. + * + * @returns The member id contained in the passed \p group_metadata. + * + * @remark The returned pointer has the same lifetime as \p group_metadata.
+ */ +RD_EXPORT +const char *rd_kafka_consumer_group_metadata_member_id( + const rd_kafka_consumer_group_metadata_t *group_metadata); + +/** + * @brief Get the generation id (classic protocol) + * or member epoch (consumer protocol) of a group metadata. + * + * @param group_metadata The group metadata. + * + * @returns The generation id or member epoch + * contained in the passed \p group_metadata. + */ +RD_EXPORT +int32_t rd_kafka_consumer_group_metadata_generation_id( + const rd_kafka_consumer_group_metadata_t *group_metadata); /** * @brief Frees the consumer group metadata object as returned by @@ -4891,6 +5147,16 @@ const char *rd_kafka_Node_host(const rd_kafka_Node_t *node); RD_EXPORT uint16_t rd_kafka_Node_port(const rd_kafka_Node_t *node); +/** + * @brief Get the rack of \p node. + * + * @param node The Node instance + * + * @return The node rack id. May be NULL. + */ +RD_EXPORT +const char *rd_kafka_Node_rack(const rd_kafka_Node_t *node); + /**@}*/ @@ -4937,6 +5203,18 @@ typedef enum { RD_KAFKA_CONSUMER_GROUP_STATE__CNT } rd_kafka_consumer_group_state_t; +/** + * @enum rd_kafka_consumer_group_type_t + * + * @brief Consumer group type. + */ +typedef enum { + RD_KAFKA_CONSUMER_GROUP_TYPE_UNKNOWN = 0, + RD_KAFKA_CONSUMER_GROUP_TYPE_CONSUMER = 1, + RD_KAFKA_CONSUMER_GROUP_TYPE_CLASSIC = 2, + RD_KAFKA_CONSUMER_GROUP_TYPE__CNT +} rd_kafka_consumer_group_type_t; + /** * @brief Group information */ @@ -5021,6 +5299,30 @@ RD_EXPORT rd_kafka_consumer_group_state_t rd_kafka_consumer_group_state_code(const char *name); +/** + * @brief Returns a name for a group type code. + * + * @param type The group type value. + * + * @return The group type name corresponding to the provided group type value. + */ +RD_EXPORT +const char * +rd_kafka_consumer_group_type_name(rd_kafka_consumer_group_type_t type); + +/** + * @brief Returns a code for a group type name. + * + * @param name The group type name. + * + * @return The group type value corresponding to the provided group type name. + * + * @remark The comparison is case-insensitive. + */ +RD_EXPORT +rd_kafka_consumer_group_type_t +rd_kafka_consumer_group_type_code(const char *name); + /** * @brief Release list memory */ @@ -5074,6 +5376,18 @@ void rd_kafka_group_list_destroy(const struct rd_kafka_group_list *grplist); RD_EXPORT int rd_kafka_brokers_add(rd_kafka_t *rk, const char *brokerlist); +/** + * @brief Retrieve and return the learned broker ids. + * + * @param rk Instance to use. + * @param cntp Will be updated to the number of brokers returned. + * + * @returns a malloc:ed list of int32_t broker ids. + * + * @remark The returned pointer must be freed. 
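+ *
+ * Editor's sketch (not upstream documentation); releasing the malloc:ed
+ * array with free() is an assumption:
+ * @code
+ * size_t cnt;
+ * int32_t *ids = rd_kafka_brokers_learned_ids(rk, &cnt);
+ * for (size_t i = 0; i < cnt; i++)
+ *         printf("learned broker id: %d\n", (int)ids[i]);
+ * free(ids);
+ * @endcode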
+ */ +RD_EXPORT +int32_t *rd_kafka_brokers_learned_ids(rd_kafka_t *rk, size_t *cntp); /** @@ -5282,7 +5596,20 @@ typedef int rd_kafka_event_type_t; #define RD_KAFKA_EVENT_LISTCONSUMERGROUPOFFSETS_RESULT 0x8000 /** AlterConsumerGroupOffsets_result_t */ #define RD_KAFKA_EVENT_ALTERCONSUMERGROUPOFFSETS_RESULT 0x10000 - +/** IncrementalAlterConfigs_result_t */ +#define RD_KAFKA_EVENT_INCREMENTALALTERCONFIGS_RESULT 0x20000 +/** DescribeUserScramCredentials_result_t */ +#define RD_KAFKA_EVENT_DESCRIBEUSERSCRAMCREDENTIALS_RESULT 0x40000 +/** AlterUserScramCredentials_result_t */ +#define RD_KAFKA_EVENT_ALTERUSERSCRAMCREDENTIALS_RESULT 0x80000 +/** DescribeTopics_result_t */ +#define RD_KAFKA_EVENT_DESCRIBETOPICS_RESULT 0x100000 +/** DescribeCluster_result_t */ +#define RD_KAFKA_EVENT_DESCRIBECLUSTER_RESULT 0x200000 +/** ListOffsets_result_t */ +#define RD_KAFKA_EVENT_LISTOFFSETS_RESULT 0x400000 +/** ElectLeaders_result_t */ +#define RD_KAFKA_EVENT_ELECTLEADERS_RESULT 0x800000 /** * @returns the event type for the given event. @@ -5429,6 +5756,7 @@ int rd_kafka_event_error_is_fatal(rd_kafka_event_t *rkev); * - RD_KAFKA_EVENT_DESCRIBEACLS_RESULT * - RD_KAFKA_EVENT_DELETEACLS_RESULT * - RD_KAFKA_EVENT_ALTERCONFIGS_RESULT + * - RD_KAFKA_EVENT_INCREMENTALALTERCONFIGS_RESULT * - RD_KAFKA_EVENT_DESCRIBECONFIGS_RESULT * - RD_KAFKA_EVENT_DELETEGROUPS_RESULT * - RD_KAFKA_EVENT_DELETECONSUMERGROUPOFFSETS_RESULT @@ -5437,6 +5765,10 @@ int rd_kafka_event_error_is_fatal(rd_kafka_event_t *rkev); * - RD_KAFKA_EVENT_DESCRIBECONSUMERGROUPS_RESULT * - RD_KAFKA_EVENT_LISTCONSUMERGROUPOFFSETS_RESULT * - RD_KAFKA_EVENT_ALTERCONSUMERGROUPOFFSETS_RESULT + * - RD_KAFKA_EVENT_DESCRIBETOPICS_RESULT + * - RD_KAFKA_EVENT_DESCRIBECLUSTER_RESULT + * - RD_KAFKA_EVENT_LISTOFFSETS_RESULT + * - RD_KAFKA_EVENT_ELECTLEADERS_RESULT */ RD_EXPORT void *rd_kafka_event_opaque(rd_kafka_event_t *rkev); @@ -5532,6 +5864,8 @@ typedef rd_kafka_event_t rd_kafka_DeleteAcls_result_t; typedef rd_kafka_event_t rd_kafka_CreatePartitions_result_t; /*! AlterConfigs result type */ typedef rd_kafka_event_t rd_kafka_AlterConfigs_result_t; +/*! IncrementalAlterConfigs result type */ +typedef rd_kafka_event_t rd_kafka_IncrementalAlterConfigs_result_t; /*! CreateTopics result type */ typedef rd_kafka_event_t rd_kafka_DescribeConfigs_result_t; /*! DeleteRecords result type */ @@ -5548,6 +5882,18 @@ typedef rd_kafka_event_t rd_kafka_DeleteConsumerGroupOffsets_result_t; typedef rd_kafka_event_t rd_kafka_AlterConsumerGroupOffsets_result_t; /*! ListConsumerGroupOffsets result type */ typedef rd_kafka_event_t rd_kafka_ListConsumerGroupOffsets_result_t; +/*! DescribeTopics result type */ +typedef rd_kafka_event_t rd_kafka_DescribeTopics_result_t; +/*! DescribeCluster result type */ +typedef rd_kafka_event_t rd_kafka_DescribeCluster_result_t; +/*! DescribeUserScramCredentials result type */ +typedef rd_kafka_event_t rd_kafka_DescribeUserScramCredentials_result_t; +/*! AlterUserScramCredentials result type */ +typedef rd_kafka_event_t rd_kafka_AlterUserScramCredentials_result_t; +/*! ListOffsets result type */ +typedef rd_kafka_event_t rd_kafka_ListOffsets_result_t; +/*! ElectLeaders result type */ +typedef rd_kafka_event_t rd_kafka_ElectLeaders_result_t; /** * @brief Get CreateTopics result. @@ -5597,6 +5943,18 @@ rd_kafka_event_CreatePartitions_result(rd_kafka_event_t *rkev); RD_EXPORT const rd_kafka_AlterConfigs_result_t * rd_kafka_event_AlterConfigs_result(rd_kafka_event_t *rkev); +/** + * @brief Get IncrementalAlterConfigs result.
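+ *
+ * Editor's sketch (not upstream documentation) of extracting this result
+ * from the admin reply queue; the queue name and timeout are illustrative:
+ * @code
+ * rd_kafka_event_t *ev = rd_kafka_queue_poll(rkqu, 10000);
+ * if (rd_kafka_event_type(ev) ==
+ *     RD_KAFKA_EVENT_INCREMENTALALTERCONFIGS_RESULT) {
+ *         size_t cnt;
+ *         const rd_kafka_ConfigResource_t **resources =
+ *             rd_kafka_IncrementalAlterConfigs_result_resources(
+ *                 rd_kafka_event_IncrementalAlterConfigs_result(ev), &cnt);
+ *         // inspect rd_kafka_ConfigResource_error() per element
+ * }
+ * rd_kafka_event_destroy(ev);
+ * @endcode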
+ * + * @returns the result of an IncrementalAlterConfigs request, or NULL if event is + * of different type. + * + * Event types: + * RD_KAFKA_EVENT_INCREMENTALALTERCONFIGS_RESULT + */ +RD_EXPORT const rd_kafka_IncrementalAlterConfigs_result_t * +rd_kafka_event_IncrementalAlterConfigs_result(rd_kafka_event_t *rkev); + /** * @brief Get DescribeConfigs result. * @@ -5649,6 +6007,35 @@ rd_kafka_event_ListConsumerGroups_result(rd_kafka_event_t *rkev); RD_EXPORT const rd_kafka_DescribeConsumerGroups_result_t * rd_kafka_event_DescribeConsumerGroups_result(rd_kafka_event_t *rkev); +/** + * @brief Get DescribeTopics result. + * + * @returns the result of a DescribeTopics request, or NULL if event is + * of different type. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p rkev object. + * + * Event types: + * RD_KAFKA_EVENT_DESCRIBETOPICS_RESULT + */ +RD_EXPORT const rd_kafka_DescribeTopics_result_t * +rd_kafka_event_DescribeTopics_result(rd_kafka_event_t *rkev); + +/** + * @brief Get DescribeCluster result. + * + * @returns the result of a DescribeCluster request, or NULL if event is + * of different type. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p rkev object. + * + * Event types: + * RD_KAFKA_EVENT_DESCRIBECLUSTER_RESULT + */ +RD_EXPORT const rd_kafka_DescribeCluster_result_t * +rd_kafka_event_DescribeCluster_result(rd_kafka_event_t *rkev); /** * @brief Get DeleteGroups result. * @@ -5703,6 +6090,21 @@ rd_kafka_event_DescribeAcls_result(rd_kafka_event_t *rkev); RD_EXPORT const rd_kafka_DeleteAcls_result_t * rd_kafka_event_DeleteAcls_result(rd_kafka_event_t *rkev); +/** + * @brief Get ListConsumerGroupOffsets result. + * + * @returns the result of a ListConsumerGroupOffsets request, or NULL if + * event is of different type. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p rkev object. + * + * Event types: + * RD_KAFKA_EVENT_LISTCONSUMERGROUPOFFSETS_RESULT + */ +RD_EXPORT const rd_kafka_ListConsumerGroupOffsets_result_t * +rd_kafka_event_ListConsumerGroupOffsets_result(rd_kafka_event_t *rkev); + /** * @brief Get AlterConsumerGroupOffsets result. * @@ -5719,19 +6121,65 @@ RD_EXPORT const rd_kafka_AlterConsumerGroupOffsets_result_t * rd_kafka_event_AlterConsumerGroupOffsets_result(rd_kafka_event_t *rkev); /** - * @brief Get ListConsumerGroupOffsets result. + * @brief Get ListOffsets result. * - * @returns the result of a ListConsumerGroupOffsets request, or NULL if + * @returns the result of a ListOffsets request, or NULL if * event is of different type. * * @remark The lifetime of the returned memory is the same * as the lifetime of the \p rkev object. * * Event types: - * RD_KAFKA_EVENT_LISTCONSUMERGROUPOFFSETS_RESULT + * RD_KAFKA_EVENT_LISTOFFSETS_RESULT */ -RD_EXPORT const rd_kafka_ListConsumerGroupOffsets_result_t * -rd_kafka_event_ListConsumerGroupOffsets_result(rd_kafka_event_t *rkev); +RD_EXPORT const rd_kafka_ListOffsets_result_t * +rd_kafka_event_ListOffsets_result(rd_kafka_event_t *rkev); + + +/** + * @brief Get DescribeUserScramCredentials result. + * + * @returns the result of a DescribeUserScramCredentials request, or NULL if + * event is of different type. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p rkev object.
+ * + * Event types: + * RD_KAFKA_EVENT_DESCRIBEUSERSCRAMCREDENTIALS_RESULT + */ +RD_EXPORT const rd_kafka_DescribeUserScramCredentials_result_t * +rd_kafka_event_DescribeUserScramCredentials_result(rd_kafka_event_t *rkev); + +/** + * @brief Get AlterUserScramCredentials result. + * + * @returns the result of an AlterUserScramCredentials request, or NULL if + * event is of different type. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p rkev object. + * + * Event types: + * RD_KAFKA_EVENT_ALTERUSERSCRAMCREDENTIALS_RESULT + */ +RD_EXPORT const rd_kafka_AlterUserScramCredentials_result_t * +rd_kafka_event_AlterUserScramCredentials_result(rd_kafka_event_t *rkev); + +/** + * @brief Get ElectLeaders result. + * + * @returns the result of an ElectLeaders request, or NULL if + * event is of different type. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p rkev object. + * + * Event types: + * RD_KAFKA_EVENT_ELECTLEADERS_RESULT + */ +RD_EXPORT const rd_kafka_ElectLeaders_result_t * +rd_kafka_event_ElectLeaders_result(rd_kafka_event_t *rkev); /** * @brief Poll a queue for an event for max \p timeout_ms. @@ -6209,6 +6657,7 @@ typedef rd_kafka_resp_err_t(rd_kafka_interceptor_f_on_thread_exit_t)( * @param secproto The security protocol. * @param name The original name of the broker. * @param port The port of the broker. + * @param state Broker state name. * @param ic_opaque The interceptor's opaque pointer specified in ..add..(). * * @returns an error code on failure; the error is logged but otherwise ignored. @@ -6568,6 +7017,30 @@ rd_kafka_group_result_name(const rd_kafka_group_result_t *groupres); RD_EXPORT const rd_kafka_topic_partition_list_t * rd_kafka_group_result_partitions(const rd_kafka_group_result_t *groupres); +/** + * @brief Topic Partition Result provides a per-topic+partition operation + * result. It consists of a TopicPartition object and an error object. + */ + +/** + * @returns the topic partition object from the topic partition result object. + * @remark The lifetime of the returned object is the same as the \p + * partition_result. + * The error object is set inside the topic partition object. For the + * detailed error information, use + * rd_kafka_topic_partition_result_error() + */ +RD_EXPORT const rd_kafka_topic_partition_t * +rd_kafka_topic_partition_result_partition( + const rd_kafka_topic_partition_result_t *partition_result); + +/** + * @returns the error object from the topic partition result object. + * @remark The lifetime of the returned object is the same as the \p + * partition_result.
+ */ +RD_EXPORT const rd_kafka_error_t *rd_kafka_topic_partition_result_error( + const rd_kafka_topic_partition_result_t *partition_result); /**@}*/ @@ -6635,7 +7108,17 @@ typedef enum rd_kafka_admin_op_t { RD_KAFKA_ADMIN_OP_LISTCONSUMERGROUPOFFSETS, /** AlterConsumerGroupOffsets */ RD_KAFKA_ADMIN_OP_ALTERCONSUMERGROUPOFFSETS, - RD_KAFKA_ADMIN_OP__CNT /**< Number of ops defined */ + /** IncrementalAlterConfigs */ + RD_KAFKA_ADMIN_OP_INCREMENTALALTERCONFIGS, + /** DescribeUserScramCredentials */ + RD_KAFKA_ADMIN_OP_DESCRIBEUSERSCRAMCREDENTIALS, + /** AlterUserScramCredentials */ + RD_KAFKA_ADMIN_OP_ALTERUSERSCRAMCREDENTIALS, + RD_KAFKA_ADMIN_OP_DESCRIBETOPICS, /**< DescribeTopics */ + RD_KAFKA_ADMIN_OP_DESCRIBECLUSTER, /**< DescribeCluster */ + RD_KAFKA_ADMIN_OP_LISTOFFSETS, /**< ListOffsets */ + RD_KAFKA_ADMIN_OP_ELECTLEADERS, /**< ElectLeaders */ + RD_KAFKA_ADMIN_OP__CNT /**< Number of ops defined */ } rd_kafka_admin_op_t; /** @@ -6652,6 +7135,18 @@ typedef enum rd_kafka_admin_op_t { typedef struct rd_kafka_AdminOptions_s rd_kafka_AdminOptions_t; +/** + * @enum rd_kafka_IsolationLevel_t + * + * @brief IsolationLevel enum name for use with rd_kafka_AdminOptions_new() + * + * @sa rd_kafka_AdminOptions_new() + */ +typedef enum rd_kafka_IsolationLevel_t { + RD_KAFKA_ISOLATION_LEVEL_READ_UNCOMMITTED = 0, + RD_KAFKA_ISOLATION_LEVEL_READ_COMMITTED = 1 +} rd_kafka_IsolationLevel_t; + /** * @brief Create a new AdminOptions object. * @@ -6686,8 +7181,7 @@ RD_EXPORT void rd_kafka_AdminOptions_destroy(rd_kafka_AdminOptions_t *options); * request transmission, operation time on broker, and response. * * @param options Admin options. - * @param timeout_ms Timeout in milliseconds, use -1 for indefinite timeout. - * Defaults to `socket.timeout.ms`. + * @param timeout_ms Timeout in milliseconds. Defaults to `socket.timeout.ms`. * @param errstr A human readable error string (nul-terminated) is written to * this location that must be of at least \p errstr_size bytes. * The \p errstr is only written in case of error. @@ -6771,6 +7265,8 @@ rd_kafka_AdminOptions_set_validate_only(rd_kafka_AdminOptions_t *options, * the following exceptions: * - AlterConfigs with a BROKER resource are sent to the broker id set * as the resource name. + * - IncrementalAlterConfigs with a BROKER resource are sent to the broker id + * set as the resource name. * - DescribeConfigs with a BROKER resource are sent to the broker id set * as the resource name. * @@ -6813,6 +7309,25 @@ rd_kafka_error_t *rd_kafka_AdminOptions_set_require_stable_offsets( rd_kafka_AdminOptions_t *options, int true_or_false); +/** + * @brief Whether the broker should return authorized operations for the given + * resource in the DescribeConsumerGroups, DescribeTopics, or + * DescribeCluster calls. + * + * @param options Admin options. + * @param true_or_false Defaults to false. + * + * @return NULL on success, a new error instance that must be + * released with rd_kafka_error_destroy() in case of error. + * + * @remark This option is valid for DescribeConsumerGroups, DescribeTopics, + * DescribeCluster. + */ +RD_EXPORT +rd_kafka_error_t *rd_kafka_AdminOptions_set_include_authorized_operations( + rd_kafka_AdminOptions_t *options, + int true_or_false); + /** * @brief Set consumer groups states to query for. * @@ -6831,6 +7346,32 @@ rd_kafka_error_t *rd_kafka_AdminOptions_set_match_consumer_group_states( const rd_kafka_consumer_group_state_t *consumer_group_states, size_t consumer_group_states_cnt); +/** + * @brief Set consumer group types to query for.
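+ *
+ * Editor's sketch (not upstream documentation), restricting a
+ * ListConsumerGroups call to classic groups; names are illustrative:
+ * @code
+ * rd_kafka_consumer_group_type_t types[] = {
+ *         RD_KAFKA_CONSUMER_GROUP_TYPE_CLASSIC};
+ * rd_kafka_AdminOptions_t *options = rd_kafka_AdminOptions_new(
+ *         rk, RD_KAFKA_ADMIN_OP_LISTCONSUMERGROUPS);
+ * rd_kafka_AdminOptions_set_match_consumer_group_types(options, types, 1);
+ * @endcode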
+ * + * @param options Admin options. + * @param consumer_group_types Array of consumer group types. + * @param consumer_group_types_cnt Size of the \p consumer_group_types array. + * + * @return NULL on success, a new error instance that must be + * released with rd_kafka_error_destroy() in case of error. + * + * @remark This option is valid for ListConsumerGroups. + */ +RD_EXPORT +rd_kafka_error_t *rd_kafka_AdminOptions_set_match_consumer_group_types( + rd_kafka_AdminOptions_t *options, + const rd_kafka_consumer_group_type_t *consumer_group_types, + size_t consumer_group_types_cnt); + +/** + * @brief Set Isolation Level to an allowed `rd_kafka_IsolationLevel_t` value. + */ +RD_EXPORT +rd_kafka_error_t * +rd_kafka_AdminOptions_set_isolation_level(rd_kafka_AdminOptions_t *options, + rd_kafka_IsolationLevel_t value); + /** * @brief Set application opaque value that can be extracted from the * result event using rd_kafka_event_opaque() @@ -6839,6 +7380,35 @@ RD_EXPORT void rd_kafka_AdminOptions_set_opaque(rd_kafka_AdminOptions_t *options, void *ev_opaque); + + +/** + * @enum rd_kafka_AclOperation_t + * @brief Apache Kafka ACL operation types. Common type for multiple Admin API + * functions. + */ +typedef enum rd_kafka_AclOperation_t { + RD_KAFKA_ACL_OPERATION_UNKNOWN = 0, /**< Unknown */ + RD_KAFKA_ACL_OPERATION_ANY = + 1, /**< In a filter, matches any AclOperation */ + RD_KAFKA_ACL_OPERATION_ALL = 2, /**< ALL operation */ + RD_KAFKA_ACL_OPERATION_READ = 3, /**< READ operation */ + RD_KAFKA_ACL_OPERATION_WRITE = 4, /**< WRITE operation */ + RD_KAFKA_ACL_OPERATION_CREATE = 5, /**< CREATE operation */ + RD_KAFKA_ACL_OPERATION_DELETE = 6, /**< DELETE operation */ + RD_KAFKA_ACL_OPERATION_ALTER = 7, /**< ALTER operation */ + RD_KAFKA_ACL_OPERATION_DESCRIBE = 8, /**< DESCRIBE operation */ + RD_KAFKA_ACL_OPERATION_CLUSTER_ACTION = + 9, /**< CLUSTER_ACTION operation */ + RD_KAFKA_ACL_OPERATION_DESCRIBE_CONFIGS = + 10, /**< DESCRIBE_CONFIGS operation */ + RD_KAFKA_ACL_OPERATION_ALTER_CONFIGS = + 11, /**< ALTER_CONFIGS operation */ + RD_KAFKA_ACL_OPERATION_IDEMPOTENT_WRITE = + 12, /**< IDEMPOTENT_WRITE operation */ + RD_KAFKA_ACL_OPERATION__CNT +} rd_kafka_AclOperation_t; + /**@}*/ /** @@ -7217,6 +7787,8 @@ typedef enum rd_kafka_ConfigSource_t { /** Built-in default configuration for configs that have a * default value */ RD_KAFKA_CONFIG_SOURCE_DEFAULT_CONFIG = 5, + /** Group config that is configured for a specific group */ + RD_KAFKA_CONFIG_SOURCE_GROUP_CONFIG = 8, /** Number of source types defined */ RD_KAFKA_CONFIG_SOURCE__CNT, @@ -7305,12 +7877,13 @@ rd_kafka_ConfigEntry_synonyms(const rd_kafka_ConfigEntry_t *entry, * @brief Apache Kafka resource types */ typedef enum rd_kafka_ResourceType_t { - RD_KAFKA_RESOURCE_UNKNOWN = 0, /**< Unknown */ - RD_KAFKA_RESOURCE_ANY = 1, /**< Any (used for lookups) */ - RD_KAFKA_RESOURCE_TOPIC = 2, /**< Topic */ - RD_KAFKA_RESOURCE_GROUP = 3, /**< Group */ - RD_KAFKA_RESOURCE_BROKER = 4, /**< Broker */ - RD_KAFKA_RESOURCE__CNT, /**< Number of resource types defined */ + RD_KAFKA_RESOURCE_UNKNOWN = 0, /**< Unknown */ + RD_KAFKA_RESOURCE_ANY = 1, /**< Any (used for lookups) */ + RD_KAFKA_RESOURCE_TOPIC = 2, /**< Topic */ + RD_KAFKA_RESOURCE_GROUP = 3, /**< Group */ + RD_KAFKA_RESOURCE_BROKER = 4, /**< Broker */ + RD_KAFKA_RESOURCE_TRANSACTIONAL_ID = 5, /**< Transactional ID */ + RD_KAFKA_RESOURCE__CNT, /**< Number of resource types defined */ } rd_kafka_ResourceType_t; /** @@ -7331,6 +7904,18 @@ typedef enum rd_kafka_ResourcePatternType_t { 
RD_KAFKA_RESOURCE_PATTERN_TYPE__CNT, } rd_kafka_ResourcePatternType_t; +/** + * @enum rd_kafka_AlterConfigOpType_t + * @brief Incremental alter configs operations. + */ +typedef enum rd_kafka_AlterConfigOpType_t { + RD_KAFKA_ALTER_CONFIG_OP_TYPE_SET = 0, + RD_KAFKA_ALTER_CONFIG_OP_TYPE_DELETE = 1, + RD_KAFKA_ALTER_CONFIG_OP_TYPE_APPEND = 2, + RD_KAFKA_ALTER_CONFIG_OP_TYPE_SUBTRACT = 3, + RD_KAFKA_ALTER_CONFIG_OP_TYPE__CNT, +} rd_kafka_AlterConfigOpType_t; + /** * @returns a string representation of the \p resource_pattern_type */ @@ -7396,6 +7981,31 @@ rd_kafka_ConfigResource_set_config(rd_kafka_ConfigResource_t *config, const char *value); +/** + * @brief Add the value of the configuration entry for a subsequent + * incremental alter config operation. APPEND and SUBTRACT are + * possible for list-type configuration entries only. + * + * @param config ConfigResource to add config property to. + * @param name Configuration name, depends on resource type. + * @param op_type Operation type, one of rd_kafka_AlterConfigOpType_t. + * @param value Configuration value, depends on resource type and \p name. + * Set to \c NULL, only with op_type set to DELETE, + * to revert the configuration value to its default. + * + * @returns NULL on success, or an rd_kafka_error_t * + * with the corresponding error code and string. + * Error ownership belongs to the caller. + * Possible error codes: + * - RD_KAFKA_RESP_ERR__INVALID_ARG on invalid input. + */ +RD_EXPORT rd_kafka_error_t *rd_kafka_ConfigResource_add_incremental_config( + rd_kafka_ConfigResource_t *config, + const char *name, + rd_kafka_AlterConfigOpType_t op_type, + const char *value); + + /** + * @brief Get an array of config entries from a ConfigResource object. + * @@ -7461,6 +8071,8 @@ rd_kafka_ConfigResource_error_string(const rd_kafka_ConfigResource_t *config); * since these resource requests must be sent to the broker specified * in the resource. * + * @deprecated Use rd_kafka_IncrementalAlterConfigs(). + * */ RD_EXPORT void rd_kafka_AlterConfigs(rd_kafka_t *rk, @@ -7495,6 +8107,66 @@ rd_kafka_AlterConfigs_result_resources( +/* + * IncrementalAlterConfigs - alter cluster configuration incrementally. + * + */ + + +/** + * @brief Incrementally update the configuration for the specified resources. + * Updates are not transactional so they may succeed for some resources + * while failing for others. The configs for a particular resource are + * updated atomically, executing the corresponding incremental operations + * on the provided configurations. + * + * @remark Requires broker version >=2.3.0 + * + * @remark Multiple resources and resource types may be set, but at most one + * resource of type \c RD_KAFKA_RESOURCE_BROKER is allowed per call + * since these resource requests must be sent to the broker specified + * in the resource. The broker option will be ignored in this case. + * + * @param rk Client instance. + * @param configs Array of config entries to alter. + * @param config_cnt Number of elements in \p configs array. + * @param options Optional admin options, or NULL for defaults. + * @param rkqu Queue to emit result on. + */ +RD_EXPORT +void rd_kafka_IncrementalAlterConfigs(rd_kafka_t *rk, + rd_kafka_ConfigResource_t **configs, + size_t config_cnt, + const rd_kafka_AdminOptions_t *options, + rd_kafka_queue_t *rkqu); + + +/* + * IncrementalAlterConfigs result type and methods + */ + +/** + * @brief Get an array of resource results from an IncrementalAlterConfigs + * result.
+ * + * Use \c rd_kafka_ConfigResource_error() and + * \c rd_kafka_ConfigResource_error_string() to extract per-resource error + * results on the returned array elements. + * + * The returned object life-times are the same as the \p result object. + * + * @param result Result object to get resource results from. + * @param cntp is updated to the number of elements in the array. + * + * @returns an array of ConfigResource elements, or NULL if not available. + */ +RD_EXPORT const rd_kafka_ConfigResource_t ** +rd_kafka_IncrementalAlterConfigs_result_resources( + const rd_kafka_IncrementalAlterConfigs_result_t *result, + size_t *cntp); + + + /* * DescribeConfigs - retrieve cluster configuration. * @@ -7646,6 +8318,310 @@ rd_kafka_DeleteRecords_result_offsets( /**@}*/ +/** + * @name Admin API - DescribeTopics + * @{ + */ + +/** + * @brief Represents a collection of topics, to be passed to DescribeTopics. + * + */ +typedef struct rd_kafka_TopicCollection_s rd_kafka_TopicCollection_t; + +/** + * @brief TopicPartitionInfo represents a partition in the DescribeTopics + * result. + * + */ +typedef struct rd_kafka_TopicPartitionInfo_s rd_kafka_TopicPartitionInfo_t; + +/** + * @brief DescribeTopics result type. + * + */ +typedef struct rd_kafka_TopicDescription_s rd_kafka_TopicDescription_t; + +/** + * @brief Creates a new TopicCollection for passing to rd_kafka_DescribeTopics. + * + * @param topics A list of topics. + * @param topics_cnt Count of topics. + * + * @return a newly allocated TopicCollection object. Must be freed using + * rd_kafka_TopicCollection_destroy() when done. + */ +RD_EXPORT +rd_kafka_TopicCollection_t * +rd_kafka_TopicCollection_of_topic_names(const char **topics, size_t topics_cnt); + +/** + * @brief Destroy and free a TopicCollection object created with + * rd_kafka_TopicCollection_of_topic_names(). + */ +RD_EXPORT void +rd_kafka_TopicCollection_destroy(rd_kafka_TopicCollection_t *topics); + +/** + * @brief Describe the topics in the \p topics collection. + * + * @param rk Client instance. + * @param topics Collection of topics to describe. + * @param options Optional admin options, or NULL for defaults. + * Valid options: + * - include_authorized_operations + * @param rkqu Queue to emit result on. + * + * @remark The result event type emitted on the supplied queue is of type + * \c RD_KAFKA_EVENT_DESCRIBETOPICS_RESULT + */ +RD_EXPORT +void rd_kafka_DescribeTopics(rd_kafka_t *rk, + const rd_kafka_TopicCollection_t *topics, + const rd_kafka_AdminOptions_t *options, + rd_kafka_queue_t *rkqu); + +/** + * @brief Get an array of topic results from a DescribeTopics result. + * + * @param result Result to get topics results from. + * @param cntp is updated to the number of elements in the array. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p result object. + */ +RD_EXPORT +const rd_kafka_TopicDescription_t **rd_kafka_DescribeTopics_result_topics( + const rd_kafka_DescribeTopics_result_t *result, + size_t *cntp); + + +/** + * @brief Gets an array of partitions for the \p topicdesc topic. + * + * @param topicdesc The topic description. + * @param cntp is updated to the number of partitions in the array. + * + * @return An array of TopicPartitionInfos. + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p topicdesc object.
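+ *
+ * Editor's sketch (not upstream documentation), iterating the partitions
+ * of a topic description; names are illustrative:
+ * @code
+ * size_t cnt;
+ * const rd_kafka_TopicPartitionInfo_t **parts =
+ *         rd_kafka_TopicDescription_partitions(topicdesc, &cnt);
+ * for (size_t i = 0; i < cnt; i++) {
+ *         const rd_kafka_Node_t *leader =
+ *                 rd_kafka_TopicPartitionInfo_leader(parts[i]);
+ *         printf("partition %d: leader broker %d\n",
+ *                rd_kafka_TopicPartitionInfo_partition(parts[i]),
+ *                leader ? rd_kafka_Node_id(leader) : -1);
+ * }
+ * @endcode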
+ */ +RD_EXPORT +const rd_kafka_TopicPartitionInfo_t **rd_kafka_TopicDescription_partitions( + const rd_kafka_TopicDescription_t *topicdesc, + size_t *cntp); + + +/** + * @brief Gets the partition id for \p partition. + * + * @param partition The partition info. + * + * @return The partition id. + */ +RD_EXPORT +const int rd_kafka_TopicPartitionInfo_partition( + const rd_kafka_TopicPartitionInfo_t *partition); + + +/** + * @brief Gets the partition leader for \p partition. + * + * @param partition The partition info. + * + * @return The partition leader. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p partition object. + */ +RD_EXPORT +const rd_kafka_Node_t *rd_kafka_TopicPartitionInfo_leader( + const rd_kafka_TopicPartitionInfo_t *partition); + +/** + * @brief Gets the partition in-sync replicas for \p partition. + * + * @param partition The partition info. + * @param cntp is updated with in-sync replicas count. + * + * @return The in-sync replica nodes. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p partition object. + */ +RD_EXPORT +const rd_kafka_Node_t ** +rd_kafka_TopicPartitionInfo_isr(const rd_kafka_TopicPartitionInfo_t *partition, + size_t *cntp); + +/** + * @brief Gets the partition replicas for \p partition. + * + * @param partition The partition info. + * @param cntp is updated with partition replicas count. + * + * @return The partition replicas nodes. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p partition object. + */ +RD_EXPORT +const rd_kafka_Node_t **rd_kafka_TopicPartitionInfo_replicas( + const rd_kafka_TopicPartitionInfo_t *partition, + size_t *cntp); + +/** + * @brief Gets the topic authorized ACL operations for the \p topicdesc topic. + * + * @param topicdesc The topic description. + * @param cntp is updated with authorized ACL operations count. + * + * @return The topic authorized operations. Is NULL if operations were not + * requested. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p topicdesc object. + */ +RD_EXPORT +const rd_kafka_AclOperation_t *rd_kafka_TopicDescription_authorized_operations( + const rd_kafka_TopicDescription_t *topicdesc, + size_t *cntp); + +/** + * @brief Gets the topic name for the \p topicdesc topic. + * + * @param topicdesc The topic description. + * + * @return The topic name. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p topicdesc object. + */ +RD_EXPORT +const char * +rd_kafka_TopicDescription_name(const rd_kafka_TopicDescription_t *topicdesc); + +/** + * @brief Gets the topic id for the \p topicdesc topic. + * + * @param topicdesc The topic description. + * @return The topic id + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p topicdesc object. + */ +RD_EXPORT const rd_kafka_Uuid_t *rd_kafka_TopicDescription_topic_id( + const rd_kafka_TopicDescription_t *topicdesc); + +/** + * @brief Gets if the \p topicdesc topic is internal. + * + * @param topicdesc The topic description. + * + * @return 1 if the topic is internal to Kafka, 0 otherwise. + */ +RD_EXPORT +int rd_kafka_TopicDescription_is_internal( + const rd_kafka_TopicDescription_t *topicdesc); + +/** + * @brief Gets the error for the \p topicdesc topic. + * + * @param topicdesc The topic description. + * + * @return The topic description error. 
+ * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p topicdesc object. + */ +RD_EXPORT +const rd_kafka_error_t * +rd_kafka_TopicDescription_error(const rd_kafka_TopicDescription_t *topicdesc); + + +/**@}*/ + +/** + * @name Admin API - DescribeCluster + * @{ + */ + +/** + * @brief Describes the cluster. + * + * @param rk Client instance. + * @param options Optional admin options, or NULL for defaults. + * Valid options: + * - include_authorized_operations + * @param rkqu Queue to emit result on. + * + * @remark The result event type emitted on the supplied queue is of type + * \c RD_KAFKA_EVENT_DESCRIBECLUSTER_RESULT + */ +RD_EXPORT +void rd_kafka_DescribeCluster(rd_kafka_t *rk, + const rd_kafka_AdminOptions_t *options, + rd_kafka_queue_t *rkqu); + +/** + * @brief Gets the broker nodes for the \p result cluster. + * + * @param result The result of DescribeCluster. + * @param cntp is updated with the count of broker nodes. + * + * @return An array of broker nodes. + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p result object. + */ +RD_EXPORT +const rd_kafka_Node_t **rd_kafka_DescribeCluster_result_nodes( + const rd_kafka_DescribeCluster_result_t *result, + size_t *cntp); + +/** + * @brief Gets the authorized ACL operations for the \p result cluster. + * + * @param result The result of DescribeCluster. + * @param cntp is updated with authorized ACL operations count. + * + * @return The cluster authorized operations. Is NULL if operations were not + * requested. + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p result object. + */ +RD_EXPORT +const rd_kafka_AclOperation_t * +rd_kafka_DescribeCluster_result_authorized_operations( + const rd_kafka_DescribeCluster_result_t *result, + size_t *cntp); + +/** + * @brief Gets the current controller for the \p result cluster. + * + * @param result The result of DescribeCluster. + * + * @return The cluster current controller. + */ +RD_EXPORT +const rd_kafka_Node_t *rd_kafka_DescribeCluster_result_controller( + const rd_kafka_DescribeCluster_result_t *result); + +/** + * @brief Gets the cluster id for the \p result cluster. + * + * @param result The result of DescribeCluster. + * + * @return The cluster id. + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p result object. + */ +RD_EXPORT +const char *rd_kafka_DescribeCluster_result_cluster_id( + const rd_kafka_DescribeCluster_result_t *result); + +/**@}*/ + + /** * @name Admin API - ListConsumerGroups * @{ @@ -7715,6 +8691,17 @@ RD_EXPORT rd_kafka_consumer_group_state_t rd_kafka_ConsumerGroupListing_state( const rd_kafka_ConsumerGroupListing_t *grplist); +/** + * @brief Gets type for the \p grplist group. + * + * @param grplist The group listing. + * + * @return A group type. + */ +RD_EXPORT +rd_kafka_consumer_group_type_t rd_kafka_ConsumerGroupListing_type( + const rd_kafka_ConsumerGroupListing_t *grplist); + /** * @brief Get an array of valid list groups from a ListConsumerGroups result. * @@ -7784,6 +8771,8 @@ typedef struct rd_kafka_MemberAssignment_s rd_kafka_MemberAssignment_t; * @param groups Array of groups to describe. * @param groups_cnt Number of elements in \p groups array. * @param options Optional admin options, or NULL for defaults. + * Valid options: + * - include_authorized_operations * @param rkqu Queue to emit result on. 
* * @remark The result event type emitted on the supplied queue is of type @@ -7868,6 +8857,23 @@ RD_EXPORT const char *rd_kafka_ConsumerGroupDescription_partition_assignor( const rd_kafka_ConsumerGroupDescription_t *grpdesc); +/** + * @brief Gets the authorized ACL operations for the \p grpdesc group. + * + * @param grpdesc The group description. + * @param cntp is updated with authorized ACL operations count. + * + * @return The group authorized operations. Is NULL if operations were not + * requested. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p grpdesc object. + */ +RD_EXPORT +const rd_kafka_AclOperation_t * +rd_kafka_ConsumerGroupDescription_authorized_operations( + const rd_kafka_ConsumerGroupDescription_t *grpdesc, + size_t *cntp); /** * @brief Gets state for the \p grpdesc group. @@ -7894,6 +8900,17 @@ RD_EXPORT const rd_kafka_Node_t *rd_kafka_ConsumerGroupDescription_coordinator( const rd_kafka_ConsumerGroupDescription_t *grpdesc); +/** + * @brief Gets type for the \p grpdesc group. + * + * @param grpdesc The group description. + * + * @return A group type. + */ +RD_EXPORT +rd_kafka_consumer_group_type_t rd_kafka_ConsumerGroupDescription_type( + const rd_kafka_ConsumerGroupDescription_t *grpdesc); + /** * @brief Gets the members count of \p grpdesc group. * @@ -8006,6 +9023,21 @@ RD_EXPORT const rd_kafka_topic_partition_list_t *rd_kafka_MemberAssignment_partitions( const rd_kafka_MemberAssignment_t *assignment); +/** + * @brief Gets target assignment of \p member. + * + * @param member The group member. + * + * @return The target assignment for `consumer` group types. + * Returns NULL for the `classic` group types. + * + * @remark The lifetime of the returned memory is the same + * as the lifetime of the \p member object. + */ +RD_EXPORT +const rd_kafka_MemberAssignment_t *rd_kafka_MemberDescription_target_assignment( + const rd_kafka_MemberDescription_t *member); + /**@}*/ /** @@ -8371,6 +9403,331 @@ rd_kafka_DeleteConsumerGroupOffsets_result_groups( /**@}*/ +/** + * @name Admin API - ListOffsets + * @brief Given a topic_partition list, provides the offset information. + * @{ + */ + +/** + * @enum rd_kafka_OffsetSpec_t + * @brief Specifies the desired offsets when using ListOffsets. + */ +typedef enum rd_kafka_OffsetSpec_t { + /* Used to retrieve the offset with the largest timestamp of a partition; + * as message timestamps can be specified client-side, this may not match + * the log end offset returned by SPEC_LATEST. + */ + RD_KAFKA_OFFSET_SPEC_MAX_TIMESTAMP = -3, + /* Used to retrieve the offset with the earliest timestamp of a + partition. */ + RD_KAFKA_OFFSET_SPEC_EARLIEST = -2, + /* Used to retrieve the offset with the latest timestamp of a partition. + */ + RD_KAFKA_OFFSET_SPEC_LATEST = -1, +} rd_kafka_OffsetSpec_t; + +/** + * @brief Information returned from a ListOffsets call for a specific + * `rd_kafka_topic_partition_t`. + */ +typedef struct rd_kafka_ListOffsetsResultInfo_s + rd_kafka_ListOffsetsResultInfo_t; + +/** + * @brief Returns the topic partition of the passed \p result_info. + */ +RD_EXPORT +const rd_kafka_topic_partition_t * +rd_kafka_ListOffsetsResultInfo_topic_partition( + const rd_kafka_ListOffsetsResultInfo_t *result_info); + +/** + * @brief Returns the timestamp corresponding to the offset in \p result_info.
+ */ +RD_EXPORT +int64_t rd_kafka_ListOffsetsResultInfo_timestamp( + const rd_kafka_ListOffsetsResultInfo_t *result_info); + +/** + * @brief Returns the array of ListOffsetsResultInfo in \p result + * and populates the size of the array in \p cntp. + */ +RD_EXPORT +const rd_kafka_ListOffsetsResultInfo_t ** +rd_kafka_ListOffsets_result_infos(const rd_kafka_ListOffsets_result_t *result, + size_t *cntp); + +/** + * @brief List offsets for the specified \p topic_partitions. + * This operation makes it possible to find the beginning offset and + * end offset of partitions, as well as the offset matching a timestamp + * or the offset with the maximum timestamp. + * + * @param rk Client instance. + * @param topic_partitions topic_partition_list_t with the partitions and + * offsets to list. Each topic partition offset can be + * a value of the `rd_kafka_OffsetSpec_t` enum or + * a non-negative value, representing a timestamp, + * to query for the first offset after the + * given timestamp. + * @param options Optional admin options, or NULL for defaults. + * @param rkqu Queue to emit result on. + * + * Supported admin options: + * - rd_kafka_AdminOptions_set_isolation_level() - default \c + * RD_KAFKA_ISOLATION_LEVEL_READ_UNCOMMITTED + * - rd_kafka_AdminOptions_set_request_timeout() - default socket.timeout.ms + * + * @remark The result event type emitted on the supplied queue is of type + * \c RD_KAFKA_EVENT_LISTOFFSETS_RESULT + */ +RD_EXPORT +void rd_kafka_ListOffsets(rd_kafka_t *rk, + rd_kafka_topic_partition_list_t *topic_partitions, + const rd_kafka_AdminOptions_t *options, + rd_kafka_queue_t *rkqu); + +/**@}*/ + +/** + * @name Admin API - User SCRAM credentials + * @{ + */ + +/** + * @enum rd_kafka_ScramMechanism_t + * @brief Apache Kafka ScramMechanism values. + */ +typedef enum rd_kafka_ScramMechanism_t { + RD_KAFKA_SCRAM_MECHANISM_UNKNOWN = 0, + RD_KAFKA_SCRAM_MECHANISM_SHA_256 = 1, + RD_KAFKA_SCRAM_MECHANISM_SHA_512 = 2, + RD_KAFKA_SCRAM_MECHANISM__CNT +} rd_kafka_ScramMechanism_t; + +/** + * @brief Scram credential info. + * Mechanism and iterations for a SASL/SCRAM + * credential associated with a user. + */ +typedef struct rd_kafka_ScramCredentialInfo_s rd_kafka_ScramCredentialInfo_t; + +/** + * @brief Returns the mechanism of a given ScramCredentialInfo. + */ +RD_EXPORT +rd_kafka_ScramMechanism_t rd_kafka_ScramCredentialInfo_mechanism( + const rd_kafka_ScramCredentialInfo_t *scram_credential_info); + +/** + * @brief Returns the iterations of a given ScramCredentialInfo. + */ +RD_EXPORT +int32_t rd_kafka_ScramCredentialInfo_iterations( + const rd_kafka_ScramCredentialInfo_t *scram_credential_info); + +/** + * @brief Representation of all SASL/SCRAM credentials associated + * with a user that can be retrieved, + * or an error indicating why credentials + * could not be retrieved. + */ +typedef struct rd_kafka_UserScramCredentialsDescription_s + rd_kafka_UserScramCredentialsDescription_t; + +/** + * @brief Returns the username of a UserScramCredentialsDescription. + */ +RD_EXPORT +const char *rd_kafka_UserScramCredentialsDescription_user( + const rd_kafka_UserScramCredentialsDescription_t *description); + +/** + * @brief Returns the error associated with a UserScramCredentialsDescription. + */ +RD_EXPORT +const rd_kafka_error_t *rd_kafka_UserScramCredentialsDescription_error( + const rd_kafka_UserScramCredentialsDescription_t *description); + +/** + * @brief Returns the count of ScramCredentialInfos of a + * UserScramCredentialsDescription.
+ */
+RD_EXPORT
+size_t rd_kafka_UserScramCredentialsDescription_scramcredentialinfo_count(
+    const rd_kafka_UserScramCredentialsDescription_t *description);
+
+/**
+ * @brief Returns the ScramCredentialInfo at index idx of
+ *        UserScramCredentialsDescription.
+ */
+RD_EXPORT
+const rd_kafka_ScramCredentialInfo_t *
+rd_kafka_UserScramCredentialsDescription_scramcredentialinfo(
+    const rd_kafka_UserScramCredentialsDescription_t *description,
+    size_t idx);
+
+/**
+ * @brief Get an array of descriptions from a DescribeUserScramCredentials
+ *        result.
+ *
+ * The returned value's lifetime is the same as that of the \p result object.
+ *
+ * @param result Result to get descriptions from.
+ * @param cntp is updated to the number of elements in the array.
+ */
+RD_EXPORT
+const rd_kafka_UserScramCredentialsDescription_t **
+rd_kafka_DescribeUserScramCredentials_result_descriptions(
+    const rd_kafka_DescribeUserScramCredentials_result_t *result,
+    size_t *cntp);
+
+/**
+ * @brief Describe SASL/SCRAM credentials.
+ *        This operation is supported by brokers with version 2.7.0 or higher.
+ *
+ * @param rk Client instance.
+ * @param users The users for which credentials are to be described.
+ *              All users' credentials are described if NULL.
+ * @param user_cnt Number of elements in \p users array.
+ * @param options Optional admin options, or NULL for defaults.
+ * @param rkqu Queue to emit result on.
+ */
+RD_EXPORT
+void rd_kafka_DescribeUserScramCredentials(
+    rd_kafka_t *rk,
+    const char **users,
+    size_t user_cnt,
+    const rd_kafka_AdminOptions_t *options,
+    rd_kafka_queue_t *rkqu);
+
+/**
+ * @brief A request to alter a user's SASL/SCRAM credentials.
+ */
+typedef struct rd_kafka_UserScramCredentialAlteration_s
+    rd_kafka_UserScramCredentialAlteration_t;
+
+/**
+ * @brief Allocates a new UserScramCredentialUpsertion given its fields.
+ *        If \p salt isn't given, a 64-byte salt is generated using OpenSSL
+ *        RAND_priv_bytes, if available.
+ *
+ * @param username The username (not empty).
+ * @param mechanism SASL/SCRAM mechanism.
+ * @param iterations SASL/SCRAM iterations.
+ * @param password Password bytes (not empty).
+ * @param password_size Size of \p password (greater than 0).
+ * @param salt Salt bytes (optional).
+ * @param salt_size Size of \p salt (optional).
+ *
+ * @remark A random salt is generated when \p salt is NULL, but only with
+ *         OpenSSL >= 1.1.1; otherwise \p salt is a required parameter.
+ *
+ * @return A newly created instance of rd_kafka_UserScramCredentialAlteration_t.
+ *         Ownership belongs to the caller, use
+ *         rd_kafka_UserScramCredentialAlteration_destroy to destroy.
+ */
+RD_EXPORT
+rd_kafka_UserScramCredentialAlteration_t *
+rd_kafka_UserScramCredentialUpsertion_new(const char *username,
+                                          rd_kafka_ScramMechanism_t mechanism,
+                                          int32_t iterations,
+                                          const unsigned char *password,
+                                          size_t password_size,
+                                          const unsigned char *salt,
+                                          size_t salt_size);
+
+/**
+ * @brief Allocates a new UserScramCredentialDeletion given its fields.
+ *
+ * @param username The username (not empty).
+ * @param mechanism SASL/SCRAM mechanism.
+ * @return A newly created instance of rd_kafka_UserScramCredentialAlteration_t.
+ *         Ownership belongs to the caller, use
+ *         rd_kafka_UserScramCredentialAlteration_destroy to destroy.
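+ *
+ * A minimal usage sketch (illustrative; assumes an initialized client
+ * instance \c rk and a result queue \c rkqu, and that the alterations are
+ * copied internally so they can be destroyed right after the call):
+ * @code
+ * rd_kafka_UserScramCredentialAlteration_t *alterations[2];
+ * alterations[0] = rd_kafka_UserScramCredentialUpsertion_new(
+ *     "alice", RD_KAFKA_SCRAM_MECHANISM_SHA_256, 8192,
+ *     (const unsigned char *)"password", 8, NULL, 0);
+ * alterations[1] = rd_kafka_UserScramCredentialDeletion_new(
+ *     "bob", RD_KAFKA_SCRAM_MECHANISM_SHA_512);
+ * rd_kafka_AlterUserScramCredentials(rk, alterations, 2, NULL, rkqu);
+ * rd_kafka_UserScramCredentialAlteration_destroy_array(alterations, 2);
+ * @endcode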
+ */
+RD_EXPORT
+rd_kafka_UserScramCredentialAlteration_t *
+rd_kafka_UserScramCredentialDeletion_new(const char *username,
+                                         rd_kafka_ScramMechanism_t mechanism);
+
+
+/**
+ * @brief Destroys a UserScramCredentialAlteration given its pointer.
+ */
+RD_EXPORT
+void rd_kafka_UserScramCredentialAlteration_destroy(
+    rd_kafka_UserScramCredentialAlteration_t *alteration);
+
+/**
+ * @brief Destroys an array of UserScramCredentialAlteration objects.
+ */
+RD_EXPORT
+void rd_kafka_UserScramCredentialAlteration_destroy_array(
+    rd_kafka_UserScramCredentialAlteration_t **alterations,
+    size_t alteration_cnt);
+
+/**
+ * @brief Result of a single user SCRAM alteration.
+ */
+typedef struct rd_kafka_AlterUserScramCredentials_result_response_s
+    rd_kafka_AlterUserScramCredentials_result_response_t;
+
+/**
+ * @brief Returns the username for a
+ *        rd_kafka_AlterUserScramCredentials_result_response.
+ */
+RD_EXPORT
+const char *rd_kafka_AlterUserScramCredentials_result_response_user(
+    const rd_kafka_AlterUserScramCredentials_result_response_t *response);
+
+/**
+ * @brief Returns the error of a
+ *        rd_kafka_AlterUserScramCredentials_result_response.
+ */
+RD_EXPORT
+const rd_kafka_error_t *
+rd_kafka_AlterUserScramCredentials_result_response_error(
+    const rd_kafka_AlterUserScramCredentials_result_response_t *response);
+
+/**
+ * @brief Get an array of responses from an AlterUserScramCredentials result.
+ *
+ * The returned value's lifetime is the same as that of the \p result object.
+ *
+ * @param result Result to get responses from.
+ * @param cntp is updated to the number of elements in the array.
+ */
+RD_EXPORT
+const rd_kafka_AlterUserScramCredentials_result_response_t **
+rd_kafka_AlterUserScramCredentials_result_responses(
+    const rd_kafka_AlterUserScramCredentials_result_t *result,
+    size_t *cntp);
+
+/**
+ * @brief Alter SASL/SCRAM credentials.
+ *        This operation is supported by brokers with version 2.7.0 or higher.
+ *
+ * @remark For upsertions to be processed, librdkafka must be built with
+ *         OpenSSL support; it is needed to calculate the HMAC.
+ *
+ * @param rk Client instance.
+ * @param alterations The alterations to be applied.
+ * @param alteration_cnt Number of elements in \p alterations array.
+ * @param options Optional admin options, or NULL for defaults.
+ * @param rkqu Queue to emit result on.
+ */
+RD_EXPORT
+void rd_kafka_AlterUserScramCredentials(
+    rd_kafka_t *rk,
+    rd_kafka_UserScramCredentialAlteration_t **alterations,
+    size_t alteration_cnt,
+    const rd_kafka_AdminOptions_t *options,
+    rd_kafka_queue_t *rkqu);
+
+/**@}*/
+
 /**
  * @name Admin API - ACL operations
  * @{
@@ -8396,32 +9753,6 @@ RD_EXPORT
 const rd_kafka_error_t *
 rd_kafka_acl_result_error(const rd_kafka_acl_result_t *aclres);
 
-/**
- * @enum rd_kafka_AclOperation_t
- * @brief Apache Kafka ACL operation types.
- */
-typedef enum rd_kafka_AclOperation_t {
-        RD_KAFKA_ACL_OPERATION_UNKNOWN = 0, /**< Unknown */
-        RD_KAFKA_ACL_OPERATION_ANY =
-            1, /**< In a filter, matches any AclOperation */
-        RD_KAFKA_ACL_OPERATION_ALL = 2,    /**< ALL operation */
-        RD_KAFKA_ACL_OPERATION_READ = 3,   /**< READ operation */
-        RD_KAFKA_ACL_OPERATION_WRITE = 4,  /**< WRITE operation */
-        RD_KAFKA_ACL_OPERATION_CREATE = 5, /**< CREATE operation */
-        RD_KAFKA_ACL_OPERATION_DELETE = 6, /**< DELETE operation */
-        RD_KAFKA_ACL_OPERATION_ALTER = 7,  /**< ALTER operation */
-        RD_KAFKA_ACL_OPERATION_DESCRIBE = 8, /**< DESCRIBE operation */
-        RD_KAFKA_ACL_OPERATION_CLUSTER_ACTION =
-            9, /**< CLUSTER_ACTION operation */
-        RD_KAFKA_ACL_OPERATION_DESCRIBE_CONFIGS =
-            10, /**< DESCRIBE_CONFIGS operation */
-        RD_KAFKA_ACL_OPERATION_ALTER_CONFIGS =
-            11, /**< ALTER_CONFIGS operation */
-        RD_KAFKA_ACL_OPERATION_IDEMPOTENT_WRITE =
-            12, /**< IDEMPOTENT_WRITE operation */
-        RD_KAFKA_ACL_OPERATION__CNT
-} rd_kafka_AclOperation_t;
-
 /**
  * @returns a string representation of the \p acl_operation
  */
@@ -8715,6 +10046,100 @@ RD_EXPORT void rd_kafka_DeleteAcls(rd_kafka_t *rk,
 
 /**@}*/
 
+/**
+ * @name Admin API - Elect Leaders
+ * @{
+ */
+
+/**
+ * @brief Represents an ElectLeaders request.
+ */
+typedef struct rd_kafka_ElectLeaders_s rd_kafka_ElectLeaders_t;
+
+/**
+ * @enum rd_kafka_ElectionType_t
+ * @brief Apache Kafka Election Types
+ */
+typedef enum rd_kafka_ElectionType_t {
+        RD_KAFKA_ELECTION_TYPE_PREFERRED = 0, /**< Preferred Replica Election */
+        RD_KAFKA_ELECTION_TYPE_UNCLEAN = 1,   /**< Unclean Election */
+} rd_kafka_ElectionType_t;
+
+/**
+ * @brief Create a new rd_kafka_ElectLeaders_t object. This object is later
+ *        passed to rd_kafka_ElectLeaders().
+ *
+ * @param election_type The election type that needs to be performed,
+ *                      preferred or unclean.
+ * @param partitions The topic partitions for which the leader election
+ *                   needs to be performed.
+ *
+ * @returns a newly allocated ElectLeaders object, or NULL in case of an
+ *          invalid election_type.
+ *          Use rd_kafka_ElectLeaders_destroy() to free the object when done.
+ */
+RD_EXPORT rd_kafka_ElectLeaders_t *
+rd_kafka_ElectLeaders_new(rd_kafka_ElectionType_t election_type,
+                          rd_kafka_topic_partition_list_t *partitions);
+
+/**
+ * @brief Destroy and free a rd_kafka_ElectLeaders_t object previously created
+ *        with rd_kafka_ElectLeaders_new().
+ *
+ * @param elect_leaders The rd_kafka_ElectLeaders_t object to be destroyed.
+ */
+RD_EXPORT void
+rd_kafka_ElectLeaders_destroy(rd_kafka_ElectLeaders_t *elect_leaders);
+
+/**
+ * @brief Elect Leaders for the provided Topic Partitions
+ *        according to the specified election type.
+ *
+ * @param rk Client instance.
+ * @param elect_leaders The elect leaders request containing
+ *                      election type and partitions information.
+ * @param options Optional admin options, or NULL for defaults.
+ * @param rkqu Queue to emit result on.
+ *
+ * Supported admin options:
+ *  - rd_kafka_AdminOptions_set_operation_timeout() - default 60 seconds.
+ *    Controls how long the brokers will wait for the election to complete.
+ *  - rd_kafka_AdminOptions_set_request_timeout() - default socket.timeout.ms.
+ *    Controls how long \c rdkafka will wait for the request to complete.
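+ *
+ * A minimal usage sketch (illustrative; assumes an initialized client
+ * instance \c rk, a partition list \c partitions and a result queue
+ * \c rkqu, and that the request is copied internally so it can be
+ * destroyed right after the call):
+ * @code
+ * rd_kafka_ElectLeaders_t *elect_leaders = rd_kafka_ElectLeaders_new(
+ *     RD_KAFKA_ELECTION_TYPE_PREFERRED, partitions);
+ * rd_kafka_ElectLeaders(rk, elect_leaders, NULL, rkqu);
+ * rd_kafka_ElectLeaders_destroy(elect_leaders);
+ * @endcode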
+ *
+ * @remark The result event type emitted on the supplied queue is of type
+ *         \c RD_KAFKA_EVENT_ELECTLEADERS_RESULT
+ * @remark If \p partitions is NULL, the broker will attempt leader
+ *         election for all partitions, but the results will contain only
+ *         partitions for which an election took place or resulted in an
+ *         error.
+ */
+RD_EXPORT void rd_kafka_ElectLeaders(rd_kafka_t *rk,
+                                     rd_kafka_ElectLeaders_t *elect_leaders,
+                                     const rd_kafka_AdminOptions_t *options,
+                                     rd_kafka_queue_t *rkqu);
+
+/**
+ * @brief Gets the array of topic partition result objects from the
+ *        elect leaders result event and populates the size of the
+ *        array in \p cntp.
+ *
+ * @param result The elect leaders result.
+ * @param cntp The number of elements in the array.
+ *
+ * @returns the array of topic partition result objects from the
+ *          elect leaders result event.
+ */
+RD_EXPORT const rd_kafka_topic_partition_result_t **
+rd_kafka_ElectLeaders_result_partitions(
+    const rd_kafka_ElectLeaders_result_t *result,
+    size_t *cntp);
+
+/**@}*/
+
 /**
  * @name Security APIs
  * @{
diff --git a/src/third_party/librdkafka/dist/src/rdkafka_admin.c b/src/third_party/librdkafka/dist/src/rdkafka_admin.c
index d6863f43e1f..b2671f3c821 100644
--- a/src/third_party/librdkafka/dist/src/rdkafka_admin.c
+++ b/src/third_party/librdkafka/dist/src/rdkafka_admin.c
@@ -1,7 +1,8 @@
 /*
  * librdkafka - Apache Kafka C library
  *
- * Copyright (c) 2018 Magnus Edenhill
+ * Copyright (c) 2018-2022, Magnus Edenhill
+ *               2023, Confluent Inc.
  * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -230,11 +231,12 @@ static const char *rd_kafka_admin_state_desc[] = {
 * @enum Admin request target broker. Must be negative values since the field
 *       used is broker_id.
*/ -enum { RD_KAFKA_ADMIN_TARGET_CONTROLLER = -1, /**< Cluster controller */ - RD_KAFKA_ADMIN_TARGET_COORDINATOR = -2, /**< (Group) Coordinator */ - RD_KAFKA_ADMIN_TARGET_FANOUT = -3, /**< This rko is a fanout and - * and has no target broker */ - RD_KAFKA_ADMIN_TARGET_ALL = -4, /**< All available brokers */ +enum { + RD_KAFKA_ADMIN_TARGET_CONTROLLER = -1, /**< Cluster controller */ + RD_KAFKA_ADMIN_TARGET_COORDINATOR = -2, /**< (Group) Coordinator */ + RD_KAFKA_ADMIN_TARGET_FANOUT = -3, /**< This rko is a fanout and + * and has no target broker */ + RD_KAFKA_ADMIN_TARGET_ALL = -4, /**< All available brokers */ }; /** @@ -380,6 +382,8 @@ static rd_kafka_op_t *rd_kafka_admin_result_new(rd_kafka_op_t *rko_req) { rko_result->rko_evtype = rko_req->rko_u.admin_request.reply_event_type; + rko_result->rko_u.admin_result.cbs = rko_req->rko_u.admin_request.cbs; + return rko_result; } @@ -427,6 +431,8 @@ static RD_UNUSED RD_FORMAT(printf, 3, 4) void rd_kafka_admin_result_set_err( */ static RD_INLINE void rd_kafka_admin_result_enq(rd_kafka_op_t *rko_req, rd_kafka_op_t *rko_result) { + if (rko_req->rko_u.admin_result.result_cb) + rko_req->rko_u.admin_result.result_cb(rko_result); rd_kafka_replyq_enq(&rko_req->rko_u.admin_request.replyq, rko_result, rko_req->rko_u.admin_request.replyq.version); } @@ -492,8 +498,8 @@ rd_kafka_admin_coord_request(rd_kafka_broker_t *rkb, rkb, &rko->rko_u.admin_request.args, &rko->rko_u.admin_request.options, errstr, sizeof(errstr), replyq, rd_kafka_admin_handle_response, eonce); + if (err) { - rd_kafka_enq_once_del_source(eonce, "coordinator response"); rd_kafka_admin_result_fail( rko, err, "%s worker failed to send request: %s", rd_kafka_op2str(rko->rko_type), errstr); @@ -528,7 +534,8 @@ rd_kafka_admin_result_ret_resources(const rd_kafka_op_t *rko, size_t *cntp) { rd_kafka_op_type_t reqtype = rko->rko_u.admin_result.reqtype & ~RD_KAFKA_OP_FLAGMASK; rd_assert(reqtype == RD_KAFKA_OP_ALTERCONFIGS || - reqtype == RD_KAFKA_OP_DESCRIBECONFIGS); + reqtype == RD_KAFKA_OP_DESCRIBECONFIGS || + reqtype == RD_KAFKA_OP_INCREMENTALALTERCONFIGS); *cntp = rd_list_cnt(&rko->rko_u.admin_result.results); return (const rd_kafka_ConfigResource_t **) @@ -658,6 +665,12 @@ rd_kafka_admin_request_op_new(rd_kafka_t *rk, return rko; } +static void +rd_kafka_admin_request_op_result_cb_set(rd_kafka_op_t *op, + void (*result_cb)(rd_kafka_op_t *)) { + op->rko_u.admin_result.result_cb = result_cb; +} + /** * @returns the remaining request timeout in milliseconds. @@ -1426,8 +1439,7 @@ static rd_kafka_op_res_t rd_kafka_admin_fanout_worker(rd_kafka_t *rk, NULL); /* Enqueue result on application queue, we're done. */ - rd_kafka_replyq_enq(&rko_fanout->rko_u.admin_request.replyq, rko_result, - rko_fanout->rko_u.admin_request.replyq.version); + rd_kafka_admin_result_enq(rko_fanout, rko_result); /* FALLTHRU */ if (rko_fanout->rko_u.admin_request.fanout.outstanding == 0) @@ -1480,6 +1492,35 @@ static rd_kafka_op_t *rd_kafka_admin_request_op_target_all_new( return rko; } + +/** + * @brief Construct MetadataRequest for use with AdminAPI (does not send). + * Common for DescribeTopics and DescribeCluster. + * + * @sa rd_kafka_MetadataRequest_resp_cb. 
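+ *
+ * The hardcoded arguments in the call below reflect that admin metadata
+ * requests never require topic creation, never update the consumer group,
+ * use no subscription version, and are always forced (bypassing the
+ * metadata cache).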
+ */ +static rd_kafka_resp_err_t +rd_kafka_admin_MetadataRequest(rd_kafka_broker_t *rkb, + const rd_list_t *topics, + const char *reason, + rd_bool_t include_cluster_authorized_operations, + rd_bool_t include_topic_authorized_operations, + rd_bool_t force_racks, + rd_kafka_resp_cb_t *resp_cb, + rd_kafka_replyq_t replyq, + void *opaque) { + return rd_kafka_MetadataRequest_resp_cb( + rkb, topics, NULL, reason, + rd_false /* No admin operation requires topic creation. */, + include_cluster_authorized_operations, + include_topic_authorized_operations, + rd_false /* No admin operation should update cgrp. */, + -1 /* No subscription version is used */, force_racks, resp_cb, + replyq, + rd_true /* Admin operation metadata requests are always forced. */, + opaque); +} + /**@}*/ @@ -1522,20 +1563,6 @@ rd_kafka_AdminOptions_set_validate_only(rd_kafka_AdminOptions_t *options, errstr, errstr_size); } -rd_kafka_resp_err_t -rd_kafka_AdminOptions_set_incremental(rd_kafka_AdminOptions_t *options, - int true_or_false, - char *errstr, - size_t errstr_size) { - rd_snprintf(errstr, errstr_size, - "Incremental updates currently not supported, see KIP-248"); - return RD_KAFKA_RESP_ERR__NOT_IMPLEMENTED; - - return rd_kafka_confval_set_type(&options->incremental, - RD_KAFKA_CONFVAL_INT, &true_or_false, - errstr, errstr_size); -} - rd_kafka_resp_err_t rd_kafka_AdminOptions_set_broker(rd_kafka_AdminOptions_t *options, int32_t broker_id, @@ -1547,6 +1574,16 @@ rd_kafka_AdminOptions_set_broker(rd_kafka_AdminOptions_t *options, &ibroker_id, errstr, errstr_size); } +rd_kafka_error_t * +rd_kafka_AdminOptions_set_isolation_level(rd_kafka_AdminOptions_t *options, + rd_kafka_IsolationLevel_t value) { + char errstr[512]; + rd_kafka_resp_err_t err = rd_kafka_confval_set_type( + &options->isolation_level, RD_KAFKA_CONFVAL_INT, &value, errstr, + sizeof(errstr)); + return !err ? NULL : rd_kafka_error_new(err, "%s", errstr); +} + rd_kafka_error_t *rd_kafka_AdminOptions_set_require_stable_offsets( rd_kafka_AdminOptions_t *options, int true_or_false) { @@ -1557,6 +1594,16 @@ rd_kafka_error_t *rd_kafka_AdminOptions_set_require_stable_offsets( return !err ? NULL : rd_kafka_error_new(err, "%s", errstr); } +rd_kafka_error_t *rd_kafka_AdminOptions_set_include_authorized_operations( + rd_kafka_AdminOptions_t *options, + int true_or_false) { + char errstr[512]; + rd_kafka_resp_err_t err = rd_kafka_confval_set_type( + &options->include_authorized_operations, RD_KAFKA_CONFVAL_INT, + &true_or_false, errstr, sizeof(errstr)); + return !err ? NULL : rd_kafka_error_new(err, "%s", errstr); +} + rd_kafka_error_t *rd_kafka_AdminOptions_set_match_consumer_group_states( rd_kafka_AdminOptions_t *options, const rd_kafka_consumer_group_state_t *consumer_group_states, @@ -1604,6 +1651,60 @@ rd_kafka_error_t *rd_kafka_AdminOptions_set_match_consumer_group_states( return !err ? 
                          NULL : rd_kafka_error_new(err, "%s", errstr);
 }
 
+rd_kafka_error_t *rd_kafka_AdminOptions_set_match_consumer_group_types(
+    rd_kafka_AdminOptions_t *options,
+    const rd_kafka_consumer_group_type_t *consumer_group_types,
+    size_t consumer_group_types_cnt) {
+        size_t i;
+        char errstr[512];
+        rd_kafka_resp_err_t err;
+        rd_list_t *types_list = rd_list_new(0, NULL);
+        uint64_t types_bitmask = 0;
+
+        rd_list_init_int32(types_list, consumer_group_types_cnt);
+
+        if (RD_KAFKA_CONSUMER_GROUP_TYPE__CNT >= 64) {
+                /* Deref so the assertion actually fires: a plain string
+                 * literal is always true. */
+                rd_assert(!*"BUG: cannot handle types with a bitmask anymore");
+        }
+
+        for (i = 0; i < consumer_group_types_cnt; i++) {
+                uint64_t type_bit;
+                rd_kafka_consumer_group_type_t type = consumer_group_types[i];
+
+                if (type < RD_KAFKA_CONSUMER_GROUP_TYPE_UNKNOWN ||
+                    type >= RD_KAFKA_CONSUMER_GROUP_TYPE__CNT) {
+                        rd_list_destroy(types_list);
+                        return rd_kafka_error_new(
+                            RD_KAFKA_RESP_ERR__INVALID_ARG,
+                            "Only a valid type is allowed");
+                } else if (type == RD_KAFKA_CONSUMER_GROUP_TYPE_UNKNOWN) {
+                        rd_list_destroy(types_list);
+                        return rd_kafka_error_new(
+                            RD_KAFKA_RESP_ERR__INVALID_ARG,
+                            "UNKNOWN type is not allowed");
+                }
+
+                type_bit = (uint64_t)1 << type;
+                if (types_bitmask & type_bit) {
+                        rd_list_destroy(types_list);
+                        return rd_kafka_error_new(
+                            RD_KAFKA_RESP_ERR__INVALID_ARG,
+                            "Duplicate types not allowed");
+                } else {
+                        types_bitmask = types_bitmask | type_bit;
+                        rd_list_set_int32(types_list, (int32_t)i, type);
+                }
+        }
+
+        err = rd_kafka_confval_set_type(&options->match_consumer_group_types,
+                                        RD_KAFKA_CONFVAL_PTR, types_list,
+                                        errstr, sizeof(errstr));
+        if (err) {
+                rd_list_destroy(types_list);
+        }
+        return !err ? NULL : rd_kafka_error_new(err, "%s", errstr);
+}
+
 void rd_kafka_AdminOptions_set_opaque(rd_kafka_AdminOptions_t *options,
                                       void *opaque) {
         rd_kafka_confval_set_type(&options->opaque, RD_KAFKA_CONFVAL_PTR,
@@ -1624,7 +1725,9 @@ static void rd_kafka_AdminOptions_init(rd_kafka_t *rk,
             options->for_api == RD_KAFKA_ADMIN_OP_CREATETOPICS ||
             options->for_api == RD_KAFKA_ADMIN_OP_DELETETOPICS ||
             options->for_api == RD_KAFKA_ADMIN_OP_CREATEPARTITIONS ||
-            options->for_api == RD_KAFKA_ADMIN_OP_DELETERECORDS)
+            options->for_api == RD_KAFKA_ADMIN_OP_DELETERECORDS ||
+            options->for_api == RD_KAFKA_ADMIN_OP_LISTOFFSETS ||
+            options->for_api == RD_KAFKA_ADMIN_OP_ELECTLEADERS)
                 rd_kafka_confval_init_int(&options->operation_timeout,
                                           "operation_timeout", -1, 3600 * 1000,
                                           rk->rk_conf.admin.request_timeout_ms);
@@ -1635,20 +1738,14 @@ static void rd_kafka_AdminOptions_init(rd_kafka_t *rk,
         if (options->for_api == RD_KAFKA_ADMIN_OP_ANY ||
             options->for_api == RD_KAFKA_ADMIN_OP_CREATETOPICS ||
             options->for_api == RD_KAFKA_ADMIN_OP_CREATEPARTITIONS ||
-            options->for_api == RD_KAFKA_ADMIN_OP_ALTERCONFIGS)
+            options->for_api == RD_KAFKA_ADMIN_OP_ALTERCONFIGS ||
+            options->for_api == RD_KAFKA_ADMIN_OP_INCREMENTALALTERCONFIGS)
                 rd_kafka_confval_init_int(&options->validate_only,
                                           "validate_only", 0, 1, 0);
         else
                 rd_kafka_confval_disable(&options->validate_only,
                                          "validate_only");
 
-        if (options->for_api == RD_KAFKA_ADMIN_OP_ANY ||
-            options->for_api == RD_KAFKA_ADMIN_OP_ALTERCONFIGS)
-                rd_kafka_confval_init_int(&options->incremental, "incremental",
-                                          0, 1, 0);
-        else
-                rd_kafka_confval_disable(&options->incremental, "incremental");
-
         if (options->for_api == RD_KAFKA_ADMIN_OP_ANY ||
             options->for_api == RD_KAFKA_ADMIN_OP_LISTCONSUMERGROUPOFFSETS)
                 rd_kafka_confval_init_int(&options->require_stable_offsets,
@@ -1657,6 +1754,18 @@
rd_kafka_confval_disable(&options->require_stable_offsets, "require_stable_offsets"); + if (options->for_api == RD_KAFKA_ADMIN_OP_ANY || + options->for_api == RD_KAFKA_ADMIN_OP_DESCRIBECONSUMERGROUPS || + options->for_api == RD_KAFKA_ADMIN_OP_DESCRIBECLUSTER || + options->for_api == RD_KAFKA_ADMIN_OP_DESCRIBETOPICS) + rd_kafka_confval_init_int( + &options->include_authorized_operations, + "include_authorized_operations", 0, 1, 0); + else + rd_kafka_confval_disable( + &options->include_authorized_operations, + "include_authorized_operations"); + if (options->for_api == RD_KAFKA_ADMIN_OP_ANY || options->for_api == RD_KAFKA_ADMIN_OP_LISTCONSUMERGROUPS) rd_kafka_confval_init_ptr(&options->match_consumer_group_states, @@ -1665,6 +1774,22 @@ static void rd_kafka_AdminOptions_init(rd_kafka_t *rk, rd_kafka_confval_disable(&options->match_consumer_group_states, "match_consumer_group_states"); + if (options->for_api == RD_KAFKA_ADMIN_OP_ANY || + options->for_api == RD_KAFKA_ADMIN_OP_LISTCONSUMERGROUPS) + rd_kafka_confval_init_ptr(&options->match_consumer_group_types, + "match_consumer_group_types"); + else + rd_kafka_confval_disable(&options->match_consumer_group_types, + "match_consumer_group_types"); + + if (options->for_api == RD_KAFKA_ADMIN_OP_ANY || + options->for_api == RD_KAFKA_ADMIN_OP_LISTOFFSETS) + rd_kafka_confval_init_int(&options->isolation_level, + "isolation_level", 0, 1, 0); + else + rd_kafka_confval_disable(&options->isolation_level, + "isolation_level"); + rd_kafka_confval_init_int(&options->broker, "broker", 0, INT32_MAX, -1); rd_kafka_confval_init_ptr(&options->opaque, "opaque"); } @@ -1689,6 +1814,16 @@ static void rd_kafka_AdminOptions_copy_to(rd_kafka_AdminOptions_t *dst, states_list_copy, errstr, sizeof(errstr)); rd_assert(!err); } + if (src->match_consumer_group_types.u.PTR) { + char errstr[512]; + rd_list_t *types_list_copy = rd_list_copy_preallocated( + src->match_consumer_group_types.u.PTR, NULL); + + rd_kafka_resp_err_t err = rd_kafka_confval_set_type( + &dst->match_consumer_group_types, RD_KAFKA_CONFVAL_PTR, + types_list_copy, errstr, sizeof(errstr)); + rd_assert(!err); + } } @@ -1712,6 +1847,9 @@ void rd_kafka_AdminOptions_destroy(rd_kafka_AdminOptions_t *options) { if (options->match_consumer_group_states.u.PTR) { rd_list_destroy(options->match_consumer_group_states.u.PTR); } + if (options->match_consumer_group_types.u.PTR) { + rd_list_destroy(options->match_consumer_group_types.u.PTR); + } rd_free(options); } @@ -1883,18 +2021,14 @@ rd_kafka_NewTopic_set_replica_assignment(rd_kafka_NewTopic_t *new_topic, * @brief Generic constructor of ConfigEntry which is also added to \p rl */ static rd_kafka_resp_err_t -rd_kafka_admin_add_config0(rd_list_t *rl, - const char *name, - const char *value, - rd_kafka_AlterOperation_t operation) { +rd_kafka_admin_add_config0(rd_list_t *rl, const char *name, const char *value) { rd_kafka_ConfigEntry_t *entry; if (!name) return RD_KAFKA_RESP_ERR__INVALID_ARG; - entry = rd_calloc(1, sizeof(*entry)); - entry->kv = rd_strtup_new(name, value); - entry->a.operation = operation; + entry = rd_calloc(1, sizeof(*entry)); + entry->kv = rd_strtup_new(name, value); rd_list_add(rl, entry); @@ -1902,11 +2036,36 @@ rd_kafka_admin_add_config0(rd_list_t *rl, } +/** + * @brief Generic constructor of ConfigEntry for Incremental Alter Operations + * which is also added to \p rl + */ +static rd_kafka_error_t * +rd_kafka_admin_incremental_add_config0(rd_list_t *rl, + const char *name, + rd_kafka_AlterConfigOpType_t op_type, + const char *value) { + 
rd_kafka_ConfigEntry_t *entry; + + if (!name) { + return rd_kafka_error_new(RD_KAFKA_RESP_ERR__INVALID_ARG, + "Config name is required"); + } + + entry = rd_calloc(1, sizeof(*entry)); + entry->kv = rd_strtup_new(name, value); + entry->a.op_type = op_type; + + rd_list_add(rl, entry); + + return NULL; +} + + rd_kafka_resp_err_t rd_kafka_NewTopic_set_config(rd_kafka_NewTopic_t *new_topic, const char *name, const char *value) { - return rd_kafka_admin_add_config0(&new_topic->config, name, value, - RD_KAFKA_ALTER_OP_ADD); + return rd_kafka_admin_add_config0(&new_topic->config, name, value); } @@ -1996,7 +2155,7 @@ rd_kafka_CreateTopicsResponse_parse(rd_kafka_op_t *rko_req, * does not maintain ordering unfortunately. */ skel.topic = terr->topic; orig_pos = rd_list_index(&rko_result->rko_u.admin_result.args, - &skel, rd_kafka_NewTopic_cmp); + &skel, rd_kafka_NewTopic_cmp); if (orig_pos == -1) { rd_kafka_topic_result_destroy(terr); rd_kafka_buf_parse_fail( @@ -2205,7 +2364,7 @@ rd_kafka_DeleteTopicsResponse_parse(rd_kafka_op_t *rko_req, * does not maintain ordering unfortunately. */ skel.topic = terr->topic; orig_pos = rd_list_index(&rko_result->rko_u.admin_result.args, - &skel, rd_kafka_DeleteTopic_cmp); + &skel, rd_kafka_DeleteTopic_cmp); if (orig_pos == -1) { rd_kafka_topic_result_destroy(terr); rd_kafka_buf_parse_fail( @@ -2490,7 +2649,7 @@ rd_kafka_CreatePartitionsResponse_parse(rd_kafka_op_t *rko_req, * does not maintain ordering unfortunately. */ skel.topic = terr->topic; orig_pos = rd_list_index(&rko_result->rko_u.admin_result.args, - &skel, rd_kafka_NewPartitions_cmp); + &skel, rd_kafka_NewPartitions_cmp); if (orig_pos == -1) { rd_kafka_topic_result_destroy(terr); rd_kafka_buf_parse_fail( @@ -2709,9 +2868,15 @@ rd_kafka_ConfigEntry_synonyms(const rd_kafka_ConfigEntry_t *entry, const char *rd_kafka_ConfigSource_name(rd_kafka_ConfigSource_t confsource) { static const char *names[] = { - "UNKNOWN_CONFIG", "DYNAMIC_TOPIC_CONFIG", - "DYNAMIC_BROKER_CONFIG", "DYNAMIC_DEFAULT_BROKER_CONFIG", - "STATIC_BROKER_CONFIG", "DEFAULT_CONFIG", + "UNKNOWN_CONFIG", + "DYNAMIC_TOPIC_CONFIG", + "DYNAMIC_BROKER_CONFIG", + "DYNAMIC_DEFAULT_BROKER_CONFIG", + "STATIC_BROKER_CONFIG", + "DEFAULT_CONFIG", + "DYNAMIC_BROKER_LOGGER_CONFIG", + "CLIENT_METRICS_CONFIG", + "GROUP_CONFIG", }; if ((unsigned int)confsource >= @@ -2746,9 +2911,8 @@ const char *rd_kafka_ResourcePatternType_name( } const char *rd_kafka_ResourceType_name(rd_kafka_ResourceType_t restype) { - static const char *names[] = { - "UNKNOWN", "ANY", "TOPIC", "GROUP", "BROKER", - }; + static const char *names[] = {"UNKNOWN", "ANY", "TOPIC", + "GROUP", "BROKER", "TRANSACTIONAL_ID"}; if ((unsigned int)restype >= (unsigned int)RD_KAFKA_RESOURCE__CNT) return "UNSUPPORTED"; @@ -2757,6 +2921,35 @@ const char *rd_kafka_ResourceType_name(rd_kafka_ResourceType_t restype) { } +rd_kafka_ConfigResourceType_t +rd_kafka_ResourceType_to_ConfigResourceType(rd_kafka_ResourceType_t restype) { + switch (restype) { + case RD_KAFKA_RESOURCE_TOPIC: + return RD_KAFKA_CONFIG_RESOURCE_TOPIC; + case RD_KAFKA_RESOURCE_BROKER: + return RD_KAFKA_CONFIG_RESOURCE_BROKER; + case RD_KAFKA_RESOURCE_GROUP: + return RD_KAFKA_CONFIG_RESOURCE_GROUP; + default: + return RD_KAFKA_CONFIG_RESOURCE_UNKNOWN; + } +} + +rd_kafka_ResourceType_t rd_kafka_ConfigResourceType_to_ResourceType( + rd_kafka_ConfigResourceType_t config_resource_type) { + switch (config_resource_type) { + case RD_KAFKA_CONFIG_RESOURCE_TOPIC: + return RD_KAFKA_RESOURCE_TOPIC; + case RD_KAFKA_CONFIG_RESOURCE_BROKER: + return 
RD_KAFKA_RESOURCE_BROKER; + case RD_KAFKA_CONFIG_RESOURCE_GROUP: + return RD_KAFKA_RESOURCE_GROUP; + default: + return RD_KAFKA_RESOURCE_UNKNOWN; + } +} + + rd_kafka_ConfigResource_t * rd_kafka_ConfigResource_new(rd_kafka_ResourceType_t restype, const char *resname) { @@ -2831,18 +3024,6 @@ rd_kafka_ConfigResource_add_ConfigEntry(rd_kafka_ConfigResource_t *config, rd_list_add(&config->config, entry); } - -rd_kafka_resp_err_t -rd_kafka_ConfigResource_add_config(rd_kafka_ConfigResource_t *config, - const char *name, - const char *value) { - if (!name || !*name || !value) - return RD_KAFKA_RESP_ERR__INVALID_ARG; - - return rd_kafka_admin_add_config0(&config->config, name, value, - RD_KAFKA_ALTER_OP_ADD); -} - rd_kafka_resp_err_t rd_kafka_ConfigResource_set_config(rd_kafka_ConfigResource_t *config, const char *name, @@ -2850,18 +3031,35 @@ rd_kafka_ConfigResource_set_config(rd_kafka_ConfigResource_t *config, if (!name || !*name || !value) return RD_KAFKA_RESP_ERR__INVALID_ARG; - return rd_kafka_admin_add_config0(&config->config, name, value, - RD_KAFKA_ALTER_OP_SET); + return rd_kafka_admin_add_config0(&config->config, name, value); } -rd_kafka_resp_err_t -rd_kafka_ConfigResource_delete_config(rd_kafka_ConfigResource_t *config, - const char *name) { - if (!name || !*name) - return RD_KAFKA_RESP_ERR__INVALID_ARG; - return rd_kafka_admin_add_config0(&config->config, name, NULL, - RD_KAFKA_ALTER_OP_DELETE); +rd_kafka_error_t *rd_kafka_ConfigResource_add_incremental_config( + rd_kafka_ConfigResource_t *config, + const char *name, + rd_kafka_AlterConfigOpType_t op_type, + const char *value) { + if (op_type < 0 || op_type >= RD_KAFKA_ALTER_CONFIG_OP_TYPE__CNT) { + return rd_kafka_error_new( + RD_KAFKA_RESP_ERR__INVALID_ARG, + "Invalid alter config operation type"); + } + + if (!name || !*name) { + return rd_kafka_error_new(RD_KAFKA_RESP_ERR__INVALID_ARG, + !name + ? 
"Config name is required" + : "Config name mustn't be empty"); + } + + if (op_type != RD_KAFKA_ALTER_CONFIG_OP_TYPE_DELETE && !value) { + return rd_kafka_error_new(RD_KAFKA_RESP_ERR__INVALID_ARG, + "Config value is required"); + } + + return rd_kafka_admin_incremental_add_config0(&config->config, name, + op_type, value); } @@ -2995,7 +3193,7 @@ rd_kafka_AlterConfigsResponse_parse(rd_kafka_op_t *rko_req, rd_kafka_buf_read_i32(reply, &Throttle_Time); rd_kafka_op_throttle_time(rkb, rk->rk_rep, Throttle_Time); - rd_kafka_buf_read_i32(reply, &res_cnt); + rd_kafka_buf_read_arraycnt(reply, &res_cnt, RD_KAFKAP_CONFIGS_MAX); if (res_cnt > rd_list_cnt(&rko_req->rko_u.admin_request.args)) { rd_snprintf(errstr, errstr_size, @@ -3016,6 +3214,7 @@ rd_kafka_AlterConfigsResponse_parse(rd_kafka_op_t *rko_req, int16_t error_code; rd_kafkap_str_t error_msg; int8_t res_type; + int8_t config_resource_type; rd_kafkap_str_t kres_name; char *res_name; char *this_errstr = NULL; @@ -3025,9 +3224,13 @@ rd_kafka_AlterConfigsResponse_parse(rd_kafka_op_t *rko_req, rd_kafka_buf_read_i16(reply, &error_code); rd_kafka_buf_read_str(reply, &error_msg); - rd_kafka_buf_read_i8(reply, &res_type); + rd_kafka_buf_read_i8(reply, &config_resource_type); rd_kafka_buf_read_str(reply, &kres_name); RD_KAFKAP_STR_DUPA(&res_name, &kres_name); + rd_kafka_buf_skip_tags(reply); + + res_type = rd_kafka_ConfigResourceType_to_ResourceType( + config_resource_type); if (error_code) { if (RD_KAFKAP_STR_IS_NULL(&error_msg) || @@ -3157,6 +3360,281 @@ const rd_kafka_ConfigResource_t **rd_kafka_AlterConfigs_result_resources( +/** + * @name IncrementalAlterConfigs + * @{ + * + * + * + */ + + + +/** + * @brief Parse IncrementalAlterConfigsResponse and create ADMIN_RESULT op. + */ +static rd_kafka_resp_err_t +rd_kafka_IncrementalAlterConfigsResponse_parse(rd_kafka_op_t *rko_req, + rd_kafka_op_t **rko_resultp, + rd_kafka_buf_t *reply, + char *errstr, + size_t errstr_size) { + const int log_decode_errors = LOG_ERR; + rd_kafka_broker_t *rkb = reply->rkbuf_rkb; + rd_kafka_t *rk = rkb->rkb_rk; + rd_kafka_op_t *rko_result = NULL; + int32_t res_cnt; + int i; + int32_t Throttle_Time; + + rd_kafka_buf_read_i32(reply, &Throttle_Time); + rd_kafka_op_throttle_time(rkb, rk->rk_rep, Throttle_Time); + + rd_kafka_buf_read_arraycnt(reply, &res_cnt, RD_KAFKAP_CONFIGS_MAX); + + if (res_cnt != rd_list_cnt(&rko_req->rko_u.admin_request.args)) { + rd_snprintf(errstr, errstr_size, + "Received %" PRId32 + " ConfigResources in response " + "when %d were requested", + res_cnt, + rd_list_cnt(&rko_req->rko_u.admin_request.args)); + return RD_KAFKA_RESP_ERR__BAD_MSG; + } + + rko_result = rd_kafka_admin_result_new(rko_req); + + rd_list_init(&rko_result->rko_u.admin_result.results, res_cnt, + rd_kafka_ConfigResource_free); + + for (i = 0; i < (int)res_cnt; i++) { + int16_t error_code; + rd_kafkap_str_t error_msg; + int8_t res_type; + int8_t config_resource_type; + rd_kafkap_str_t kres_name; + char *res_name; + char *this_errstr = NULL; + rd_kafka_ConfigResource_t *config; + rd_kafka_ConfigResource_t skel; + int orig_pos; + + rd_kafka_buf_read_i16(reply, &error_code); + rd_kafka_buf_read_str(reply, &error_msg); + rd_kafka_buf_read_i8(reply, &config_resource_type); + rd_kafka_buf_read_str(reply, &kres_name); + RD_KAFKAP_STR_DUPA(&res_name, &kres_name); + rd_kafka_buf_skip_tags(reply); + + res_type = rd_kafka_ConfigResourceType_to_ResourceType( + config_resource_type); + + if (error_code) { + if (RD_KAFKAP_STR_IS_NULL(&error_msg) || + RD_KAFKAP_STR_LEN(&error_msg) == 0) + this_errstr = 
+ (char *)rd_kafka_err2str(error_code); + else + RD_KAFKAP_STR_DUPA(&this_errstr, &error_msg); + } + + config = rd_kafka_ConfigResource_new(res_type, res_name); + if (!config) { + rd_kafka_log(rko_req->rko_rk, LOG_ERR, "ADMIN", + "IncrementalAlterConfigs returned " + "unsupported ConfigResource #%d with " + "type %d and name \"%s\": ignoring", + i, res_type, res_name); + continue; + } + + config->err = error_code; + if (this_errstr) + config->errstr = rd_strdup(this_errstr); + + /* As a convenience to the application we insert result + * in the same order as they were requested. The broker + * does not maintain ordering unfortunately. */ + skel.restype = config->restype; + skel.name = config->name; + orig_pos = rd_list_index(&rko_result->rko_u.admin_result.args, + &skel, rd_kafka_ConfigResource_cmp); + if (orig_pos == -1) { + rd_kafka_ConfigResource_destroy(config); + rd_kafka_buf_parse_fail( + reply, + "Broker returned ConfigResource %d,%s " + "that was not " + "included in the original request", + res_type, res_name); + } + + if (rd_list_elem(&rko_result->rko_u.admin_result.results, + orig_pos) != NULL) { + rd_kafka_ConfigResource_destroy(config); + rd_kafka_buf_parse_fail( + reply, + "Broker returned ConfigResource %d,%s " + "multiple times", + res_type, res_name); + } + + rd_list_set(&rko_result->rko_u.admin_result.results, orig_pos, + config); + } + + *rko_resultp = rko_result; + + return RD_KAFKA_RESP_ERR_NO_ERROR; + +err_parse: + if (rko_result) + rd_kafka_op_destroy(rko_result); + + rd_snprintf( + errstr, errstr_size, + "IncrementalAlterConfigs response protocol parse failure: %s", + rd_kafka_err2str(reply->rkbuf_err)); + + return reply->rkbuf_err; +} + +typedef RD_MAP_TYPE(const char *, const rd_bool_t *) map_str_bool; + + +void rd_kafka_IncrementalAlterConfigs(rd_kafka_t *rk, + rd_kafka_ConfigResource_t **configs, + size_t config_cnt, + const rd_kafka_AdminOptions_t *options, + rd_kafka_queue_t *rkqu) { + rd_kafka_op_t *rko; + size_t i; + rd_kafka_resp_err_t err; + char errstr[256]; + rd_bool_t value = rd_true; + + static const struct rd_kafka_admin_worker_cbs cbs = { + rd_kafka_IncrementalAlterConfigsRequest, + rd_kafka_IncrementalAlterConfigsResponse_parse, + }; + + rd_assert(rkqu); + + rko = rd_kafka_admin_request_op_new( + rk, RD_KAFKA_OP_INCREMENTALALTERCONFIGS, + RD_KAFKA_EVENT_INCREMENTALALTERCONFIGS_RESULT, &cbs, options, + rkqu->rkqu_q); + + rd_list_init(&rko->rko_u.admin_request.args, (int)config_cnt, + rd_kafka_ConfigResource_free); + + /* Check duplicate ConfigResource */ + map_str_bool configs_map = RD_MAP_INITIALIZER( + config_cnt, rd_map_str_cmp, rd_map_str_hash, NULL, NULL); + + for (i = 0; i < config_cnt; i++) { + /* 2 chars for the decimal restype + 1 for the comma + * + 1 for the trailing zero. 
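+                 * E.g. restype RD_KAFKA_RESOURCE_TOPIC (2) and name
+                 * "mytopic" yield the key "2,mytopic".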
*/ + size_t len = 4 + strlen(configs[i]->name); + char *key = rd_alloca(len); + const rd_kafka_ConfigEntry_t **entries; + size_t entry_cnt, j; + + rd_snprintf(key, len - 1, "%d,%s", configs[i]->restype, + configs[i]->name); + if (RD_MAP_GET(&configs_map, key)) { + /* Duplicate ConfigResource found */ + break; + } + RD_MAP_SET(&configs_map, key, &value); + entries = + rd_kafka_ConfigResource_configs(configs[i], &entry_cnt); + + /* Check duplicate ConfigEntry */ + map_str_bool entries_map = RD_MAP_INITIALIZER( + entry_cnt, rd_map_str_cmp, rd_map_str_hash, NULL, NULL); + + for (j = 0; j < entry_cnt; j++) { + const rd_kafka_ConfigEntry_t *entry = entries[j]; + const char *key = rd_kafka_ConfigEntry_name(entry); + + if (RD_MAP_GET(&entries_map, key)) { + /* Duplicate ConfigEntry found */ + break; + } + RD_MAP_SET(&entries_map, key, &value); + } + RD_MAP_DESTROY(&entries_map); + + if (j != entry_cnt) { + RD_MAP_DESTROY(&configs_map); + rd_kafka_admin_result_fail( + rko, RD_KAFKA_RESP_ERR__INVALID_ARG, + "Duplicate ConfigEntry found"); + rd_kafka_admin_common_worker_destroy( + rk, rko, rd_true /*destroy*/); + return; + } + + rd_list_add(&rko->rko_u.admin_request.args, + rd_kafka_ConfigResource_copy(configs[i])); + } + + RD_MAP_DESTROY(&configs_map); + + if (i != config_cnt) { + rd_kafka_admin_result_fail(rko, RD_KAFKA_RESP_ERR__INVALID_ARG, + "Duplicate ConfigResource found"); + rd_kafka_admin_common_worker_destroy(rk, rko, + rd_true /*destroy*/); + return; + } + + /* If there's a BROKER resource in the list we need to + * speak directly to that broker rather than the controller. + * + * Multiple BROKER resources are not allowed. + */ + err = rd_kafka_ConfigResource_get_single_broker_id( + &rko->rko_u.admin_request.args, &rko->rko_u.admin_request.broker_id, + errstr, sizeof(errstr)); + if (err) { + rd_kafka_admin_result_fail(rko, err, "%s", errstr); + rd_kafka_admin_common_worker_destroy(rk, rko, + rd_true /*destroy*/); + return; + } + if (rko->rko_u.admin_request.broker_id != + RD_KAFKA_ADMIN_TARGET_CONTROLLER) { + /* Revert broker option to default if altering + * broker configs. 
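+                 * The destination is already carried in
+                 * admin_request.broker_id; leaving an explicitly set
+                 * broker option in place would override it.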
                 */
+                err = rd_kafka_confval_set_type(
+                    &rko->rko_u.admin_request.options.broker,
+                    RD_KAFKA_CONFVAL_INT, NULL, errstr, sizeof(errstr));
+                if (err) {
+                        rd_kafka_admin_result_fail(rko, err, "%s", errstr);
+                        rd_kafka_admin_common_worker_destroy(
+                            rk, rko, rd_true /*destroy*/);
+                        return;
+                }
+        }
+
+        rd_kafka_q_enq(rk->rk_ops, rko);
+}
+
+
+const rd_kafka_ConfigResource_t **
+rd_kafka_IncrementalAlterConfigs_result_resources(
+    const rd_kafka_IncrementalAlterConfigs_result_t *result,
+    size_t *cntp) {
+        return rd_kafka_admin_result_ret_resources(
+            (const rd_kafka_op_t *)result, cntp);
+}
+
+/**@}*/
+
+
+
 /**
  * @name DescribeConfigs
  * @{
@@ -3207,6 +3685,7 @@ rd_kafka_DescribeConfigsResponse_parse(rd_kafka_op_t *rko_req,
         for (i = 0; i < (int)res_cnt; i++) {
                 int16_t error_code;
                 rd_kafkap_str_t error_msg;
+                int8_t config_resource_type;
                 int8_t res_type;
                 rd_kafkap_str_t kres_name;
                 char *res_name;
@@ -3218,10 +3697,13 @@ rd_kafka_DescribeConfigsResponse_parse(rd_kafka_op_t *rko_req,
                 rd_kafka_buf_read_i16(reply, &error_code);
                 rd_kafka_buf_read_str(reply, &error_msg);
-                rd_kafka_buf_read_i8(reply, &res_type);
+                rd_kafka_buf_read_i8(reply, &config_resource_type);
                 rd_kafka_buf_read_str(reply, &kres_name);
                 RD_KAFKAP_STR_DUPA(&res_name, &kres_name);
 
+                res_type = rd_kafka_ConfigResourceType_to_ResourceType(
+                    config_resource_type);
+
                 if (error_code) {
                         if (RD_KAFKAP_STR_IS_NULL(&error_msg) ||
                             RD_KAFKAP_STR_LEN(&error_msg) == 0)
@@ -3576,8 +4058,14 @@ rd_kafka_DeleteRecordsResponse_parse(rd_kafka_op_t *rko_req,
 
         rd_kafka_buf_read_throttle_time(reply);
 
+
+        const rd_kafka_topic_partition_field_t fields[] = {
+            RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION,
+            RD_KAFKA_TOPIC_PARTITION_FIELD_OFFSET,
+            RD_KAFKA_TOPIC_PARTITION_FIELD_ERR,
+            RD_KAFKA_TOPIC_PARTITION_FIELD_END};
         offsets = rd_kafka_buf_read_topic_partitions(
-            reply, 0, rd_true /*read_offset*/, rd_true /*read_part_errs*/);
+            reply, rd_false /*don't use topic_id*/, rd_true, 0, fields);
         if (!offsets)
                 rd_kafka_buf_parse_fail(reply,
                                         "Failed to parse topic partitions");
@@ -3598,13 +4086,309 @@ err_parse:
         return reply->rkbuf_err;
 }
 
+/**
+ * @brief Creates and returns a ListOffsetsResultInfo with the given topic
+ *        partition and timestamp.
+ */
+rd_kafka_ListOffsetsResultInfo_t *
+rd_kafka_ListOffsetsResultInfo_new(rd_kafka_topic_partition_t *rktpar,
+                                   rd_ts_t timestamp) {
+        rd_kafka_ListOffsetsResultInfo_t *result_info;
+        result_info = rd_calloc(1, sizeof(*result_info));
+        result_info->timestamp = timestamp;
+        result_info->topic_partition = rd_kafka_topic_partition_copy(rktpar);
+        return result_info;
+}
+
+/**
+ * @brief Copies the ListOffsetsResultInfo.
+ */
+static rd_kafka_ListOffsetsResultInfo_t *rd_kafka_ListOffsetsResultInfo_copy(
+    const rd_kafka_ListOffsetsResultInfo_t *result_info) {
+        return rd_kafka_ListOffsetsResultInfo_new(result_info->topic_partition,
+                                                  result_info->timestamp);
+}
+
+/**
+ * @brief Same as rd_kafka_ListOffsetsResultInfo_copy() but suitable for
+ *        rd_list_copy(). The \p opaque is ignored.
+ */
+static void *rd_kafka_ListOffsetsResultInfo_copy_opaque(const void *element,
+                                                        void *opaque) {
+        return rd_kafka_ListOffsetsResultInfo_copy(element);
+}
+
+/**
+ * @brief Returns the topic partition of the passed \p result_info.
+ */
+const rd_kafka_topic_partition_t *
+rd_kafka_ListOffsetsResultInfo_topic_partition(
+    const rd_kafka_ListOffsetsResultInfo_t *result_info) {
+        return result_info->topic_partition;
+}
+
+/**
+ * @brief Returns the timestamp specified for the offset of the
+ *        rd_kafka_ListOffsetsResultInfo_t.
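+ *        The timestamp is -1 when none has been received for the
+ *        partition: results are initialized with -1 and only updated
+ *        from broker responses.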
+ */ +int64_t rd_kafka_ListOffsetsResultInfo_timestamp( + const rd_kafka_ListOffsetsResultInfo_t *result_info) { + return result_info->timestamp; +} + +static void rd_kafka_ListOffsetsResultInfo_destroy( + rd_kafka_ListOffsetsResultInfo_t *element) { + rd_kafka_topic_partition_destroy(element->topic_partition); + rd_free(element); +} + +static void rd_kafka_ListOffsetsResultInfo_destroy_free(void *element) { + rd_kafka_ListOffsetsResultInfo_destroy(element); +} + +/** + * @brief Merges the response of the partial request made for ListOffsets via + * the \p rko_partial into the \p rko_fanout responsible for the + * ListOffsets request. + * @param rko_fanout The rd_kafka_op_t corresponding to the whole original + * ListOffsets request. + * @param rko_partial The rd_kafka_op_t corresponding to the leader specific + * ListOffset request sent after leaders querying. + */ +static void +rd_kafka_ListOffsets_response_merge(rd_kafka_op_t *rko_fanout, + const rd_kafka_op_t *rko_partial) { + size_t partition_cnt; + size_t total_partitions; + size_t i, j; + rd_assert(rko_partial->rko_evtype == RD_KAFKA_EVENT_LISTOFFSETS_RESULT); + + partition_cnt = rd_list_cnt(&rko_partial->rko_u.admin_result.results); + total_partitions = + rd_list_cnt(&rko_fanout->rko_u.admin_request.fanout.results); + + for (i = 0; i < partition_cnt; i++) { + rd_kafka_ListOffsetsResultInfo_t *partial_result_info = + rd_list_elem(&rko_partial->rko_u.admin_result.results, i); + for (j = 0; j < total_partitions; j++) { + rd_kafka_ListOffsetsResultInfo_t *result_info = + rd_list_elem( + &rko_fanout->rko_u.admin_request.fanout.results, + j); + if (rd_kafka_topic_partition_cmp( + result_info->topic_partition, + partial_result_info->topic_partition) == 0) { + result_info->timestamp = + partial_result_info->timestamp; + rd_kafka_topic_partition_destroy( + result_info->topic_partition); + result_info->topic_partition = + rd_kafka_topic_partition_copy( + partial_result_info->topic_partition); + break; + } + } + } +} + +/** + * @brief Returns the array of pointers of rd_kafka_ListOffsetsResultInfo_t + * given rd_kafka_ListOffsets_result_t and populates the size of the array. + */ +const rd_kafka_ListOffsetsResultInfo_t ** +rd_kafka_ListOffsets_result_infos(const rd_kafka_ListOffsets_result_t *result, + size_t *cntp) { + *cntp = rd_list_cnt(&result->rko_u.admin_result.results); + return (const rd_kafka_ListOffsetsResultInfo_t **) + result->rko_u.admin_result.results.rl_elems; +} + +/** + * @brief Admin compatible API to parse the ListOffsetResponse buffer + * provided in \p reply. + */ +static rd_kafka_resp_err_t +rd_kafka_ListOffsetsResponse_parse(rd_kafka_op_t *rko_req, + rd_kafka_op_t **rko_resultp, + rd_kafka_buf_t *reply, + char *errstr, + size_t errstr_size) { + rd_list_t *result_list = + rd_list_new(1, rd_kafka_ListOffsetsResultInfo_destroy_free); + rd_kafka_op_t *rko_result; + rd_kafka_parse_ListOffsets(reply, NULL, result_list); + if (reply->rkbuf_err) { + rd_snprintf(errstr, errstr_size, + "Error parsing ListOffsets response: %s", + rd_kafka_err2str(reply->rkbuf_err)); + return reply->rkbuf_err; + } + + rko_result = rd_kafka_admin_result_new(rko_req); + rd_list_init_copy(&rko_result->rko_u.admin_result.results, result_list); + rd_list_copy_to(&rko_result->rko_u.admin_result.results, result_list, + rd_kafka_ListOffsetsResultInfo_copy_opaque, NULL); + rd_list_destroy(result_list); + + *rko_resultp = rko_result; + return RD_KAFKA_RESP_ERR_NO_ERROR; +} + +/** + * @brief Should the received error code cause a metadata refresh? 
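+ *
+ * Leader-related errors indicate that the cached leader information is
+ * stale; rd_kafka_ListOffsets_handle_result() evicts the affected topic
+ * from the metadata cache so it is refreshed on a subsequent request.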
+ */ +static rd_bool_t rd_kafka_admin_result_err_refresh(rd_kafka_resp_err_t err) { + switch (err) { + case RD_KAFKA_RESP_ERR_NOT_LEADER_OR_FOLLOWER: + case RD_KAFKA_RESP_ERR_LEADER_NOT_AVAILABLE: + return rd_true; + default: + return rd_false; + } +} + +/** + * @brief ListOffsets result handler for internal side effects. + */ +static void rd_kafka_ListOffsets_handle_result(rd_kafka_op_t *rko_result) { + rd_kafka_topic_partition_list_t *rktpars; + rd_kafka_ListOffsetsResultInfo_t *result_info; + rd_kafka_t *rk; + rd_kafka_resp_err_t err, rktpar_err; + rd_kafka_topic_partition_t *rktpar; + size_t i; + + err = rko_result->rko_err; + if (rd_list_empty(&rko_result->rko_u.admin_result.args) || + rd_list_empty(&rko_result->rko_u.admin_result.results)) + return; + + rk = rko_result->rko_rk; + rktpars = rd_list_elem(&rko_result->rko_u.admin_result.args, 0); + rd_kafka_wrlock(rk); + i = 0; + RD_KAFKA_TPLIST_FOREACH(rktpar, rktpars) { + result_info = + rd_list_elem(&rko_result->rko_u.admin_result.results, i); + rktpar_err = err ? err : result_info->topic_partition->err; + + if (rd_kafka_admin_result_err_refresh(rktpar_err)) { + rd_kafka_metadata_cache_delete_by_name(rk, + rktpar->topic); + } + i++; + } + rd_kafka_wrunlock(rk); +} + +/** + * @brief Call when leaders have been queried to progress the ListOffsets + * admin op to its next phase, sending ListOffsets to partition + * leaders. + */ +static rd_kafka_op_res_t +rd_kafka_ListOffsets_leaders_queried_cb(rd_kafka_t *rk, + rd_kafka_q_t *rkq, + rd_kafka_op_t *reply) { + + rd_kafka_resp_err_t err = reply->rko_err; + const rd_list_t *leaders = + reply->rko_u.leaders.leaders; /* Possibly NULL (on err) */ + rd_kafka_topic_partition_list_t *partitions = + reply->rko_u.leaders.partitions; /* Possibly NULL (on err) */ + rd_kafka_op_t *rko_fanout = reply->rko_u.leaders.opaque; + rd_kafka_topic_partition_list_t *topic_partitions; + rd_kafka_topic_partition_t *rktpar; + size_t partition_cnt; + const struct rd_kafka_partition_leader *leader; + size_t i; + static const struct rd_kafka_admin_worker_cbs cbs = { + rd_kafka_ListOffsetsRequest_admin, + rd_kafka_ListOffsetsResponse_parse, + }; + + rd_assert((rko_fanout->rko_type & ~RD_KAFKA_OP_FLAGMASK) == + RD_KAFKA_OP_ADMIN_FANOUT); + + if (err) { + rd_kafka_admin_result_fail( + rko_fanout, err, "Failed to query partition leaders: %s", + err == RD_KAFKA_RESP_ERR__NOENT ? 
"No leaders found" + : rd_kafka_err2str(err)); + rd_kafka_admin_common_worker_destroy(rk, rko_fanout, + rd_true /*destroy*/); + return RD_KAFKA_OP_RES_HANDLED; + } + + /* Create fanout results */ + topic_partitions = + rd_list_elem(&rko_fanout->rko_u.admin_request.args, 0); + partition_cnt = topic_partitions->cnt; + rd_list_init(&rko_fanout->rko_u.admin_request.fanout.results, + partition_cnt, + rd_kafka_ListOffsetsResultInfo_destroy_free); + + for (i = 0; i < partition_cnt; i++) { + rd_kafka_topic_partition_t *topic_partition = + &topic_partitions->elems[i]; + rd_kafka_ListOffsetsResultInfo_t *result_element = + rd_kafka_ListOffsetsResultInfo_new(topic_partition, -1); + rd_kafka_topic_partition_set_from_fetch_pos( + result_element->topic_partition, + RD_KAFKA_FETCH_POS(RD_KAFKA_OFFSET_INVALID, -1)); + result_element->topic_partition->err = + RD_KAFKA_RESP_ERR_NO_ERROR; + rd_list_add(&rko_fanout->rko_u.admin_request.fanout.results, + result_element); + } + + /* Set errors to corresponding result partitions */ + RD_KAFKA_TPLIST_FOREACH(rktpar, partitions) { + rd_kafka_ListOffsetsResultInfo_t *result_element; + if (!rktpar->err) + continue; + result_element = NULL; + for (i = 0; i < partition_cnt; i++) { + result_element = rd_list_elem( + &rko_fanout->rko_u.admin_request.fanout.results, i); + if (rd_kafka_topic_partition_cmp( + result_element->topic_partition, rktpar) == 0) + break; + } + result_element->topic_partition->err = rktpar->err; + } + + /* For each leader send a request for its partitions */ + rko_fanout->rko_u.admin_request.fanout.outstanding = + rd_list_cnt(leaders); + + RD_LIST_FOREACH(leader, leaders, i) { + rd_kafka_op_t *rko = rd_kafka_admin_request_op_new( + rk, RD_KAFKA_OP_LISTOFFSETS, + RD_KAFKA_EVENT_LISTOFFSETS_RESULT, &cbs, + &rko_fanout->rko_u.admin_request.options, rk->rk_ops); + + rko->rko_u.admin_request.fanout_parent = rko_fanout; + rko->rko_u.admin_request.broker_id = leader->rkb->rkb_nodeid; + + rd_kafka_topic_partition_list_sort_by_topic(leader->partitions); + rd_list_init(&rko->rko_u.admin_request.args, 1, + rd_kafka_topic_partition_list_destroy_free); + rd_list_add( + &rko->rko_u.admin_request.args, + rd_kafka_topic_partition_list_copy(leader->partitions)); + + /* Enqueue op for admin_worker() to transition to next state */ + rd_kafka_q_enq(rk->rk_ops, rko); + } + + return RD_KAFKA_OP_RES_HANDLED; +} /** * @brief Call when leaders have been queried to progress the DeleteRecords * admin op to its next phase, sending DeleteRecords to partition * leaders. - * - * @param rko Reply op (RD_KAFKA_OP_LEADERS). 
*/ static rd_kafka_op_res_t rd_kafka_DeleteRecords_leaders_queried_cb(rd_kafka_t *rk, @@ -3772,6 +4556,111 @@ void rd_kafka_DeleteRecords(rd_kafka_t *rk, } +void rd_kafka_ListOffsets(rd_kafka_t *rk, + rd_kafka_topic_partition_list_t *topic_partitions, + const rd_kafka_AdminOptions_t *options, + rd_kafka_queue_t *rkqu) { + int i; + rd_kafka_op_t *rko_fanout; + rd_kafka_topic_partition_list_t *copied_topic_partitions; + rd_list_t *topic_partitions_sorted = NULL; + + static const struct rd_kafka_admin_fanout_worker_cbs fanout_cbs = { + rd_kafka_ListOffsets_response_merge, + rd_kafka_ListOffsetsResultInfo_copy_opaque, + rd_kafka_topic_partition_list_copy_opaque}; + + rko_fanout = rd_kafka_admin_fanout_op_new( + rk, RD_KAFKA_OP_LISTOFFSETS, RD_KAFKA_EVENT_LISTOFFSETS_RESULT, + &fanout_cbs, options, rkqu->rkqu_q); + + rd_kafka_admin_request_op_result_cb_set( + rko_fanout, rd_kafka_ListOffsets_handle_result); + + if (topic_partitions->cnt) { + for (i = 0; i < topic_partitions->cnt; i++) { + if (!topic_partitions->elems[i].topic[0]) { + rd_kafka_admin_result_fail( + rko_fanout, RD_KAFKA_RESP_ERR__INVALID_ARG, + "Partition topic name at index %d must be " + "non-empty", + i); + goto err; + } + if (topic_partitions->elems[i].partition < 0) { + rd_kafka_admin_result_fail( + rko_fanout, RD_KAFKA_RESP_ERR__INVALID_ARG, + "Partition at index %d cannot be negative", + i); + goto err; + } + } + + + topic_partitions_sorted = + rd_list_new(topic_partitions->cnt, + rd_kafka_topic_partition_destroy_free); + for (i = 0; i < topic_partitions->cnt; i++) + rd_list_add(topic_partitions_sorted, + rd_kafka_topic_partition_copy( + &topic_partitions->elems[i])); + + rd_list_sort(topic_partitions_sorted, + rd_kafka_topic_partition_cmp); + if (rd_list_find_duplicate(topic_partitions_sorted, + rd_kafka_topic_partition_cmp)) { + + rd_kafka_admin_result_fail( + rko_fanout, RD_KAFKA_RESP_ERR__INVALID_ARG, + "Partitions must not contain duplicates"); + goto err; + } + } + + for (i = 0; i < topic_partitions->cnt; i++) { + rd_kafka_topic_partition_t *partition = + &topic_partitions->elems[i]; + if (partition->offset < RD_KAFKA_OFFSET_SPEC_MAX_TIMESTAMP) { + rd_kafka_admin_result_fail( + rko_fanout, RD_KAFKA_RESP_ERR__INVALID_ARG, + "Partition %d has an invalid offset %" PRId64, i, + partition->offset); + goto err; + } + } + + copied_topic_partitions = + rd_kafka_topic_partition_list_copy(topic_partitions); + rd_list_init(&rko_fanout->rko_u.admin_request.args, 1, + rd_kafka_topic_partition_list_destroy_free); + rd_list_add(&rko_fanout->rko_u.admin_request.args, + copied_topic_partitions); + + if (topic_partitions->cnt) { + /* Async query for partition leaders */ + rd_kafka_topic_partition_list_query_leaders_async( + rk, copied_topic_partitions, + rd_kafka_admin_timeout_remains(rko_fanout), + RD_KAFKA_REPLYQ(rk->rk_ops, 0), + rd_kafka_ListOffsets_leaders_queried_cb, rko_fanout); + } else { + /* Empty list */ + rd_kafka_op_t *rko_result = + rd_kafka_admin_result_new(rko_fanout); + /* Enqueue empty result on application queue, we're done. */ + rd_kafka_admin_result_enq(rko_fanout, rko_result); + rd_kafka_admin_common_worker_destroy(rk, rko_fanout, + rd_true /*destroy*/); + } + + RD_IF_FREE(topic_partitions_sorted, rd_list_destroy); + return; +err: + RD_IF_FREE(topic_partitions_sorted, rd_list_destroy); + rd_kafka_admin_common_worker_destroy(rk, rko_fanout, + rd_true /*destroy*/); +} + /** * @brief Get the list of offsets from a DeleteRecords result. 
* @@ -4157,8 +5046,13 @@ rd_kafka_OffsetDeleteResponse_parse(rd_kafka_op_t *rko_req, rd_kafka_buf_read_throttle_time(reply); + + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_ERR, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; partitions = rd_kafka_buf_read_topic_partitions( - reply, 16, rd_false /*no offset */, rd_true /*read error*/); + reply, rd_false /*don't use topic_id*/, rd_true, 16, fields); if (!partitions) { rd_snprintf(errstr, errstr_size, "Failed to parse OffsetDeleteResponse partitions"); @@ -4800,6 +5694,796 @@ void rd_kafka_DescribeAcls(rd_kafka_t *rk, rd_kafka_q_enq(rk->rk_ops, rko); } +struct rd_kafka_ScramCredentialInfo_s { + rd_kafka_ScramMechanism_t mechanism; + int32_t iterations; +}; + +rd_kafka_ScramMechanism_t rd_kafka_ScramCredentialInfo_mechanism( + const rd_kafka_ScramCredentialInfo_t *scram_credential_info) { + return scram_credential_info->mechanism; +} + +int32_t rd_kafka_ScramCredentialInfo_iterations( + const rd_kafka_ScramCredentialInfo_t *scram_credential_info) { + return scram_credential_info->iterations; +} + +struct rd_kafka_UserScramCredentialsDescription_s { + char *user; + rd_kafka_error_t *error; + size_t credential_info_cnt; + rd_kafka_ScramCredentialInfo_t *credential_infos; +}; + +rd_kafka_UserScramCredentialsDescription_t * +rd_kafka_UserScramCredentialsDescription_new(const char *username, + size_t num_credentials) { + rd_kafka_UserScramCredentialsDescription_t *description; + description = rd_calloc(1, sizeof(*description)); + description->user = rd_strdup(username); + description->error = NULL; + description->credential_info_cnt = num_credentials; + description->credential_infos = NULL; + if (num_credentials > 0) { + rd_kafka_ScramCredentialInfo_t *credentialinfo; + description->credential_infos = + rd_calloc(num_credentials, sizeof(*credentialinfo)); + } + return description; +} + +void rd_kafka_UserScramCredentialsDescription_destroy( + rd_kafka_UserScramCredentialsDescription_t *description) { + if (!description) + return; + rd_free(description->user); + rd_kafka_error_destroy(description->error); + if (description->credential_infos) + rd_free(description->credential_infos); + rd_free(description); +} + +void rd_kafka_UserScramCredentialsDescription_destroy_free(void *description) { + rd_kafka_UserScramCredentialsDescription_destroy(description); +} + +void rd_kafka_UserScramCredentailsDescription_set_error( + rd_kafka_UserScramCredentialsDescription_t *description, + rd_kafka_resp_err_t errorcode, + const char *err) { + rd_kafka_error_destroy(description->error); + description->error = rd_kafka_error_new(errorcode, "%s", err); +} + +const char *rd_kafka_UserScramCredentialsDescription_user( + const rd_kafka_UserScramCredentialsDescription_t *description) { + return description->user; +} + +const rd_kafka_error_t *rd_kafka_UserScramCredentialsDescription_error( + const rd_kafka_UserScramCredentialsDescription_t *description) { + return description->error; +} + +size_t rd_kafka_UserScramCredentialsDescription_scramcredentialinfo_count( + const rd_kafka_UserScramCredentialsDescription_t *description) { + return description->credential_info_cnt; +} + +const rd_kafka_ScramCredentialInfo_t * +rd_kafka_UserScramCredentialsDescription_scramcredentialinfo( + const rd_kafka_UserScramCredentialsDescription_t *description, + size_t idx) { + return &description->credential_infos[idx]; +} + +const rd_kafka_UserScramCredentialsDescription_t ** 
+rd_kafka_DescribeUserScramCredentials_result_descriptions( + const rd_kafka_DescribeUserScramCredentials_result_t *result, + size_t *cntp) { + *cntp = rd_list_cnt(&result->rko_u.admin_result.results); + return (const rd_kafka_UserScramCredentialsDescription_t **) + result->rko_u.admin_result.results.rl_elems; +} + +rd_kafka_resp_err_t +rd_kafka_DescribeUserScramCredentialsRequest(rd_kafka_broker_t *rkb, + const rd_list_t *userlist, + rd_kafka_AdminOptions_t *options, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { + rd_kafka_buf_t *rkbuf; + int16_t ApiVersion = 0; + int features; + size_t i; + size_t num_users; + + ApiVersion = rd_kafka_broker_ApiVersion_supported( + rkb, RD_KAFKAP_DescribeUserScramCredentials, 0, 0, &features); + if (ApiVersion == -1) { + rd_snprintf( + errstr, errstr_size, + "DescribeUserScramCredentials API (KIP-554) not supported " + "by broker"); + return RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE; + } + + num_users = rd_list_cnt(userlist); + + rkbuf = rd_kafka_buf_new_flexver_request( + rkb, RD_KAFKAP_DescribeUserScramCredentials, 1, num_users * 25, + rd_true); + /* #Users */ + rd_kafka_buf_write_arraycnt(rkbuf, num_users); + for (i = 0; i < num_users; i++) { + rd_kafkap_str_t *user = rd_list_elem(userlist, i); + /* Name */ + rd_kafka_buf_write_str(rkbuf, user->str, user->len); + rd_kafka_buf_write_tags_empty(rkbuf); + } + rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); + /* Last Tag buffer included automatically*/ + rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); + return RD_KAFKA_RESP_ERR_NO_ERROR; +} + +static rd_kafka_resp_err_t +rd_kafka_DescribeUserScramCredentialsResponse_parse(rd_kafka_op_t *rko_req, + rd_kafka_op_t **rko_resultp, + rd_kafka_buf_t *reply, + char *errstr, + size_t errstr_size) { + const int log_decode_errors = LOG_ERR; + rd_kafka_op_t *rko_result = NULL; + int32_t num_users; + int16_t ErrorCode; + rd_kafkap_str_t ErrorMessage = RD_KAFKAP_STR_INITIALIZER; + int32_t i; + + rko_result = rd_kafka_admin_result_new(rko_req); + + /* ThrottleTimeMs */ + rd_kafka_buf_read_throttle_time(reply); + + /* ErrorCode */ + rd_kafka_buf_read_i16(reply, &ErrorCode); + rko_result->rko_err = ErrorCode; /*Request Level Error Code */ + + /* ErrorMessage */ + rd_kafka_buf_read_str(reply, &ErrorMessage); + if (ErrorCode) { + if (RD_KAFKAP_STR_LEN(&ErrorMessage) == 0) + errstr = (char *)rd_kafka_err2str(ErrorCode); + else + RD_KAFKAP_STR_DUPA(&errstr, &ErrorMessage); + rko_result->rko_u.admin_result.errstr = + errstr; /* Request Level Error string*/ + } + + /* #Results */ + rd_kafka_buf_read_arraycnt(reply, &num_users, 10000); + rd_list_init(&rko_result->rko_u.admin_result.results, num_users, + rd_kafka_UserScramCredentialsDescription_destroy_free); + + for (i = 0; i < num_users; i++) { + rd_kafkap_str_t User; + int16_t ErrorCode; + rd_kafkap_str_t ErrorMessage = RD_KAFKAP_STR_INITIALIZER; + size_t itr; + /* User */ + rd_kafka_buf_read_str(reply, &User); + /* ErrorCode */ + rd_kafka_buf_read_i16(reply, &ErrorCode); + /* ErrorMessage */ + rd_kafka_buf_read_str(reply, &ErrorMessage); + + int32_t num_credentials; + /* #CredentialInfos */ + rd_kafka_buf_read_arraycnt(reply, &num_credentials, 10000); + rd_kafka_UserScramCredentialsDescription_t *description = + rd_kafka_UserScramCredentialsDescription_new( + User.str, num_credentials); + rd_kafka_UserScramCredentailsDescription_set_error( + description, ErrorCode, ErrorMessage.str); + for (itr = 0; itr < (size_t)num_credentials; itr++) { 
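+        /* Illustration (not librdkafka source): how the descriptions and
+         * credential infos parsed here surface through the public API.
+         * Assumes an existing rd_kafka_t handle rk; every call used is
+         * either defined in this file or declared in rdkafka.h.
+         *
+         *   const char *users[] = {"alice"};
+         *   rd_kafka_queue_t *rkqu = rd_kafka_queue_new(rk);
+         *   rd_kafka_DescribeUserScramCredentials(rk, users, 1, NULL, rkqu);
+         *   rd_kafka_event_t *rkev = rd_kafka_queue_poll(rkqu, 10 * 1000);
+         *   const rd_kafka_DescribeUserScramCredentials_result_t *res =
+         *       rd_kafka_event_DescribeUserScramCredentials_result(rkev);
+         *   if (res) {
+         *       size_t cnt, i, j;
+         *       const rd_kafka_UserScramCredentialsDescription_t **descs =
+         *           rd_kafka_DescribeUserScramCredentials_result_descriptions(
+         *               res, &cnt);
+         *       for (i = 0; i < cnt; i++) {
+         *           size_t n =
+         *               rd_kafka_UserScramCredentialsDescription_scramcredentialinfo_count(
+         *                   descs[i]);
+         *           for (j = 0; j < n; j++) {
+         *               const rd_kafka_ScramCredentialInfo_t *info =
+         *                   rd_kafka_UserScramCredentialsDescription_scramcredentialinfo(
+         *                       descs[i], j);
+         *               printf("%s: mechanism %d, %d iterations\n",
+         *                      rd_kafka_UserScramCredentialsDescription_user(
+         *                          descs[i]),
+         *                      rd_kafka_ScramCredentialInfo_mechanism(info),
+         *                      rd_kafka_ScramCredentialInfo_iterations(info));
+         *           }
+         *       }
+         *   }
+         *   if (rkev)
+         *       rd_kafka_event_destroy(rkev);
+         *   rd_kafka_queue_destroy(rkqu); */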
+ int8_t Mechanism; + int32_t Iterations; + /* Mechanism */ + rd_kafka_buf_read_i8(reply, &Mechanism); + /* Iterations */ + rd_kafka_buf_read_i32(reply, &Iterations); + rd_kafka_buf_skip_tags(reply); + rd_kafka_ScramCredentialInfo_t *scram_credential = + &description->credential_infos[itr]; + scram_credential->mechanism = Mechanism; + scram_credential->iterations = Iterations; + } + rd_kafka_buf_skip_tags(reply); + rd_list_add(&rko_result->rko_u.admin_result.results, + description); + } + *rko_resultp = rko_result; + + return RD_KAFKA_RESP_ERR_NO_ERROR; + +err_parse: + if (rko_result) + rd_kafka_op_destroy(rko_result); + + rd_snprintf( + errstr, errstr_size, + "DescribeUserScramCredentials response protocol parse failure: %s", + rd_kafka_err2str(reply->rkbuf_err)); + + return reply->rkbuf_err; +} + +void rd_kafka_DescribeUserScramCredentials( + rd_kafka_t *rk, + const char **users, + size_t user_cnt, + const rd_kafka_AdminOptions_t *options, + rd_kafka_queue_t *rkqu) { + + rd_kafka_op_t *rko; + size_t i; + rd_list_t *userlist = NULL; + + static const struct rd_kafka_admin_worker_cbs cbs = { + rd_kafka_DescribeUserScramCredentialsRequest, + rd_kafka_DescribeUserScramCredentialsResponse_parse, + }; + + rko = rd_kafka_admin_request_op_new( + rk, RD_KAFKA_OP_DESCRIBEUSERSCRAMCREDENTIALS, + RD_KAFKA_EVENT_DESCRIBEUSERSCRAMCREDENTIALS_RESULT, &cbs, options, + rkqu->rkqu_q); + + /* Check empty strings */ + for (i = 0; i < user_cnt; i++) { + if (!*users[i]) { + rd_kafka_admin_result_fail( + rko, RD_KAFKA_RESP_ERR__INVALID_ARG, + "Empty users aren't allowed, " + "index %" PRIusz, + i); + goto err; + } + } + + /* Check Duplicates */ + if (user_cnt > 1) { + userlist = rd_list_new(user_cnt, rd_free); + for (i = 0; i < user_cnt; i++) { + rd_list_add(userlist, rd_strdup(users[i])); + } + rd_list_sort(userlist, rd_strcmp2); + if (rd_list_find_duplicate(userlist, rd_strcmp2)) { + rd_kafka_admin_result_fail( + rko, RD_KAFKA_RESP_ERR__INVALID_ARG, + "Duplicate users aren't allowed " + "in the same request"); + goto err; + } + rd_list_destroy(userlist); + } + + rd_list_init(&rko->rko_u.admin_request.args, user_cnt, rd_free); + for (i = 0; i < user_cnt; i++) { + rd_list_add(&rko->rko_u.admin_request.args, + rd_kafkap_str_new(users[i], -1)); + } + rd_kafka_q_enq(rk->rk_ops, rko); + return; +err: + RD_IF_FREE(userlist, rd_list_destroy); + rd_kafka_admin_common_worker_destroy(rk, rko, rd_true /*destroy*/); +} + +/** + * @enum rd_kafka_UserScramCredentialAlteration_type_t + * @brief Types of user SCRAM alterations. 
+ */ +typedef enum rd_kafka_UserScramCredentialAlteration_type_s { + RD_KAFKA_USER_SCRAM_CREDENTIAL_ALTERATION_TYPE_UPSERT = 0, + RD_KAFKA_USER_SCRAM_CREDENTIAL_ALTERATION_TYPE_DELETE = 1, + RD_KAFKA_USER_SCRAM_CREDENTIAL_ALTERATION_TYPE__CNT +} rd_kafka_UserScramCredentialAlteration_type_t; + +struct rd_kafka_UserScramCredentialAlteration_s { + char *user; + rd_kafka_UserScramCredentialAlteration_type_t alteration_type; + union { + struct { + rd_kafka_ScramCredentialInfo_t credential_info; + rd_kafkap_bytes_t *salt; + rd_kafkap_bytes_t *password; + } upsertion; + struct { + rd_kafka_ScramMechanism_t mechanism; + } deletion; + } alteration; +}; + +rd_kafka_UserScramCredentialAlteration_t * +rd_kafka_UserScramCredentialUpsertion_new(const char *username, + rd_kafka_ScramMechanism_t mechanism, + int32_t iterations, + const unsigned char *password, + size_t password_size, + const unsigned char *salt, + size_t salt_size) { + rd_kafka_UserScramCredentialAlteration_t *alteration; + alteration = rd_calloc(1, sizeof(*alteration)); + alteration->user = rd_strdup(username); + alteration->alteration_type = + RD_KAFKA_USER_SCRAM_CREDENTIAL_ALTERATION_TYPE_UPSERT; + alteration->alteration.upsertion.credential_info.mechanism = mechanism; + alteration->alteration.upsertion.credential_info.iterations = + iterations; + + alteration->alteration.upsertion.password = + rd_kafkap_bytes_new(password, password_size); + if (salt_size != 0) { + alteration->alteration.upsertion.salt = + rd_kafkap_bytes_new(salt, salt_size); + } else { +#if WITH_SSL && OPENSSL_VERSION_NUMBER >= 0x10101000L + unsigned char random_salt[64]; + if (RAND_priv_bytes(random_salt, sizeof(random_salt)) == 1) { + alteration->alteration.upsertion.salt = + rd_kafkap_bytes_new(random_salt, + sizeof(random_salt)); + } +#endif + } + return alteration; +} + +rd_kafka_UserScramCredentialAlteration_t * +rd_kafka_UserScramCredentialDeletion_new(const char *username, + rd_kafka_ScramMechanism_t mechanism) { + rd_kafka_UserScramCredentialAlteration_t *alteration; + alteration = rd_calloc(1, sizeof(*alteration)); + alteration->user = rd_strdup(username); + alteration->alteration_type = + RD_KAFKA_USER_SCRAM_CREDENTIAL_ALTERATION_TYPE_DELETE; + alteration->alteration.deletion.mechanism = mechanism; + return alteration; +} + +void rd_kafka_UserScramCredentialAlteration_destroy( + rd_kafka_UserScramCredentialAlteration_t *alteration) { + if (!alteration) + return; + rd_free(alteration->user); + if (alteration->alteration_type == + RD_KAFKA_USER_SCRAM_CREDENTIAL_ALTERATION_TYPE_UPSERT) { + rd_kafkap_bytes_destroy(alteration->alteration.upsertion.salt); + rd_kafkap_bytes_destroy( + alteration->alteration.upsertion.password); + } + rd_free(alteration); +} + +void rd_kafka_UserScramCredentialAlteration_destroy_free(void *alteration) { + rd_kafka_UserScramCredentialAlteration_destroy(alteration); +} + +void rd_kafka_UserScramCredentialAlteration_destroy_array( + rd_kafka_UserScramCredentialAlteration_t **alterations, + size_t alteration_cnt) { + size_t i; + for (i = 0; i < alteration_cnt; i++) + rd_kafka_UserScramCredentialAlteration_destroy(alterations[i]); +} + +static rd_kafka_UserScramCredentialAlteration_t * +rd_kafka_UserScramCredentialAlteration_copy( + const rd_kafka_UserScramCredentialAlteration_t *alteration) { + rd_kafka_UserScramCredentialAlteration_t *copied_alteration = + rd_calloc(1, sizeof(*alteration)); + copied_alteration->user = rd_strdup(alteration->user); + copied_alteration->alteration_type = alteration->alteration_type; + + if 
(alteration->alteration_type == + RD_KAFKA_USER_SCRAM_CREDENTIAL_ALTERATION_TYPE_UPSERT /*Upsert*/) { + copied_alteration->alteration.upsertion.salt = + rd_kafkap_bytes_copy(alteration->alteration.upsertion.salt); + copied_alteration->alteration.upsertion.password = + rd_kafkap_bytes_copy( + alteration->alteration.upsertion.password); + copied_alteration->alteration.upsertion.credential_info + .mechanism = + alteration->alteration.upsertion.credential_info.mechanism; + copied_alteration->alteration.upsertion.credential_info + .iterations = + alteration->alteration.upsertion.credential_info.iterations; + } else if ( + alteration->alteration_type == + RD_KAFKA_USER_SCRAM_CREDENTIAL_ALTERATION_TYPE_DELETE /*Delete*/) { + copied_alteration->alteration.deletion.mechanism = + alteration->alteration.deletion.mechanism; + } + + return copied_alteration; +} + +struct rd_kafka_AlterUserScramCredentials_result_response_s { + char *user; + rd_kafka_error_t *error; +}; + +rd_kafka_AlterUserScramCredentials_result_response_t * +rd_kafka_AlterUserScramCredentials_result_response_new(const char *username) { + rd_kafka_AlterUserScramCredentials_result_response_t *response; + response = rd_calloc(1, sizeof(*response)); + response->user = rd_strdup(username); + response->error = NULL; + return response; +} + +void rd_kafka_AlterUserScramCredentials_result_response_destroy( + rd_kafka_AlterUserScramCredentials_result_response_t *response) { + if (response->user) + rd_free(response->user); + rd_kafka_error_destroy(response->error); + rd_free(response); +} + +void rd_kafka_AlterUserScramCredentials_result_response_destroy_free( + void *response) { + rd_kafka_AlterUserScramCredentials_result_response_destroy(response); +} + +void rd_kafka_AlterUserScramCredentials_result_response_set_error( + rd_kafka_AlterUserScramCredentials_result_response_t *response, + rd_kafka_resp_err_t errorcode, + const char *errstr) { + rd_kafka_error_destroy(response->error); + response->error = rd_kafka_error_new(errorcode, "%s", errstr); +} + +const char *rd_kafka_AlterUserScramCredentials_result_response_user( + const rd_kafka_AlterUserScramCredentials_result_response_t *response) { + return response->user; +} + +const rd_kafka_error_t * +rd_kafka_AlterUserScramCredentials_result_response_error( + const rd_kafka_AlterUserScramCredentials_result_response_t *response) { + return response->error; +} + +const rd_kafka_AlterUserScramCredentials_result_response_t ** +rd_kafka_AlterUserScramCredentials_result_responses( + const rd_kafka_AlterUserScramCredentials_result_t *result, + size_t *cntp) { + *cntp = rd_list_cnt(&result->rko_u.admin_result.results); + return (const rd_kafka_AlterUserScramCredentials_result_response_t **) + result->rko_u.admin_result.results.rl_elems; +} + + +#if WITH_SSL +static rd_kafkap_bytes_t * +rd_kafka_AlterUserScramCredentialsRequest_salted_password( + rd_kafka_broker_t *rkb, + rd_kafkap_bytes_t *salt, + rd_kafkap_bytes_t *password, + rd_kafka_ScramMechanism_t mechanism, + int32_t iterations) { + rd_chariov_t saltedpassword_chariov = {.ptr = + rd_alloca(EVP_MAX_MD_SIZE)}; + + rd_chariov_t salt_chariov; + salt_chariov.ptr = (char *)salt->data; + salt_chariov.size = RD_KAFKAP_BYTES_LEN(salt); + + rd_chariov_t password_chariov; + password_chariov.ptr = (char *)password->data; + password_chariov.size = RD_KAFKAP_BYTES_LEN(password); + + const EVP_MD *evp = NULL; + if (mechanism == RD_KAFKA_SCRAM_MECHANISM_SHA_256) + evp = EVP_sha256(); + else if (mechanism == RD_KAFKA_SCRAM_MECHANISM_SHA_512) + evp = 
EVP_sha512(); + rd_assert(evp != NULL); + + rd_kafka_ssl_hmac(rkb, evp, &password_chariov, &salt_chariov, + iterations, &saltedpassword_chariov); + + return rd_kafkap_bytes_new( + (const unsigned char *)saltedpassword_chariov.ptr, + saltedpassword_chariov.size); +} +#endif + +rd_kafka_resp_err_t rd_kafka_AlterUserScramCredentialsRequest( + rd_kafka_broker_t *rkb, + const rd_list_t *user_scram_credential_alterations, + rd_kafka_AdminOptions_t *options, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { + + rd_kafka_buf_t *rkbuf; + int16_t ApiVersion = 0; + int features; + size_t num_deletions = 0; + size_t i; + size_t num_alterations; + size_t of_deletions; + ApiVersion = rd_kafka_broker_ApiVersion_supported( + rkb, RD_KAFKAP_AlterUserScramCredentials, 0, 0, &features); + if (ApiVersion == -1) { + rd_snprintf( + errstr, errstr_size, + "AlterUserScramCredentials API (KIP-554) not supported " + "by broker"); + return RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE; + } + + num_alterations = rd_list_cnt(user_scram_credential_alterations); + + rkbuf = rd_kafka_buf_new_flexver_request( + rkb, RD_KAFKAP_AlterUserScramCredentials, 1, num_alterations * 100, + rd_true); + + /* Deletion scram requests */ + + /* #Deletions */ + of_deletions = rd_kafka_buf_write_arraycnt_pos(rkbuf); + + for (i = 0; i < num_alterations; i++) { + rd_kafka_UserScramCredentialAlteration_t *alteration = + rd_list_elem(user_scram_credential_alterations, i); + if (alteration->alteration_type != + RD_KAFKA_USER_SCRAM_CREDENTIAL_ALTERATION_TYPE_DELETE) + continue; + + num_deletions++; + /* Name */ + rd_kafka_buf_write_str(rkbuf, alteration->user, + strlen(alteration->user)); + /* Mechanism */ + rd_kafka_buf_write_i8( + rkbuf, alteration->alteration.deletion.mechanism); + rd_kafka_buf_write_tags_empty(rkbuf); + } + rd_kafka_buf_finalize_arraycnt(rkbuf, of_deletions, num_deletions); + + /* Upsertion scram requests */ + + /* #Upsertions */ + rd_kafka_buf_write_arraycnt(rkbuf, num_alterations - num_deletions); + for (i = 0; i < num_alterations; i++) { + rd_kafka_UserScramCredentialAlteration_t *alteration = + rd_list_elem(user_scram_credential_alterations, i); + if (alteration->alteration_type != + RD_KAFKA_USER_SCRAM_CREDENTIAL_ALTERATION_TYPE_UPSERT) + continue; + +#if !WITH_SSL + rd_assert(!*"OpenSSL is required for upsertions"); +#else + char *user = alteration->user; + size_t usersize = strlen(user); + rd_kafka_ScramMechanism_t mechanism = + alteration->alteration.upsertion.credential_info.mechanism; + int32_t iterations = + alteration->alteration.upsertion.credential_info.iterations; + /* Name */ + rd_kafka_buf_write_str(rkbuf, user, usersize); + + /* Mechanism */ + rd_kafka_buf_write_i8(rkbuf, mechanism); + + /* Iterations */ + rd_kafka_buf_write_i32(rkbuf, iterations); + + /* Salt */ + rd_kafka_buf_write_kbytes( + rkbuf, alteration->alteration.upsertion.salt); + + rd_kafkap_bytes_t *password_bytes = + rd_kafka_AlterUserScramCredentialsRequest_salted_password( + rkb, alteration->alteration.upsertion.salt, + alteration->alteration.upsertion.password, mechanism, + iterations); + + /* SaltedPassword */ + rd_kafka_buf_write_kbytes(rkbuf, password_bytes); + rd_kafkap_bytes_destroy(password_bytes); + rd_kafka_buf_write_tags_empty(rkbuf); +#endif + } + + rd_kafka_buf_write_tags_empty(rkbuf); + rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); + rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); + return RD_KAFKA_RESP_ERR_NO_ERROR; +} +
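+/**
+ * Usage sketch (illustration only, not librdkafka source) for the KIP-554
+ * alteration API implemented above. Assumes an existing rd_kafka_t handle
+ * rk built with SSL support, and that
+ * rd_kafka_event_AlterUserScramCredentials_result() is available as
+ * declared in rdkafka.h.
+ * @code
+ * rd_kafka_UserScramCredentialAlteration_t *alts[2];
+ * alts[0] = rd_kafka_UserScramCredentialUpsertion_new(
+ *     "alice", RD_KAFKA_SCRAM_MECHANISM_SHA_256, 8192,
+ *     (const unsigned char *)"secret", 6,
+ *     NULL, 0); // zero salt size: a random salt is generated
+ * alts[1] = rd_kafka_UserScramCredentialDeletion_new(
+ *     "bob", RD_KAFKA_SCRAM_MECHANISM_SHA_512);
+ * rd_kafka_queue_t *rkqu = rd_kafka_queue_new(rk);
+ * rd_kafka_AlterUserScramCredentials(rk, alts, 2, NULL, rkqu);
+ * rd_kafka_event_t *rkev = rd_kafka_queue_poll(rkqu, 10 * 1000);
+ * const rd_kafka_AlterUserScramCredentials_result_t *res =
+ *     rd_kafka_event_AlterUserScramCredentials_result(rkev);
+ * if (res) {
+ *     size_t cnt, i;
+ *     const rd_kafka_AlterUserScramCredentials_result_response_t **rs =
+ *         rd_kafka_AlterUserScramCredentials_result_responses(res, &cnt);
+ *     for (i = 0; i < cnt; i++) {
+ *         const rd_kafka_error_t *err =
+ *             rd_kafka_AlterUserScramCredentials_result_response_error(
+ *                 rs[i]);
+ *         printf("%s: %s\n",
+ *                rd_kafka_AlterUserScramCredentials_result_response_user(
+ *                    rs[i]),
+ *                (err && rd_kafka_error_code(err))
+ *                    ? rd_kafka_error_string(err)
+ *                    : "ok");
+ *     }
+ * }
+ * if (rkev)
+ *     rd_kafka_event_destroy(rkev);
+ * rd_kafka_UserScramCredentialAlteration_destroy_array(alts, 2);
+ * rd_kafka_queue_destroy(rkqu);
+ * @endcode
+ */
+rd_kafka_resp_err_t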
+rd_kafka_AlterUserScramCredentialsResponse_parse(rd_kafka_op_t *rko_req, + rd_kafka_op_t **rko_resultp, + rd_kafka_buf_t *reply, + char *errstr, + size_t errstr_size) { + const int log_decode_errors = LOG_ERR; + rd_kafka_op_t *rko_result = NULL; + int32_t num_results; + int32_t i; + + rko_result = rd_kafka_admin_result_new(rko_req); + + /* ThrottleTimeMs */ + rd_kafka_buf_read_throttle_time(reply); + + /* #Results */ + rd_kafka_buf_read_arraycnt(reply, &num_results, 10000); + + rd_list_init( + &rko_result->rko_u.admin_result.results, num_results, + rd_kafka_AlterUserScramCredentials_result_response_destroy_free); + for (i = 0; i < num_results; i++) { + rd_kafkap_str_t User; + int16_t ErrorCode; + rd_kafkap_str_t ErrorMessage = RD_KAFKAP_STR_INITIALIZER; + + /* User */ + rd_kafka_buf_read_str(reply, &User); + + /* ErrorCode */ + rd_kafka_buf_read_i16(reply, &ErrorCode); + + /* ErrorMessage */ + rd_kafka_buf_read_str(reply, &ErrorMessage); + + rd_kafka_buf_skip_tags(reply); + + rd_kafka_AlterUserScramCredentials_result_response_t *response = + rd_kafka_AlterUserScramCredentials_result_response_new( + User.str); + rd_kafka_AlterUserScramCredentials_result_response_set_error( + response, ErrorCode, ErrorMessage.str); + rd_list_add(&rko_result->rko_u.admin_result.results, response); + } + *rko_resultp = rko_result; + + return RD_KAFKA_RESP_ERR_NO_ERROR; + +err_parse: + if (rko_result) + rd_kafka_op_destroy(rko_result); + + rd_snprintf( + errstr, errstr_size, + "AlterUserScramCredentials response protocol parse failure: %s", + rd_kafka_err2str(reply->rkbuf_err)); + + return reply->rkbuf_err; +} + +void rd_kafka_AlterUserScramCredentials( + rd_kafka_t *rk, + rd_kafka_UserScramCredentialAlteration_t **alterations, + size_t alteration_cnt, + const rd_kafka_AdminOptions_t *options, + rd_kafka_queue_t *rkqu) { + + rd_kafka_op_t *rko; + size_t i; + + static const struct rd_kafka_admin_worker_cbs cbs = { + rd_kafka_AlterUserScramCredentialsRequest, + rd_kafka_AlterUserScramCredentialsResponse_parse, + }; + + rko = rd_kafka_admin_request_op_new( + rk, RD_KAFKA_OP_ALTERUSERSCRAMCREDENTIALS, + RD_KAFKA_EVENT_ALTERUSERSCRAMCREDENTIALS_RESULT, &cbs, options, + rkqu->rkqu_q); + + if (alteration_cnt > 0) { + const char *errstr = NULL; + for (i = 0; i < alteration_cnt; i++) { + rd_bool_t is_upsert = + alterations[i]->alteration_type == + RD_KAFKA_USER_SCRAM_CREDENTIAL_ALTERATION_TYPE_UPSERT; + rd_bool_t is_delete = + alterations[i]->alteration_type == + RD_KAFKA_USER_SCRAM_CREDENTIAL_ALTERATION_TYPE_DELETE; + + if ((is_upsert || is_delete) && + alterations[i] + ->alteration.upsertion.credential_info + .mechanism == + RD_KAFKA_SCRAM_MECHANISM_UNKNOWN) { + errstr = + "SCRAM mechanism must be specified at " + "index %" PRIusz; + break; + } + + + if (!alterations[i]->user || !*alterations[i]->user) { + errstr = "Empty user at index %" PRIusz; + break; + } + + if (is_upsert) { +#if !WITH_SSL + errstr = + "OpenSSL required for upsertion at index " + "%" PRIusz; + break; +#endif + if (RD_KAFKAP_BYTES_LEN( + alterations[i] + ->alteration.upsertion.password) == + 0) { + errstr = + "Empty password at index %" PRIusz; + break; + } + + if (!alterations[i] + ->alteration.upsertion.salt || + RD_KAFKAP_BYTES_LEN( + alterations[i] + ->alteration.upsertion.salt) == 0) { + errstr = "Empty salt at index %" PRIusz; + break; + } + + if (alterations[i] + ->alteration.upsertion.credential_info + .iterations <= 0) { + errstr = + "Non-positive iterations at index " + "%" PRIusz; + break; + } + } + } + + if (errstr) { + 
rd_kafka_admin_result_fail( + rko, RD_KAFKA_RESP_ERR__INVALID_ARG, errstr, i); + rd_kafka_admin_common_worker_destroy( + rk, rko, rd_true /*destroy*/); + return; + } + } else { + rd_kafka_admin_result_fail( + rko, RD_KAFKA_RESP_ERR__INVALID_ARG, + "At least one alteration is required"); + rd_kafka_admin_common_worker_destroy(rk, rko, + rd_true /*destroy*/); + return; + } + + rd_list_init(&rko->rko_u.admin_request.args, alteration_cnt, + rd_kafka_UserScramCredentialAlteration_destroy_free); + + for (i = 0; i < alteration_cnt; i++) { + rd_list_add(&rko->rko_u.admin_request.args, + rd_kafka_UserScramCredentialAlteration_copy( + alterations[i])); + } + rd_kafka_q_enq(rk->rk_ops, rko); + return; +} + /** * @brief Get an array of rd_kafka_AclBinding_t from a DescribeAcls result. * @@ -4932,7 +6616,7 @@ rd_kafka_DeleteAclsResponse_parse(rd_kafka_op_t *rko_req, result_response = rd_kafka_DeleteAcls_result_response_new(error_code, errstr); - /* #maching_acls */ + /* #matching_acls */ rd_kafka_buf_read_arraycnt(reply, &matching_acls_cnt, 100000); for (j = 0; j < (int)matching_acls_cnt; j++) { int16_t acl_error_code; @@ -5297,7 +6981,6 @@ fail: rd_kafka_admin_common_worker_destroy(rk, rko, rd_true /*destroy*/); } - /** * @brief Get an array of group results from a AlterGroups result. * @@ -5404,8 +7087,8 @@ static rd_kafka_resp_err_t rd_kafka_ListConsumerGroupOffsetsRequest( require_stable_offsets = rd_kafka_confval_get_int(&options->require_stable_offsets); rd_kafka_OffsetFetchRequest( - rkb, grpoffsets->group_id, grpoffsets->partitions, - require_stable_offsets, op_timeout, replyq, resp_cb, opaque); + rkb, grpoffsets->group_id, grpoffsets->partitions, rd_false, -1, + NULL, require_stable_offsets, op_timeout, replyq, resp_cb, opaque); return RD_KAFKA_RESP_ERR_NO_ERROR; } @@ -5574,12 +7257,14 @@ const rd_kafka_group_result_t **rd_kafka_ListConsumerGroupOffsets_result_groups( static rd_kafka_ConsumerGroupListing_t * rd_kafka_ConsumerGroupListing_new(const char *group_id, rd_bool_t is_simple_consumer_group, - rd_kafka_consumer_group_state_t state) { + rd_kafka_consumer_group_state_t state, + rd_kafka_consumer_group_type_t type) { rd_kafka_ConsumerGroupListing_t *grplist; grplist = rd_calloc(1, sizeof(*grplist)); grplist->group_id = rd_strdup(group_id); grplist->is_simple_consumer_group = is_simple_consumer_group; grplist->state = state; + grplist->type = type; return grplist; } @@ -5593,7 +7278,7 @@ static rd_kafka_ConsumerGroupListing_t *rd_kafka_ConsumerGroupListing_copy( const rd_kafka_ConsumerGroupListing_t *grplist) { return rd_kafka_ConsumerGroupListing_new( grplist->group_id, grplist->is_simple_consumer_group, - grplist->state); + grplist->state, grplist->type); } /** @@ -5630,6 +7315,11 @@ rd_kafka_consumer_group_state_t rd_kafka_ConsumerGroupListing_state( return grplist->state; } +rd_kafka_consumer_group_type_t rd_kafka_ConsumerGroupListing_type( + const rd_kafka_ConsumerGroupListing_t *grplist) { + return grplist->type; +} + /** * @brief Create a new ListConsumerGroupsResult object. 
* @@ -5699,11 +7389,16 @@ rd_kafka_admin_ListConsumerGroupsRequest(rd_kafka_broker_t *rkb, rd_kafka_resp_err_t err; rd_kafka_error_t *error; const char **states_str = NULL; + const char **types_str = NULL; int states_str_cnt = 0; rd_list_t *states = rd_kafka_confval_get_ptr(&options->match_consumer_group_states); + int types_str_cnt = 0; + rd_list_t *types = + rd_kafka_confval_get_ptr(&options->match_consumer_group_types); - /* Prepare list_options */ + + /* Prepare list_options for consumer group state */ if (states && rd_list_cnt(states) > 0) { states_str_cnt = rd_list_cnt(states); states_str = rd_calloc(states_str_cnt, sizeof(*states_str)); @@ -5713,13 +7408,27 @@ rd_kafka_admin_ListConsumerGroupsRequest(rd_kafka_broker_t *rkb, } } + /* Prepare list_options for consumer group type */ + if (types && rd_list_cnt(types) > 0) { + types_str_cnt = rd_list_cnt(types); + types_str = rd_calloc(types_str_cnt, sizeof(*types_str)); + for (i = 0; i < types_str_cnt; i++) { + types_str[i] = rd_kafka_consumer_group_type_name( + rd_list_get_int32(types, i)); + } + } error = rd_kafka_ListGroupsRequest(rkb, -1, states_str, states_str_cnt, - replyq, resp_cb, opaque); + types_str, types_str_cnt, replyq, + resp_cb, opaque); if (states_str) { rd_free(states_str); } + if (types_str) { + rd_free(types_str); + } + if (error) { rd_snprintf(errstr, errstr_size, "%s", rd_kafka_error_string(error)); @@ -5748,7 +7457,8 @@ rd_kafka_ListConsumerGroupsResponse_parse(rd_kafka_op_t *rko_req, rd_kafka_broker_t *rkb = reply->rkbuf_rkb; rd_list_t valid, errors; rd_kafka_ListConsumerGroupsResult_t *list_result; - char *group_id = NULL, *group_state = NULL, *proto_type = NULL; + char *group_id = NULL, *group_state = NULL, *proto_type = NULL, + *group_type_str = NULL; api_version = rd_kafka_buf_ApiVersion(reply); if (api_version >= 1) { @@ -5776,17 +7486,22 @@ rd_kafka_ListConsumerGroupsResponse_parse(rd_kafka_op_t *rko_req, for (i = 0; i < cnt; i++) { rd_kafkap_str_t GroupId, ProtocolType, - GroupState = RD_ZERO_INIT; + GroupState = RD_ZERO_INIT, GroupType = RD_ZERO_INIT; rd_kafka_ConsumerGroupListing_t *group_listing; rd_bool_t is_simple_consumer_group, is_consumer_protocol_type; rd_kafka_consumer_group_state_t state = RD_KAFKA_CONSUMER_GROUP_STATE_UNKNOWN; + rd_kafka_consumer_group_type_t type = + RD_KAFKA_CONSUMER_GROUP_TYPE_UNKNOWN; rd_kafka_buf_read_str(reply, &GroupId); rd_kafka_buf_read_str(reply, &ProtocolType); if (api_version >= 4) { rd_kafka_buf_read_str(reply, &GroupState); } + if (api_version >= 5) { + rd_kafka_buf_read_str(reply, &GroupType); + } rd_kafka_buf_skip_tags(reply); group_id = RD_KAFKAP_STR_DUP(&GroupId); @@ -5796,21 +7511,29 @@ rd_kafka_ListConsumerGroupsResponse_parse(rd_kafka_op_t *rko_req, state = rd_kafka_consumer_group_state_code(group_state); } + if (api_version >= 5) { + group_type_str = RD_KAFKAP_STR_DUP(&GroupType); + type = + rd_kafka_consumer_group_type_code(group_type_str); + } + is_simple_consumer_group = *proto_type == '\0'; is_consumer_protocol_type = !strcmp(proto_type, CONSUMER_PROTOCOL_TYPE); if (is_simple_consumer_group || is_consumer_protocol_type) { group_listing = rd_kafka_ConsumerGroupListing_new( - group_id, is_simple_consumer_group, state); + group_id, is_simple_consumer_group, state, type); rd_list_add(&valid, group_listing); } rd_free(group_id); rd_free(group_state); rd_free(proto_type); - group_id = NULL; - group_state = NULL; - proto_type = NULL; + rd_free(group_type_str); + group_id = NULL; + group_state = NULL; + proto_type = NULL; + group_type_str = NULL; } 
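+        /* Illustration (not librdkafka source): filtering listings by the
+         * new group type from the application, assuming an rd_kafka_t
+         * handle rk and that
+         * rd_kafka_AdminOptions_set_match_consumer_group_types() is the
+         * public setter backing the match_consumer_group_types confval
+         * used above:
+         *
+         *   rd_kafka_consumer_group_type_t types[] = {
+         *       RD_KAFKA_CONSUMER_GROUP_TYPE_CONSUMER};
+         *   rd_kafka_AdminOptions_t *options = rd_kafka_AdminOptions_new(
+         *       rk, RD_KAFKA_ADMIN_OP_LISTCONSUMERGROUPS);
+         *   rd_kafka_AdminOptions_set_match_consumer_group_types(
+         *       options, types, 1);
+         *   rd_kafka_queue_t *rkqu = rd_kafka_queue_new(rk);
+         *   rd_kafka_ListConsumerGroups(rk, options, rkqu);
+         *   rd_kafka_event_t *rkev = rd_kafka_queue_poll(rkqu, 10 * 1000);
+         *   const rd_kafka_ListConsumerGroups_result_t *res =
+         *       rd_kafka_event_ListConsumerGroups_result(rkev);
+         *   if (res) {
+         *       size_t cnt, i;
+         *       const rd_kafka_ConsumerGroupListing_t **groups =
+         *           rd_kafka_ListConsumerGroups_result_valid(res, &cnt);
+         *       for (i = 0; i < cnt; i++)
+         *           printf("%s type=%d\n",
+         *                  rd_kafka_ConsumerGroupListing_group_id(
+         *                      groups[i]),
+         *                  rd_kafka_ConsumerGroupListing_type(groups[i]));
+         *   }
+         *   if (rkev)
+         *       rd_kafka_event_destroy(rkev);
+         *   rd_kafka_AdminOptions_destroy(options);
+         *   rd_kafka_queue_destroy(rkqu);
+         *
+         * Brokers that predate ListGroups v5 report
+         * RD_KAFKA_CONSUMER_GROUP_TYPE_UNKNOWN. */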
rd_kafka_buf_skip_tags(reply); @@ -5821,14 +7544,16 @@ err_parse: rd_free(group_state); if (proto_type) rd_free(proto_type); + if (group_type_str) + rd_free(group_type_str); if (reply->rkbuf_err) { error_code = reply->rkbuf_err; error = rd_kafka_error_new( error_code, "Broker [%d" - "] " - "ListConsumerGroups response protocol parse failure: %s", + "] " + "ListConsumerGroups response protocol parse failure: %s", rd_kafka_broker_id(rkb), rd_kafka_err2str(error_code)); rd_list_add(&errors, error); } @@ -5972,6 +7697,89 @@ const rd_kafka_error_t **rd_kafka_ListConsumerGroups_result_errors( * */ +/** + * @brief Parse authorized_operations returned in + * - DescribeConsumerGroups + * - DescribeTopics + * - DescribeCluster + * + * @param authorized_operations returned by RPC, containing operations encoded + * per-bit. + * @param cntp is set to the count of the operations, or -1 if the operations + * were not requested. + * @returns rd_kafka_AclOperation_t *. May be NULL. + */ +static rd_kafka_AclOperation_t * +rd_kafka_AuthorizedOperations_parse(int32_t authorized_operations, int *cntp) { + rd_kafka_AclOperation_t i; + int j = 0; + int count = 0; + rd_kafka_AclOperation_t *operations = NULL; + + /* In case of authorized_operations not requested, return NULL. */ + if (authorized_operations < 0) { + *cntp = -1; + return NULL; + } + + /* Count number of bits set. ALL, ANY and UNKNOWN bits are skipped as + * they are always unset as per KIP-430. */ + for (i = RD_KAFKA_ACL_OPERATION_READ; i < RD_KAFKA_ACL_OPERATION__CNT; + i++) + count += ((authorized_operations >> i) & 1); + *cntp = count; + + /* In case no operations exist, allocate 1 byte so that the returned + * pointer is non-NULL. A NULL pointer implies that authorized + * operations were not requested. */ + if (count == 0) + return rd_malloc(1); + + operations = rd_malloc(sizeof(rd_kafka_AclOperation_t) * count); + j = 0; + for (i = RD_KAFKA_ACL_OPERATION_READ; i < RD_KAFKA_ACL_OPERATION__CNT; + i++) { + if ((authorized_operations >> i) & 1) { + operations[j] = i; + j++; + } + } + + return operations; +} + +/** + * @brief Copy a list of rd_kafka_AclOperation_t. + * + * @param src Array of rd_kafka_AclOperation_t to copy from. May be NULL if + * authorized operations were not requested. + * @param authorized_operations_cnt Count of \p src. May be -1 if authorized + * operations were not requested. + * @returns Copy of \p src. May be NULL. + */ +static rd_kafka_AclOperation_t * +rd_kafka_AuthorizedOperations_copy(const rd_kafka_AclOperation_t *src, + int authorized_operations_cnt) { + size_t copy_bytes = 0; + rd_kafka_AclOperation_t *dst = NULL; + + if (authorized_operations_cnt == -1 || src == NULL) + return NULL; + + /* Allocate and copy 1 byte so that the returned pointer + * is non-NULL. A NULL pointer implies that authorized operations were + * not requested. */ + if (authorized_operations_cnt == 0) + copy_bytes = 1; + else + copy_bytes = + sizeof(rd_kafka_AclOperation_t) * authorized_operations_cnt; + + dst = rd_malloc(copy_bytes); + memcpy(dst, src, copy_bytes); + return dst; +} + /** * @brief Create a new MemberDescription object. This object is used for * creating a ConsumerGroupDescription. 
@@ -5991,7 +7799,8 @@ static rd_kafka_MemberDescription_t *rd_kafka_MemberDescription_new( const char *consumer_id, const char *group_instance_id, const char *host, - const rd_kafka_topic_partition_list_t *assignment) { + const rd_kafka_topic_partition_list_t *assignment, + const rd_kafka_topic_partition_list_t *target_assignment) { rd_kafka_MemberDescription_t *member; member = rd_calloc(1, sizeof(*member)); member->client_id = rd_strdup(client_id); @@ -6005,6 +7814,12 @@ static rd_kafka_MemberDescription_t *rd_kafka_MemberDescription_new( else member->assignment.partitions = rd_kafka_topic_partition_list_new(0); + if (target_assignment) { + member->target_assignment = + rd_calloc(1, sizeof(rd_kafka_MemberAssignment_t)); + member->target_assignment->partitions = + rd_kafka_topic_partition_list_copy(target_assignment); + } return member; } @@ -6018,9 +7833,10 @@ static rd_kafka_MemberDescription_t *rd_kafka_MemberDescription_new( */ static rd_kafka_MemberDescription_t * rd_kafka_MemberDescription_copy(const rd_kafka_MemberDescription_t *src) { - return rd_kafka_MemberDescription_new(src->client_id, src->consumer_id, - src->group_instance_id, src->host, - src->assignment.partitions); + return rd_kafka_MemberDescription_new( + src->client_id, src->consumer_id, src->group_instance_id, src->host, + src->assignment.partitions, + src->target_assignment ? src->target_assignment->partitions : NULL); } /** @@ -6039,11 +7855,14 @@ rd_kafka_MemberDescription_destroy(rd_kafka_MemberDescription_t *member) { rd_free(member->client_id); rd_free(member->consumer_id); rd_free(member->host); - if (member->group_instance_id != NULL) - rd_free(member->group_instance_id); - if (member->assignment.partitions) - rd_kafka_topic_partition_list_destroy( - member->assignment.partitions); + RD_IF_FREE(member->group_instance_id, rd_free); + RD_IF_FREE(member->assignment.partitions, + rd_kafka_topic_partition_list_destroy); + if (member->target_assignment) { + RD_IF_FREE(member->target_assignment->partitions, + rd_kafka_topic_partition_list_destroy); + rd_free(member->target_assignment); + } rd_free(member); } @@ -6081,6 +7900,11 @@ const rd_kafka_topic_partition_list_t *rd_kafka_MemberAssignment_partitions( return assignment->partitions; } +const rd_kafka_MemberAssignment_t *rd_kafka_MemberDescription_target_assignment( + const rd_kafka_MemberDescription_t *member) { + return member->target_assignment; +} + /** * @brief Create a new ConsumerGroupDescription object. @@ -6090,6 +7914,7 @@ const rd_kafka_topic_partition_list_t *rd_kafka_MemberAssignment_partitions( * @param members List of members (rd_kafka_MemberDescription_t) of this * group. * @param partition_assignor (optional) Chosen assignor. + * @param authorized_operations (optional) authorized operations. * @param state Group state. * @param coordinator (optional) Group coordinator. * @param error (optional) Error received for this group. @@ -6097,13 +7922,17 @@ const rd_kafka_topic_partition_list_t *rd_kafka_MemberAssignment_partitions( * Use rd_kafka_ConsumerGroupDescription_destroy() to free when done. 
*/ static rd_kafka_ConsumerGroupDescription_t * -rd_kafka_ConsumerGroupDescription_new(const char *group_id, - rd_bool_t is_simple_consumer_group, - const rd_list_t *members, - const char *partition_assignor, - rd_kafka_consumer_group_state_t state, - const rd_kafka_Node_t *coordinator, - rd_kafka_error_t *error) { +rd_kafka_ConsumerGroupDescription_new( + const char *group_id, + rd_bool_t is_simple_consumer_group, + const rd_list_t *members, + const char *partition_assignor, + const rd_kafka_AclOperation_t *authorized_operations, + int authorized_operations_cnt, + rd_kafka_consumer_group_state_t state, + rd_kafka_consumer_group_type_t type, + const rd_kafka_Node_t *coordinator, + rd_kafka_error_t *error) { rd_kafka_ConsumerGroupDescription_t *grpdesc; grpdesc = rd_calloc(1, sizeof(*grpdesc)); grpdesc->group_id = rd_strdup(group_id); @@ -6119,7 +7948,13 @@ rd_kafka_ConsumerGroupDescription_new(const char *group_id, grpdesc->partition_assignor = !partition_assignor ? (char *)partition_assignor : rd_strdup(partition_assignor); + + grpdesc->authorized_operations_cnt = authorized_operations_cnt; + grpdesc->authorized_operations = rd_kafka_AuthorizedOperations_copy( + authorized_operations, authorized_operations_cnt); + grpdesc->state = state; + grpdesc->type = type; if (coordinator != NULL) grpdesc->coordinator = rd_kafka_Node_copy(coordinator); grpdesc->error = @@ -6133,15 +7968,17 @@ rd_kafka_ConsumerGroupDescription_new(const char *group_id, * @brief New instance of ConsumerGroupDescription from an error. * * @param group_id The group id. - * @param error The error. + * @param error Error received for this group. * @return A new allocated ConsumerGroupDescription with the passed error. + * Use rd_kafka_ConsumerGroupDescription_destroy() to free when done. 
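+ *
+ * Usage sketch (illustration only, not librdkafka source) of the public
+ * path that yields these objects, assuming an existing rd_kafka_t handle
+ * rk, and that rd_kafka_AdminOptions_set_include_authorized_operations()
+ * and rd_kafka_AclOperation_name() are available as declared in
+ * rdkafka.h:
+ * @code
+ * const char *groups[] = {"mygroup"};
+ * rd_kafka_AdminOptions_t *options = rd_kafka_AdminOptions_new(
+ *     rk, RD_KAFKA_ADMIN_OP_DESCRIBECONSUMERGROUPS);
+ * rd_kafka_AdminOptions_set_include_authorized_operations(options, 1);
+ * rd_kafka_queue_t *rkqu = rd_kafka_queue_new(rk);
+ * rd_kafka_DescribeConsumerGroups(rk, groups, 1, options, rkqu);
+ * rd_kafka_event_t *rkev = rd_kafka_queue_poll(rkqu, 10 * 1000);
+ * const rd_kafka_DescribeConsumerGroups_result_t *res =
+ *     rd_kafka_event_DescribeConsumerGroups_result(rkev);
+ * if (res) {
+ *     size_t cnt, opcnt, k;
+ *     const rd_kafka_ConsumerGroupDescription_t **descs =
+ *         rd_kafka_DescribeConsumerGroups_result_groups(res, &cnt);
+ *     if (cnt > 0) {
+ *         const rd_kafka_AclOperation_t *ops =
+ *             rd_kafka_ConsumerGroupDescription_authorized_operations(
+ *                 descs[0], &opcnt);
+ *         for (k = 0; k < opcnt; k++)
+ *             printf("authorized: %s\n",
+ *                    rd_kafka_AclOperation_name(ops[k]));
+ *     }
+ * }
+ * if (rkev)
+ *     rd_kafka_event_destroy(rkev);
+ * rd_kafka_AdminOptions_destroy(options);
+ * rd_kafka_queue_destroy(rkqu);
+ * @endcode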
*/ static rd_kafka_ConsumerGroupDescription_t * rd_kafka_ConsumerGroupDescription_new_error(const char *group_id, rd_kafka_error_t *error) { return rd_kafka_ConsumerGroupDescription_new( - group_id, rd_false, NULL, NULL, - RD_KAFKA_CONSUMER_GROUP_STATE_UNKNOWN, NULL, error); + group_id, rd_false, NULL, NULL, NULL, 0, + RD_KAFKA_CONSUMER_GROUP_STATE_UNKNOWN, + RD_KAFKA_CONSUMER_GROUP_TYPE_UNKNOWN, NULL, error); } /** @@ -6155,8 +7992,10 @@ rd_kafka_ConsumerGroupDescription_copy( const rd_kafka_ConsumerGroupDescription_t *grpdesc) { return rd_kafka_ConsumerGroupDescription_new( grpdesc->group_id, grpdesc->is_simple_consumer_group, - &grpdesc->members, grpdesc->partition_assignor, grpdesc->state, - grpdesc->coordinator, grpdesc->error); + &grpdesc->members, grpdesc->partition_assignor, + grpdesc->authorized_operations, grpdesc->authorized_operations_cnt, + grpdesc->state, grpdesc->type, grpdesc->coordinator, + grpdesc->error); } /** @@ -6179,6 +8018,8 @@ static void rd_kafka_ConsumerGroupDescription_destroy( rd_kafka_error_destroy(grpdesc->error); if (grpdesc->coordinator) rd_kafka_Node_destroy(grpdesc->coordinator); + if (grpdesc->authorized_operations_cnt) + rd_free(grpdesc->authorized_operations); rd_free(grpdesc); } @@ -6208,6 +8049,13 @@ const char *rd_kafka_ConsumerGroupDescription_partition_assignor( return grpdesc->partition_assignor; } +const rd_kafka_AclOperation_t * +rd_kafka_ConsumerGroupDescription_authorized_operations( + const rd_kafka_ConsumerGroupDescription_t *grpdesc, + size_t *cntp) { + *cntp = RD_MAX(grpdesc->authorized_operations_cnt, 0); + return grpdesc->authorized_operations; +} rd_kafka_consumer_group_state_t rd_kafka_ConsumerGroupDescription_state( const rd_kafka_ConsumerGroupDescription_t *grpdesc) { @@ -6219,6 +8067,11 @@ const rd_kafka_Node_t *rd_kafka_ConsumerGroupDescription_coordinator( return grpdesc->coordinator; } +rd_kafka_consumer_group_type_t rd_kafka_ConsumerGroupDescription_type( + const rd_kafka_ConsumerGroupDescription_t *grpdesc) { + return grpdesc->type; +} + size_t rd_kafka_ConsumerGroupDescription_member_count( const rd_kafka_ConsumerGroupDescription_t *grpdesc) { return rd_list_cnt(&grpdesc->members); @@ -6238,51 +8091,6 @@ static int rd_kafka_DescribeConsumerGroups_cmp(const void *a, const void *b) { return strcmp(a, b); } -/** @brief Merge the DescribeConsumerGroups response from a single broker - * into the user response list. - */ -static void rd_kafka_DescribeConsumerGroups_response_merge( - rd_kafka_op_t *rko_fanout, - const rd_kafka_op_t *rko_partial) { - rd_kafka_ConsumerGroupDescription_t *groupres = NULL; - rd_kafka_ConsumerGroupDescription_t *newgroupres; - const char *grp = rko_partial->rko_u.admin_result.opaque; - int orig_pos; - - rd_assert(rko_partial->rko_evtype == - RD_KAFKA_EVENT_DESCRIBECONSUMERGROUPS_RESULT); - - if (!rko_partial->rko_err) { - /* Proper results. - * We only send one group per request, make sure it matches */ - groupres = - rd_list_elem(&rko_partial->rko_u.admin_result.results, 0); - rd_assert(groupres); - rd_assert(!strcmp(groupres->group_id, grp)); - newgroupres = rd_kafka_ConsumerGroupDescription_copy(groupres); - } else { - /* Op errored, e.g. timeout */ - rd_kafka_error_t *error = - rd_kafka_error_new(rko_partial->rko_err, NULL); - newgroupres = - rd_kafka_ConsumerGroupDescription_new_error(grp, error); - rd_kafka_error_destroy(error); - } - - /* As a convenience to the application we insert group result - * in the same order as they were requested. 
*/ - orig_pos = rd_list_index(&rko_fanout->rko_u.admin_request.args, grp, - rd_kafka_DescribeConsumerGroups_cmp); - rd_assert(orig_pos != -1); - - /* Make sure result is not already set */ - rd_assert(rd_list_elem(&rko_fanout->rko_u.admin_request.fanout.results, - orig_pos) == NULL); - - rd_list_set(&rko_fanout->rko_u.admin_request.fanout.results, orig_pos, - newgroupres); -} - /** * @brief Construct and send DescribeConsumerGroupsRequest to \p rkb @@ -6305,7 +8113,7 @@ static rd_kafka_resp_err_t rd_kafka_admin_DescribeConsumerGroupsRequest( rd_kafka_replyq_t replyq, rd_kafka_resp_cb_t *resp_cb, void *opaque) { - int i; + int i, include_authorized_operations; char *group; rd_kafka_resp_err_t err; int groups_cnt = rd_list_cnt(groups); @@ -6315,7 +8123,12 @@ static rd_kafka_resp_err_t rd_kafka_admin_DescribeConsumerGroupsRequest( RD_LIST_FOREACH(group, groups, i) { groups_arr[i] = rd_list_elem(groups, i); } + + include_authorized_operations = + rd_kafka_confval_get_int(&options->include_authorized_operations); + error = rd_kafka_DescribeGroupsRequest(rkb, -1, groups_arr, groups_cnt, + include_authorized_operations, replyq, resp_cb, opaque); rd_free(groups_arr); @@ -6330,6 +8143,50 @@ static rd_kafka_resp_err_t rd_kafka_admin_DescribeConsumerGroupsRequest( return RD_KAFKA_RESP_ERR_NO_ERROR; } +/** + * @brief Construct and send ConsumerGroupDescribeRequest to \p rkb + * with the groups (char *) in \p groups, using + * \p options. + * + * The response (unparsed) will be enqueued on \p replyq + * for handling by \p resp_cb (with \p opaque passed). + * + * @returns RD_KAFKA_RESP_ERR_NO_ERROR if the request was enqueued for + * transmission, otherwise an error code and errstr will be + * updated with a human readable error string. + */ +static rd_kafka_resp_err_t +rd_kafka_admin_ConsumerGroupDescribeRequest(rd_kafka_broker_t *rkb, + const rd_list_t *groups /*(char*)*/, + rd_kafka_AdminOptions_t *options, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { + + int include_authorized_operations; + rd_kafka_resp_err_t err; + int groups_cnt = rd_list_cnt(groups); + rd_kafka_error_t *error = NULL; + + include_authorized_operations = + rd_kafka_confval_get_int(&options->include_authorized_operations); + + error = rd_kafka_ConsumerGroupDescribeRequest( + rkb, (char **)groups->rl_elems, groups_cnt, + include_authorized_operations, replyq, resp_cb, opaque); + + if (error) { + rd_snprintf(errstr, errstr_size, "%s", + rd_kafka_error_string(error)); + err = rd_kafka_error_code(error); + rd_kafka_error_destroy(error); + return err; + } + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} /** * @brief Parse DescribeConsumerGroupsResponse and create ADMIN_RESULT op. 
*/ @@ -6340,7 +8197,7 @@ rd_kafka_DescribeConsumerGroupsResponse_parse(rd_kafka_op_t *rko_req, char *errstr, size_t errstr_size) { const int log_decode_errors = LOG_ERR; - int nodeid; + int32_t nodeid; uint16_t port; int16_t api_version; int32_t cnt; @@ -6350,6 +8207,8 @@ rd_kafka_DescribeConsumerGroupsResponse_parse(rd_kafka_op_t *rko_req, rd_kafka_error_t *error = NULL; char *group_id = NULL, *group_state = NULL, *proto_type = NULL, *proto = NULL, *host = NULL; + rd_kafka_AclOperation_t *operations = NULL; + int operation_cnt = -1; api_version = rd_kafka_buf_ApiVersion(reply); if (api_version >= 1) { @@ -6362,15 +8221,16 @@ rd_kafka_DescribeConsumerGroupsResponse_parse(rd_kafka_op_t *rko_req, rd_list_init(&rko_result->rko_u.admin_result.results, cnt, rd_kafka_ConsumerGroupDescription_free); - rd_kafka_broker_lock(rkb); nodeid = rkb->rkb_nodeid; - host = rd_strdup(rkb->rkb_origname); - port = rkb->rkb_port; + rd_kafka_broker_lock(rkb); + host = rd_strdup(rkb->rkb_origname); + port = rkb->rkb_port; rd_kafka_broker_unlock(rkb); node = rd_kafka_Node_new(nodeid, host, port, NULL); while (cnt-- > 0) { int16_t error_code; + int32_t authorized_operations = -1; rd_kafkap_str_t GroupId, GroupState, ProtocolType, ProtocolData; rd_bool_t is_simple_consumer_group, is_consumer_protocol_type; int32_t member_cnt; @@ -6424,8 +8284,8 @@ rd_kafka_DescribeConsumerGroupsResponse_parse(rd_kafka_op_t *rko_req, } rd_kafka_buf_read_str(reply, &ClientId); rd_kafka_buf_read_str(reply, &ClientHost); - rd_kafka_buf_read_bytes(reply, &MemberMetadata); - rd_kafka_buf_read_bytes(reply, &MemberAssignment); + rd_kafka_buf_read_kbytes(reply, &MemberMetadata); + rd_kafka_buf_read_kbytes(reply, &MemberAssignment); if (error != NULL) continue; @@ -6442,8 +8302,12 @@ rd_kafka_DescribeConsumerGroupsResponse_parse(rd_kafka_op_t *rko_req, /* Decreased in rd_kafka_buf_destroy */ rd_kafka_broker_keep(rkb); rd_kafka_buf_read_i16(rkbuf, &version); + const rd_kafka_topic_partition_field_t fields[] = + {RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; partitions = rd_kafka_buf_read_topic_partitions( - rkbuf, 0, rd_false, rd_false); + rkbuf, rd_false /*don't use topic_id*/, + rd_true, 0, fields); rd_kafka_buf_destroy(rkbuf); if (!partitions) rd_kafka_buf_parse_fail( @@ -6459,9 +8323,13 @@ rd_kafka_DescribeConsumerGroupsResponse_parse(rd_kafka_op_t *rko_req, client_id = RD_KAFKAP_STR_DUP(&ClientId); client_host = RD_KAFKAP_STR_DUP(&ClientHost); + /* Target Assignment is `NULL` for the `classic` + * protocol as there is no concept of Target Assignment + * there. */ member = rd_kafka_MemberDescription_new( client_id, member_id, group_instance_id, - client_host, partitions); + client_host, partitions, + NULL /* target assignment */); if (partitions) rd_kafka_topic_partition_list_destroy( partitions); @@ -6477,33 +8345,40 @@ rd_kafka_DescribeConsumerGroupsResponse_parse(rd_kafka_op_t *rko_req, } if (api_version >= 3) { - /* TODO: implement KIP-430 */ - int32_t authorized_operations; rd_kafka_buf_read_i32(reply, &authorized_operations); + /* Authorized_operations is INT_MIN + * when not requested, and the list is NULL + * in that case. */
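+                /* Worked example (illustrative, not librdkafka source) of
+                 * the KIP-430 bitfield decoded below: in
+                 * rd_kafka_AclOperation_t, READ is bit 3 and DESCRIBE is
+                 * bit 8, so authorized_operations == 0x108 (264) decodes
+                 * to {RD_KAFKA_ACL_OPERATION_READ,
+                 * RD_KAFKA_ACL_OPERATION_DESCRIBE} with *cntp == 2;
+                 * INT32_MIN (not requested) decodes to NULL with
+                 * *cntp == -1, and 0 to a non-NULL pointer with
+                 * *cntp == 0. */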
+ operations = rd_kafka_AuthorizedOperations_parse( + authorized_operations, &operation_cnt); + } if (error == NULL) { grpdesc = rd_kafka_ConsumerGroupDescription_new( group_id, is_simple_consumer_group, &members, proto, + operations, operation_cnt, rd_kafka_consumer_group_state_code(group_state), - node, error); - } else { + RD_KAFKA_CONSUMER_GROUP_TYPE_CLASSIC, node, error); + } else grpdesc = rd_kafka_ConsumerGroupDescription_new_error( group_id, error); - } + rd_list_add(&rko_result->rko_u.admin_result.results, grpdesc); - if (error) - rd_kafka_error_destroy(error); + rd_list_destroy(&members); rd_free(group_id); rd_free(group_state); rd_free(proto_type); rd_free(proto); + RD_IF_FREE(error, rd_kafka_error_destroy); + RD_IF_FREE(operations, rd_free); + error = NULL; group_id = NULL; group_state = NULL; proto_type = NULL; proto = NULL; + operations = NULL; } if (host) @@ -6530,6 +8405,7 @@ err_parse: rd_kafka_Node_destroy(node); if (rko_result) rd_kafka_op_destroy(rko_result); + RD_IF_FREE(operations, rd_free); rd_snprintf( errstr, errstr_size, @@ -6539,6 +8415,325 @@ err_parse: return reply->rkbuf_err; } +/** + * @brief Parse ConsumerGroupDescribeResponse and create ADMIN_RESULT op. + */ +static rd_kafka_resp_err_t +rd_kafka_ConsumerGroupDescribeResponse_parse(rd_kafka_op_t *rko_req, + rd_kafka_op_t **rko_resultp, + rd_kafka_buf_t *reply, + char *errstr, + size_t errstr_size) { + const int log_decode_errors = LOG_ERR; + int32_t groups_cnt; + rd_kafka_op_t *rko_result = NULL; + rd_kafka_broker_t *rkb = reply->rkbuf_rkb; + rd_kafka_error_t *error = NULL; + char *group_id = NULL, *group_state = NULL, *assignor_name = NULL, + *host = NULL; + rd_kafka_AclOperation_t *operations = NULL; + rd_kafka_Node_t *node = NULL; + rd_kafka_topic_partition_list_t *assignment = NULL, + *target_assignment = NULL; + int32_t nodeid; + uint16_t port; + int operation_cnt = -1; + int32_t i; + + rd_kafka_buf_read_throttle_time(reply); + + rd_kafka_buf_read_arraycnt(reply, &groups_cnt, RD_KAFKAP_GROUPS_MAX); + + nodeid = rkb->rkb_nodeid; + rd_kafka_broker_lock(rkb); + host = rd_strdup(rkb->rkb_origname); + port = rkb->rkb_port; + rd_kafka_broker_unlock(rkb); + + node = rd_kafka_Node_new(nodeid, host, port, NULL); + + rko_result = rd_kafka_admin_result_new(rko_req); + rd_list_init(&rko_result->rko_u.admin_result.results, groups_cnt, + rd_kafka_ConsumerGroupDescription_free); + + for (i = 0; i < groups_cnt; i++) { + int16_t ErrorCode; + int32_t authorized_operations = -1; + int32_t MemberCnt, j; + int32_t GroupEpoch, AssignmentEpoch; + rd_kafkap_str_t GroupId, GroupState, AssignorName, ErrorString; + rd_list_t members; + rd_kafka_ConsumerGroupDescription_t *grpdesc = NULL; + + rd_kafka_buf_read_i16(reply, &ErrorCode); + rd_kafka_buf_read_str(reply, &ErrorString); + rd_kafka_buf_read_str(reply, &GroupId); + rd_kafka_buf_read_str(reply, &GroupState); + rd_kafka_buf_read_i32(reply, &GroupEpoch); + rd_kafka_buf_read_i32(reply, &AssignmentEpoch); + rd_kafka_buf_read_str(reply, &AssignorName); + rd_kafka_buf_read_arraycnt(reply, &MemberCnt, 100000); + + group_id = RD_KAFKAP_STR_DUP(&GroupId); + group_state = RD_KAFKAP_STR_DUP(&GroupState); + assignor_name = RD_KAFKAP_STR_DUP(&AssignorName); + + if (ErrorCode) { + error = rd_kafka_error_new( + ErrorCode, "ConsumerGroupDescribe: %.*s", + RD_KAFKAP_STR_PR(&ErrorString)); + } + + rd_list_init(&members, MemberCnt, + rd_kafka_MemberDescription_free); + + for (j = 0; j < MemberCnt; j++) { + char *member_id = NULL, *instance_id = NULL,
+ rd_kafkap_str_t MemberId, InstanceId, RackId, ClientId, + ClientHost, SubscribedTopicRegex; + int32_t MemberEpoch, idx; + rd_kafka_MemberDescription_t *member; + int32_t SubscribedTopicNamesArrayCnt; + + rd_kafka_buf_read_str(reply, &MemberId); + rd_kafka_buf_read_str(reply, &InstanceId); + rd_kafka_buf_read_str(reply, &RackId); + rd_kafka_buf_read_i32(reply, &MemberEpoch); + rd_kafka_buf_read_str(reply, &ClientId); + rd_kafka_buf_read_str(reply, &ClientHost); + rd_kafka_buf_read_arraycnt( + reply, &SubscribedTopicNamesArrayCnt, 100000); + + for (idx = 0; idx < SubscribedTopicNamesArrayCnt; + idx++) { + rd_kafkap_str_t SubscribedTopicName; + rd_kafka_buf_read_str(reply, + &SubscribedTopicName); + } + rd_kafka_buf_read_str(reply, &SubscribedTopicRegex); + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; + + assignment = rd_kafka_buf_read_topic_partitions( + reply, rd_true /* use topic_id */, + rd_true /* use topic name*/, 0, fields); + + /* Assignment tags */ + rd_kafka_buf_skip_tags(reply); + + target_assignment = rd_kafka_buf_read_topic_partitions( + reply, rd_true /* use topic_id */, + rd_true /* use topic name*/, 0, fields); + + /* TargetAssignment tags */ + rd_kafka_buf_skip_tags(reply); + + /* Member tags */ + rd_kafka_buf_skip_tags(reply); + + member_id = RD_KAFKAP_STR_DUP(&MemberId); + if (!RD_KAFKAP_STR_IS_NULL(&InstanceId)) { + instance_id = RD_KAFKAP_STR_DUP(&InstanceId); + } + client_id = RD_KAFKAP_STR_DUP(&ClientId); + client_host = RD_KAFKAP_STR_DUP(&ClientHost); + + member = rd_kafka_MemberDescription_new( + client_id, member_id, instance_id, client_host, + assignment, target_assignment); + + + rd_list_add(&members, member); + + RD_IF_FREE(assignment, + rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(target_assignment, + rd_kafka_topic_partition_list_destroy); + + RD_IF_FREE(member_id, rd_free); + RD_IF_FREE(instance_id, rd_free); + RD_IF_FREE(client_id, rd_free); + RD_IF_FREE(client_host, rd_free); + member_id = NULL; + instance_id = NULL; + client_id = NULL; + client_host = NULL; + } + rd_kafka_buf_read_i32(reply, &authorized_operations); + operations = rd_kafka_AuthorizedOperations_parse( + authorized_operations, &operation_cnt); + rd_kafka_buf_skip_tags(reply); + + /* If the error code is Group ID Not Found or Unsupported + Version, we will set the ConsumerGroupType to Consumer to + identify it for further processing with the old protocol and + eventually in rd_kafka_DescribeConsumerGroupsResponse_parse + we will set the ConsumerGroupType to Unknown */ + if (!error) { + grpdesc = rd_kafka_ConsumerGroupDescription_new( + group_id, rd_false, &members, assignor_name, + operations, operation_cnt, + rd_kafka_consumer_group_state_code(group_state), + RD_KAFKA_CONSUMER_GROUP_TYPE_CONSUMER, node, error); + } else { + grpdesc = rd_kafka_ConsumerGroupDescription_new_error( + group_id, error); + } + + rd_list_add(&rko_result->rko_u.admin_result.results, grpdesc); + + rd_list_destroy(&members); + rd_free(group_id); + rd_free(group_state); + rd_free(assignor_name); + RD_IF_FREE(error, rd_kafka_error_destroy); + RD_IF_FREE(operations, rd_free); + + error = NULL; + group_id = NULL; + group_state = NULL; + assignor_name = NULL; + operations = NULL; + } + rd_kafka_buf_skip_tags(reply); + RD_IF_FREE(host, rd_free); + RD_IF_FREE(node, rd_kafka_Node_destroy); + *rko_resultp = rko_result; + return RD_KAFKA_RESP_ERR_NO_ERROR; +err_parse: + RD_IF_FREE(group_id, rd_free); + RD_IF_FREE(group_state, 
rd_free); + RD_IF_FREE(assignor_name, rd_free); + RD_IF_FREE(host, rd_free); + RD_IF_FREE(node, rd_kafka_Node_destroy); + RD_IF_FREE(error, rd_kafka_error_destroy); + RD_IF_FREE(operations, rd_free); + RD_IF_FREE(assignment, rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(target_assignment, rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(rko_result, rd_kafka_op_destroy); + + rd_snprintf( + errstr, errstr_size, + "ConsumerGroupDescribe response protocol parse failure: %s", + rd_kafka_err2str(reply->rkbuf_err)); + return reply->rkbuf_err; +} + +/** + * @brief If we get an Unsupported Feature error, or it is a consumer + group and we get a GROUP_ID_NOT_FOUND(69) or + UNSUPPORTED_VERSION(35) error, we need to fall back to a + request on the old (classic) protocol. + */ +static rd_bool_t rd_kafka_admin_describe_consumer_group_do_fallback_to_classic( + rd_kafka_ConsumerGroupDescription_t *groupres) { + return groupres->error && + (groupres->error->code == RD_KAFKA_RESP_ERR_GROUP_ID_NOT_FOUND || + groupres->error->code == + RD_KAFKA_RESP_ERR_UNSUPPORTED_VERSION || + groupres->error->code == + RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE); +} + +static void rd_kafka_admin_describe_consumer_group_request( + rd_kafka_op_t *rko_fanout, + rd_kafka_t *rk, + const char *group_id, + const struct rd_kafka_admin_worker_cbs *cbs, + const rd_kafka_AdminOptions_t *options, + rd_kafka_q_t *rkq) { + rd_kafka_op_t *rko = rd_kafka_admin_request_op_new( + rk, RD_KAFKA_OP_DESCRIBECONSUMERGROUPS, + RD_KAFKA_EVENT_DESCRIBECONSUMERGROUPS_RESULT, cbs, options, rkq); + + rko->rko_u.admin_request.fanout_parent = rko_fanout; + rko->rko_u.admin_request.broker_id = RD_KAFKA_ADMIN_TARGET_COORDINATOR; + rko->rko_u.admin_request.coordtype = RD_KAFKA_COORD_GROUP; + rko->rko_u.admin_request.coordkey = rd_strdup(group_id); + + /* Set the group name as the opaque so the fanout worker can use it + * to fill in errors. + * References rko_fanout's memory, which will always outlive + * the fanned out op. */ + rd_kafka_AdminOptions_set_opaque(&rko->rko_u.admin_request.options, + (void *)group_id); + + rd_list_init(&rko->rko_u.admin_request.args, 1, rd_free); + rd_list_add(&rko->rko_u.admin_request.args, rd_strdup(group_id)); + + rd_kafka_q_enq(rko_fanout->rko_rk->rk_ops, rko); +} + +/** @brief Merge the DescribeConsumerGroups response from a single broker + * into the user response list. + */ +static void rd_kafka_DescribeConsumerGroups_response_merge( + rd_kafka_op_t *rko_fanout, + const rd_kafka_op_t *rko_partial) { + rd_kafka_ConsumerGroupDescription_t *groupres = NULL; + rd_kafka_ConsumerGroupDescription_t *newgroupres; + const char *grp = rko_partial->rko_u.admin_result.opaque; + int orig_pos; + + rd_assert(rko_partial->rko_evtype == + RD_KAFKA_EVENT_DESCRIBECONSUMERGROUPS_RESULT); + + if (!rko_partial->rko_err) { + /* Proper results. + * We only send one group per request, make sure it matches */ + groupres = + rd_list_elem(&rko_partial->rko_u.admin_result.results, 0); + rd_assert(groupres); + rd_assert(!strcmp(groupres->group_id, grp)); + newgroupres = rd_kafka_ConsumerGroupDescription_copy(groupres); + } else { + /* Op errored, e.g.
timeout */ + rd_kafka_error_t *error = + rd_kafka_error_new(rko_partial->rko_err, NULL); + newgroupres = + rd_kafka_ConsumerGroupDescription_new_error(grp, error); + rd_kafka_error_destroy(error); + } + + rd_bool_t is_consumer_group_response = + rko_partial->rko_u.admin_result.cbs->request == + rd_kafka_admin_ConsumerGroupDescribeRequest; + + if (is_consumer_group_response && + rd_kafka_admin_describe_consumer_group_do_fallback_to_classic( + newgroupres)) { + /* We need to send a request to the old protocol */ + rko_fanout->rko_u.admin_request.fanout.outstanding++; + static const struct rd_kafka_admin_worker_cbs cbs = { + rd_kafka_admin_DescribeConsumerGroupsRequest, + rd_kafka_DescribeConsumerGroupsResponse_parse, + }; + rd_kafka_admin_describe_consumer_group_request( + rko_fanout, rko_fanout->rko_rk, grp, &cbs, + &rko_fanout->rko_u.admin_request.options, + rko_fanout->rko_rk->rk_ops); + + rd_kafka_ConsumerGroupDescription_destroy(newgroupres); + } else { + /* As a convenience to the application we insert group result + * in the same order as they were requested. */ + orig_pos = + rd_list_index(&rko_fanout->rko_u.admin_request.args, grp, + rd_kafka_DescribeConsumerGroups_cmp); + rd_assert(orig_pos != -1); + + /* Make sure result is not already set */ + rd_assert(rd_list_elem( + &rko_fanout->rko_u.admin_request.fanout.results, + orig_pos) == NULL); + + rd_list_set(&rko_fanout->rko_u.admin_request.fanout.results, + orig_pos, newgroupres); + } +} + void rd_kafka_DescribeConsumerGroups(rd_kafka_t *rk, const char **groups, size_t groups_cnt, @@ -6608,34 +8803,13 @@ void rd_kafka_DescribeConsumerGroups(rd_kafka_t *rk, * coordinator into one op. */ for (i = 0; i < groups_cnt; i++) { static const struct rd_kafka_admin_worker_cbs cbs = { - rd_kafka_admin_DescribeConsumerGroupsRequest, - rd_kafka_DescribeConsumerGroupsResponse_parse, + rd_kafka_admin_ConsumerGroupDescribeRequest, + rd_kafka_ConsumerGroupDescribeResponse_parse, }; char *grp = rd_list_elem(&rko_fanout->rko_u.admin_request.args, (int)i); - rd_kafka_op_t *rko = rd_kafka_admin_request_op_new( - rk, RD_KAFKA_OP_DESCRIBECONSUMERGROUPS, - RD_KAFKA_EVENT_DESCRIBECONSUMERGROUPS_RESULT, &cbs, options, - rk->rk_ops); - - rko->rko_u.admin_request.fanout_parent = rko_fanout; - rko->rko_u.admin_request.broker_id = - RD_KAFKA_ADMIN_TARGET_COORDINATOR; - rko->rko_u.admin_request.coordtype = RD_KAFKA_COORD_GROUP; - rko->rko_u.admin_request.coordkey = rd_strdup(grp); - - /* Set the group name as the opaque so the fanout worker use it - * to fill in errors. - * References rko_fanout's memory, which will always outlive - * the fanned out op. 
*/ - rd_kafka_AdminOptions_set_opaque( - &rko->rko_u.admin_request.options, grp); - - rd_list_init(&rko->rko_u.admin_request.args, 1, rd_free); - rd_list_add(&rko->rko_u.admin_request.args, - rd_strdup(groups[i])); - - rd_kafka_q_enq(rk->rk_ops, rko); + rd_kafka_admin_describe_consumer_group_request( + rko_fanout, rk, grp, &cbs, options, rk->rk_ops); } } @@ -6654,3 +8828,967 @@ rd_kafka_DescribeConsumerGroups_result_groups( } /**@}*/ + +/** + * @name Describe Topic + * @{ + * + * + * + * + */ + +rd_kafka_TopicCollection_t * +rd_kafka_TopicCollection_of_topic_names(const char **topics, + size_t topics_cnt) { + size_t i; + rd_kafka_TopicCollection_t *ret = + rd_calloc(1, sizeof(rd_kafka_TopicCollection_t)); + + ret->topics_cnt = topics_cnt; + if (!ret->topics_cnt) + return ret; + + ret->topics = rd_calloc(topics_cnt, sizeof(char *)); + for (i = 0; i < topics_cnt; i++) + ret->topics[i] = rd_strdup(topics[i]); + + return ret; +} + +void rd_kafka_TopicCollection_destroy(rd_kafka_TopicCollection_t *topics) { + size_t i; + + for (i = 0; i < topics->topics_cnt; i++) + rd_free(topics->topics[i]); + + RD_IF_FREE(topics->topics, rd_free); + rd_free(topics); +} + +/** + * @brief Create a new TopicPartitionInfo object. + * + * @return A newly allocated TopicPartitionInfo. Use + * rd_kafka_TopicPartitionInfo_destroy() to free when done. + */ +static rd_kafka_TopicPartitionInfo_t *rd_kafka_TopicPartitionInfo_new( + const struct rd_kafka_metadata_partition *partition, + const struct rd_kafka_metadata_broker *brokers_sorted, + const rd_kafka_metadata_broker_internal_t *brokers_internal, + int broker_cnt) { + size_t i; + rd_kafka_TopicPartitionInfo_t *pinfo = + rd_calloc(1, sizeof(rd_kafka_TopicPartitionInfo_t)); + + pinfo->partition = partition->id; + pinfo->isr_cnt = partition->isr_cnt; + pinfo->replica_cnt = partition->replica_cnt; + + if (partition->leader >= 0) { + pinfo->leader = rd_kafka_Node_new_from_brokers( + partition->leader, brokers_sorted, brokers_internal, + broker_cnt); + } + + if (pinfo->isr_cnt > 0) { + pinfo->isr = + rd_calloc(pinfo->isr_cnt, sizeof(rd_kafka_Node_t *)); + for (i = 0; i < pinfo->isr_cnt; i++) + pinfo->isr[i] = rd_kafka_Node_new_from_brokers( + partition->isrs[i], brokers_sorted, + brokers_internal, broker_cnt); + } + + if (pinfo->replica_cnt > 0) { + pinfo->replicas = + rd_calloc(pinfo->replica_cnt, sizeof(rd_kafka_Node_t *)); + for (i = 0; i < pinfo->replica_cnt; i++) + pinfo->replicas[i] = rd_kafka_Node_new_from_brokers( + partition->replicas[i], brokers_sorted, + brokers_internal, broker_cnt); + } + + return pinfo; +} + +/** + * @brief Destroy and deallocate a TopicPartitionInfo. + */ +static void +rd_kafka_TopicPartitionInfo_destroy(rd_kafka_TopicPartitionInfo_t *pinfo) { + size_t i; + RD_IF_FREE(pinfo->leader, rd_kafka_Node_destroy); + + for (i = 0; i < pinfo->isr_cnt; i++) + rd_kafka_Node_destroy(pinfo->isr[i]); + RD_IF_FREE(pinfo->isr, rd_free); + + for (i = 0; i < pinfo->replica_cnt; i++) + rd_kafka_Node_destroy(pinfo->replicas[i]); + RD_IF_FREE(pinfo->replicas, rd_free); + + rd_free(pinfo); +} + +/** + * @brief Create a new TopicDescription object. + * + * @param topic topic name + * @param topic_id topic id + * @param partitions Array of partition metadata (rd_kafka_metadata_partition). + * @param partition_cnt Number of partitions in partition metadata. + * @param authorized_operations acl operations allowed for topic. + * @param error Topic error reported by the broker. + * @return A newly allocated TopicDescription object. 
+ * @remark Use rd_kafka_TopicDescription_destroy() to free when done. + */ +static rd_kafka_TopicDescription_t *rd_kafka_TopicDescription_new( + const char *topic, + rd_kafka_Uuid_t topic_id, + const struct rd_kafka_metadata_partition *partitions, + int partition_cnt, + const struct rd_kafka_metadata_broker *brokers_sorted, + const rd_kafka_metadata_broker_internal_t *brokers_internal, + int broker_cnt, + const rd_kafka_AclOperation_t *authorized_operations, + int authorized_operations_cnt, + rd_bool_t is_internal, + rd_kafka_error_t *error) { + rd_kafka_TopicDescription_t *topicdesc; + int i; + topicdesc = rd_calloc(1, sizeof(*topicdesc)); + topicdesc->topic = rd_strdup(topic); + topicdesc->topic_id = topic_id; + topicdesc->partition_cnt = partition_cnt; + topicdesc->is_internal = is_internal; + if (error) + topicdesc->error = rd_kafka_error_copy(error); + + topicdesc->authorized_operations_cnt = authorized_operations_cnt; + topicdesc->authorized_operations = rd_kafka_AuthorizedOperations_copy( + authorized_operations, authorized_operations_cnt); + + if (partitions) { + topicdesc->partitions = + rd_calloc(partition_cnt, sizeof(*partitions)); + for (i = 0; i < partition_cnt; i++) + topicdesc->partitions[i] = + rd_kafka_TopicPartitionInfo_new( + &partitions[i], brokers_sorted, + brokers_internal, broker_cnt); + } + return topicdesc; +} + +/** + * @brief Create a new TopicDescription object from an error. + * + * @param topic topic name + * @param error Topic error reported by the broker. + * @return A newly allocated TopicDescription with the passed error. + * @remark Use rd_kafka_TopicDescription_destroy() to free when done. + */ +static rd_kafka_TopicDescription_t * +rd_kafka_TopicDescription_new_error(const char *topic, + rd_kafka_Uuid_t topic_id, + rd_kafka_error_t *error) { + return rd_kafka_TopicDescription_new(topic, topic_id, NULL, 0, NULL, + NULL, 0, NULL, 0, rd_false, error); +} + +static void +rd_kafka_TopicDescription_destroy(rd_kafka_TopicDescription_t *topicdesc) { + int i; + + RD_IF_FREE(topicdesc->topic, rd_free); + RD_IF_FREE(topicdesc->error, rd_kafka_error_destroy); + RD_IF_FREE(topicdesc->authorized_operations, rd_free); + for (i = 0; i < topicdesc->partition_cnt; i++) + rd_kafka_TopicPartitionInfo_destroy(topicdesc->partitions[i]); + rd_free(topicdesc->partitions); + + rd_free(topicdesc); +} + +static void rd_kafka_TopicDescription_free(void *ptr) { + rd_kafka_TopicDescription_destroy(ptr); +} + +const int rd_kafka_TopicPartitionInfo_partition( + const rd_kafka_TopicPartitionInfo_t *partition) { + return partition->partition; +} + +const rd_kafka_Node_t *rd_kafka_TopicPartitionInfo_leader( + const rd_kafka_TopicPartitionInfo_t *partition) { + return partition->leader; +} + + +const rd_kafka_Node_t ** +rd_kafka_TopicPartitionInfo_isr(const rd_kafka_TopicPartitionInfo_t *partition, + size_t *cntp) { + *cntp = partition->isr_cnt; + return (const rd_kafka_Node_t **)partition->isr; +} + +const rd_kafka_Node_t **rd_kafka_TopicPartitionInfo_replicas( + const rd_kafka_TopicPartitionInfo_t *partition, + size_t *cntp) { + *cntp = partition->replica_cnt; + return (const rd_kafka_Node_t **)partition->replicas; +} + +const rd_kafka_TopicPartitionInfo_t **rd_kafka_TopicDescription_partitions( + const rd_kafka_TopicDescription_t *topicdesc, + size_t *cntp) { + *cntp = topicdesc->partition_cnt; + return (const rd_kafka_TopicPartitionInfo_t **)topicdesc->partitions; +} + +const rd_kafka_AclOperation_t *rd_kafka_TopicDescription_authorized_operations( + const rd_kafka_TopicDescription_t 
*topicdesc,
+    size_t *cntp) {
+        *cntp = RD_MAX(topicdesc->authorized_operations_cnt, 0);
+        return topicdesc->authorized_operations;
+}
+
+
+const char *
+rd_kafka_TopicDescription_name(const rd_kafka_TopicDescription_t *topicdesc) {
+        return topicdesc->topic;
+}
+
+int rd_kafka_TopicDescription_is_internal(
+    const rd_kafka_TopicDescription_t *topicdesc) {
+        return topicdesc->is_internal;
+}
+
+const rd_kafka_error_t *
+rd_kafka_TopicDescription_error(const rd_kafka_TopicDescription_t *topicdesc) {
+        return topicdesc->error;
+}
+
+const rd_kafka_Uuid_t *rd_kafka_TopicDescription_topic_id(
+    const rd_kafka_TopicDescription_t *topicdesc) {
+        return &topicdesc->topic_id;
+}
+
+const rd_kafka_TopicDescription_t **rd_kafka_DescribeTopics_result_topics(
+    const rd_kafka_DescribeTopics_result_t *result,
+    size_t *cntp) {
+        const rd_kafka_op_t *rko = (const rd_kafka_op_t *)result;
+        rd_kafka_op_type_t reqtype =
+            rko->rko_u.admin_result.reqtype & ~RD_KAFKA_OP_FLAGMASK;
+        rd_assert(reqtype == RD_KAFKA_OP_DESCRIBETOPICS);
+
+        *cntp = rd_list_cnt(&rko->rko_u.admin_result.results);
+        return (const rd_kafka_TopicDescription_t **)
+            rko->rko_u.admin_result.results.rl_elems;
+}
+
+/**
+ * @brief Topics arguments comparator for DescribeTopics args
+ */
+static int rd_kafka_DescribeTopics_cmp(const void *a, const void *b) {
+        return strcmp(a, b);
+}
+
+/**
+ * @brief Construct and send DescribeTopicsRequest to \p rkb
+ *        with the topics (char *) in \p topics, using
+ *        \p options.
+ *
+ * The response (unparsed) will be enqueued on \p replyq
+ * for handling by \p resp_cb (with \p opaque passed).
+ *
+ * @returns RD_KAFKA_RESP_ERR_NO_ERROR if the request was enqueued for
+ *          transmission, otherwise an error code and errstr will be
+ *          updated with a human readable error string.
+ */
+static rd_kafka_resp_err_t
+rd_kafka_admin_DescribeTopicsRequest(rd_kafka_broker_t *rkb,
+                                     const rd_list_t *topics /*(char*)*/,
+                                     rd_kafka_AdminOptions_t *options,
+                                     char *errstr,
+                                     size_t errstr_size,
+                                     rd_kafka_replyq_t replyq,
+                                     rd_kafka_resp_cb_t *resp_cb,
+                                     void *opaque) {
+        rd_kafka_resp_err_t err;
+        int include_topic_authorized_operations =
+            rd_kafka_confval_get_int(&options->include_authorized_operations);
+
+        err = rd_kafka_admin_MetadataRequest(
+            rkb, topics, "describe topics",
+            rd_false /* don't include_cluster_authorized_operations */,
+            include_topic_authorized_operations,
+            rd_false /* don't force_racks */, resp_cb, replyq, opaque);
+
+        if (err) {
+                rd_snprintf(errstr, errstr_size, "%s", rd_kafka_err2str(err));
+                return err;
+        }
+
+        return RD_KAFKA_RESP_ERR_NO_ERROR;
+}
+
+/**
+ * @brief Parse DescribeTopicsResponse and create ADMIN_RESULT op.
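+ *
+ * An illustrative, non-normative sketch (added here for clarity, not part of
+ * the upstream patch) of how an application typically reaches this parser
+ * through the public DescribeTopics API, assuming an existing
+ * `rd_kafka_t *rk` handle:
+ * @code
+ *   const char *names[] = {"topic-a", "topic-b"};
+ *   rd_kafka_TopicCollection_t *tc =
+ *       rd_kafka_TopicCollection_of_topic_names(names, 2);
+ *   rd_kafka_queue_t *rkqu = rd_kafka_queue_new(rk);
+ *
+ *   rd_kafka_DescribeTopics(rk, tc, NULL, rkqu); // NULL = default options
+ *
+ *   // Wait (up to 10s) for the result event and list the topics.
+ *   rd_kafka_event_t *rkev = rd_kafka_queue_poll(rkqu, 10 * 1000);
+ *   if (rkev && rd_kafka_event_type(rkev) ==
+ *                   RD_KAFKA_EVENT_DESCRIBETOPICS_RESULT) {
+ *           size_t cnt;
+ *           const rd_kafka_TopicDescription_t **topics =
+ *               rd_kafka_DescribeTopics_result_topics(
+ *                   rd_kafka_event_DescribeTopics_result(rkev), &cnt);
+ *           // ... inspect topics[0..cnt-1] via the accessors above ...
+ *   }
+ *   if (rkev)
+ *           rd_kafka_event_destroy(rkev);
+ *   rd_kafka_queue_destroy(rkqu);
+ *   rd_kafka_TopicCollection_destroy(tc);
+ * @endcode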
+ */ +static rd_kafka_resp_err_t +rd_kafka_DescribeTopicsResponse_parse(rd_kafka_op_t *rko_req, + rd_kafka_op_t **rko_resultp, + rd_kafka_buf_t *reply, + char *errstr, + size_t errstr_size) { + rd_kafka_metadata_internal_t *mdi = NULL; + struct rd_kafka_metadata *md = NULL; + rd_kafka_resp_err_t err; + rd_list_t topics = rko_req->rko_u.admin_request.args; + rd_kafka_broker_t *rkb = reply->rkbuf_rkb; + int i; + const int log_decode_errors = LOG_ERR; + rd_kafka_op_t *rko_result = NULL; + + err = rd_kafka_parse_Metadata_admin(rkb, reply, &topics, &mdi); + if (err) + goto err_parse; + + rko_result = rd_kafka_admin_result_new(rko_req); + md = &mdi->metadata; + rd_list_init(&rko_result->rko_u.admin_result.results, md->topic_cnt, + rd_kafka_TopicDescription_free); + + for (i = 0; i < md->topic_cnt; i++) { + rd_kafka_TopicDescription_t *topicdesc = NULL; + int orig_pos; + + if (md->topics[i].err == RD_KAFKA_RESP_ERR_NO_ERROR) { + rd_kafka_AclOperation_t *authorized_operations; + int authorized_operation_cnt; + authorized_operations = + rd_kafka_AuthorizedOperations_parse( + mdi->topics[i].topic_authorized_operations, + &authorized_operation_cnt); + topicdesc = rd_kafka_TopicDescription_new( + md->topics[i].topic, mdi->topics[i].topic_id, + md->topics[i].partitions, + md->topics[i].partition_cnt, mdi->brokers_sorted, + mdi->brokers, md->broker_cnt, authorized_operations, + authorized_operation_cnt, + mdi->topics[i].is_internal, NULL); + RD_IF_FREE(authorized_operations, rd_free); + } else { + rd_kafka_error_t *error = rd_kafka_error_new( + md->topics[i].err, "%s", + rd_kafka_err2str(md->topics[i].err)); + topicdesc = rd_kafka_TopicDescription_new_error( + md->topics[i].topic, mdi->topics[i].topic_id, + error); + rd_kafka_error_destroy(error); + } + orig_pos = rd_list_index(&rko_result->rko_u.admin_result.args, + topicdesc->topic, + rd_kafka_DescribeTopics_cmp); + if (orig_pos == -1) { + rd_kafka_TopicDescription_destroy(topicdesc); + rd_kafka_buf_parse_fail( + reply, + "Broker returned topic %s that was not " + "included in the original request", + topicdesc->topic); + } + + if (rd_list_elem(&rko_result->rko_u.admin_result.results, + orig_pos) != NULL) { + rd_kafka_TopicDescription_destroy(topicdesc); + rd_kafka_buf_parse_fail( + reply, "Broker returned topic %s multiple times", + topicdesc->topic); + } + + rd_list_set(&rko_result->rko_u.admin_result.results, orig_pos, + topicdesc); + } + rd_free(mdi); + + *rko_resultp = rko_result; + return RD_KAFKA_RESP_ERR_NO_ERROR; + +err_parse: + RD_IF_FREE(rko_result, rd_kafka_op_destroy); + rd_snprintf(errstr, errstr_size, + "DescribeTopics response protocol parse failure: %s", + rd_kafka_err2str(reply->rkbuf_err)); + return reply->rkbuf_err; +} + +void rd_kafka_DescribeTopics(rd_kafka_t *rk, + const rd_kafka_TopicCollection_t *topics, + const rd_kafka_AdminOptions_t *options, + rd_kafka_queue_t *rkqu) { + rd_kafka_op_t *rko; + rd_list_t dup_list; + size_t i; + + static const struct rd_kafka_admin_worker_cbs cbs = { + rd_kafka_admin_DescribeTopicsRequest, + rd_kafka_DescribeTopicsResponse_parse, + }; + + rd_assert(rkqu); + + rko = rd_kafka_admin_request_op_new( + rk, RD_KAFKA_OP_DESCRIBETOPICS, + RD_KAFKA_EVENT_DESCRIBETOPICS_RESULT, &cbs, options, rkqu->rkqu_q); + + rd_list_init(&rko->rko_u.admin_request.args, (int)topics->topics_cnt, + rd_free); + for (i = 0; i < topics->topics_cnt; i++) + rd_list_add(&rko->rko_u.admin_request.args, + rd_strdup(topics->topics[i])); + + if (rd_list_cnt(&rko->rko_u.admin_request.args)) { + int j; + char *topic_name; + /* 
Check for duplicates. + * Make a temporary copy of the topic list and sort it to check + * for duplicates, we don't want the original list sorted since + * we want to maintain ordering. */ + rd_list_init(&dup_list, + rd_list_cnt(&rko->rko_u.admin_request.args), NULL); + rd_list_copy_to(&dup_list, &rko->rko_u.admin_request.args, NULL, + NULL); + rd_list_sort(&dup_list, rd_kafka_DescribeTopics_cmp); + if (rd_list_find_duplicate(&dup_list, + rd_kafka_DescribeTopics_cmp)) { + rd_list_destroy(&dup_list); + rd_kafka_admin_result_fail( + rko, RD_KAFKA_RESP_ERR__INVALID_ARG, + "Duplicate topics not allowed"); + rd_kafka_admin_common_worker_destroy( + rk, rko, rd_true /*destroy*/); + return; + } + + /* Check for empty topics. */ + RD_LIST_FOREACH(topic_name, &rko->rko_u.admin_request.args, j) { + if (!topic_name[0]) { + rd_list_destroy(&dup_list); + rd_kafka_admin_result_fail( + rko, RD_KAFKA_RESP_ERR__INVALID_ARG, + "Empty topic name at index %d isn't " + "allowed", + j); + rd_kafka_admin_common_worker_destroy( + rk, rko, rd_true /*destroy*/); + return; + } + } + + rd_list_destroy(&dup_list); + rd_kafka_q_enq(rk->rk_ops, rko); + } else { + /* Empty list */ + rd_kafka_op_t *rko_result = rd_kafka_admin_result_new(rko); + /* Enqueue empty result on application queue, we're done. */ + rd_kafka_admin_result_enq(rko, rko_result); + rd_kafka_admin_common_worker_destroy(rk, rko, + rd_true /*destroy*/); + } +} + +/**@}*/ + +/** + * @name Describe cluster + * @{ + * + * + * + * + */ + +static const rd_kafka_ClusterDescription_t * +rd_kafka_DescribeCluster_result_description( + const rd_kafka_DescribeCluster_result_t *result) { + int cluster_result_cnt; + const rd_kafka_ClusterDescription_t *clusterdesc; + const rd_kafka_op_t *rko = (const rd_kafka_op_t *)result; + rd_kafka_op_type_t reqtype = + rko->rko_u.admin_result.reqtype & ~RD_KAFKA_OP_FLAGMASK; + rd_assert(reqtype == RD_KAFKA_OP_DESCRIBECLUSTER); + + cluster_result_cnt = rd_list_cnt(&rko->rko_u.admin_result.results); + rd_assert(cluster_result_cnt == 1); + clusterdesc = rd_list_elem(&rko->rko_u.admin_result.results, 0); + + return clusterdesc; +} + + +const rd_kafka_Node_t **rd_kafka_DescribeCluster_result_nodes( + const rd_kafka_DescribeCluster_result_t *result, + size_t *cntp) { + const rd_kafka_ClusterDescription_t *clusterdesc = + rd_kafka_DescribeCluster_result_description(result); + *cntp = clusterdesc->node_cnt; + return (const rd_kafka_Node_t **)clusterdesc->nodes; +} + +const rd_kafka_AclOperation_t * +rd_kafka_DescribeCluster_result_authorized_operations( + const rd_kafka_DescribeCluster_result_t *result, + size_t *cntp) { + const rd_kafka_ClusterDescription_t *clusterdesc = + rd_kafka_DescribeCluster_result_description(result); + *cntp = RD_MAX(clusterdesc->authorized_operations_cnt, 0); + return clusterdesc->authorized_operations; +} + +const char *rd_kafka_DescribeCluster_result_cluster_id( + const rd_kafka_DescribeCluster_result_t *result) { + return rd_kafka_DescribeCluster_result_description(result)->cluster_id; +} + +const rd_kafka_Node_t *rd_kafka_DescribeCluster_result_controller( + const rd_kafka_DescribeCluster_result_t *result) { + return rd_kafka_DescribeCluster_result_description(result)->controller; +} + +/** + * @brief Create a new ClusterDescription object. + * + * @param cluster_id current cluster_id + * @param controller_id current controller_id. + * @param md metadata struct returned by parse_metadata(). + * + * @returns newly allocated ClusterDescription object. 
+ * @remark Use rd_kafka_ClusterDescription_destroy() to free when done. + */ +static rd_kafka_ClusterDescription_t * +rd_kafka_ClusterDescription_new(const rd_kafka_metadata_internal_t *mdi) { + const rd_kafka_metadata_t *md = &mdi->metadata; + rd_kafka_ClusterDescription_t *clusterdesc = + rd_calloc(1, sizeof(*clusterdesc)); + int i; + + clusterdesc->cluster_id = rd_strdup(mdi->cluster_id); + + if (mdi->controller_id >= 0) + clusterdesc->controller = rd_kafka_Node_new_from_brokers( + mdi->controller_id, mdi->brokers_sorted, mdi->brokers, + md->broker_cnt); + + clusterdesc->authorized_operations = + rd_kafka_AuthorizedOperations_parse( + mdi->cluster_authorized_operations, + &clusterdesc->authorized_operations_cnt); + + clusterdesc->node_cnt = md->broker_cnt; + clusterdesc->nodes = + rd_calloc(clusterdesc->node_cnt, sizeof(rd_kafka_Node_t *)); + + for (i = 0; i < md->broker_cnt; i++) + clusterdesc->nodes[i] = rd_kafka_Node_new_from_brokers( + md->brokers[i].id, mdi->brokers_sorted, mdi->brokers, + md->broker_cnt); + + return clusterdesc; +} + +static void rd_kafka_ClusterDescription_destroy( + rd_kafka_ClusterDescription_t *clusterdesc) { + RD_IF_FREE(clusterdesc->cluster_id, rd_free); + RD_IF_FREE(clusterdesc->controller, rd_kafka_Node_free); + RD_IF_FREE(clusterdesc->authorized_operations, rd_free); + + if (clusterdesc->node_cnt) { + size_t i; + for (i = 0; i < clusterdesc->node_cnt; i++) + rd_kafka_Node_free(clusterdesc->nodes[i]); + rd_free(clusterdesc->nodes); + } + rd_free(clusterdesc); +} + +static void rd_kafka_ClusterDescription_free(void *ptr) { + rd_kafka_ClusterDescription_destroy(ptr); +} +/** + * @brief Send DescribeClusterRequest. Admin worker compatible callback. + */ +static rd_kafka_resp_err_t rd_kafka_admin_DescribeClusterRequest( + rd_kafka_broker_t *rkb, + const rd_list_t *ignored /* We don't use any arguments set here. */, + rd_kafka_AdminOptions_t *options, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { + rd_kafka_resp_err_t err; + int include_cluster_authorized_operations = + rd_kafka_confval_get_int(&options->include_authorized_operations); + + err = rd_kafka_admin_MetadataRequest( + rkb, NULL /* topics */, "describe cluster", + include_cluster_authorized_operations, + rd_false /* don't include_topic_authorized_operations */, + rd_false /* don't force racks */, resp_cb, replyq, opaque); + + if (err) { + rd_snprintf(errstr, errstr_size, "%s", rd_kafka_err2str(err)); + return err; + } + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} + +/** + * @brief Parse DescribeCluster and create ADMIN_RESULT op. 
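+ *
+ * An illustrative, non-normative sketch (added here for clarity, not part of
+ * the upstream patch) of the public API that drives this parser, assuming an
+ * existing `rd_kafka_t *rk` handle:
+ * @code
+ *   rd_kafka_queue_t *rkqu = rd_kafka_queue_new(rk);
+ *   rd_kafka_DescribeCluster(rk, NULL, rkqu); // NULL = default options
+ *
+ *   rd_kafka_event_t *rkev = rd_kafka_queue_poll(rkqu, 10 * 1000);
+ *   if (rkev && rd_kafka_event_type(rkev) ==
+ *                   RD_KAFKA_EVENT_DESCRIBECLUSTER_RESULT) {
+ *           const rd_kafka_DescribeCluster_result_t *res =
+ *               rd_kafka_event_DescribeCluster_result(rkev);
+ *           size_t node_cnt;
+ *           const rd_kafka_Node_t **nodes =
+ *               rd_kafka_DescribeCluster_result_nodes(res, &node_cnt);
+ *           // ... inspect cluster id, controller, nodes[0..node_cnt-1] ...
+ *   }
+ *   if (rkev)
+ *           rd_kafka_event_destroy(rkev);
+ *   rd_kafka_queue_destroy(rkqu);
+ * @endcode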
+ */
+static rd_kafka_resp_err_t
+rd_kafka_DescribeClusterResponse_parse(rd_kafka_op_t *rko_req,
+                                       rd_kafka_op_t **rko_resultp,
+                                       rd_kafka_buf_t *reply,
+                                       char *errstr,
+                                       size_t errstr_size) {
+        rd_kafka_metadata_internal_t *mdi = NULL;
+        rd_kafka_resp_err_t err;
+        rd_kafka_ClusterDescription_t *clusterdesc = NULL;
+        rd_list_t topics = rko_req->rko_u.admin_request.args;
+        rd_kafka_broker_t *rkb = reply->rkbuf_rkb;
+        rd_kafka_op_t *rko_result = NULL;
+
+        err = rd_kafka_parse_Metadata_admin(rkb, reply, &topics, &mdi);
+        if (err)
+                goto err;
+
+        rko_result = rd_kafka_admin_result_new(rko_req);
+        rd_list_init(&rko_result->rko_u.admin_result.results, 1,
+                     rd_kafka_ClusterDescription_free);
+
+        clusterdesc = rd_kafka_ClusterDescription_new(mdi);
+
+        rd_free(mdi);
+
+        rd_list_add(&rko_result->rko_u.admin_result.results, clusterdesc);
+        *rko_resultp = rko_result;
+        return RD_KAFKA_RESP_ERR_NO_ERROR;
+
+err:
+        RD_IF_FREE(rko_result, rd_kafka_op_destroy);
+        rd_snprintf(errstr, errstr_size,
+                    "DescribeCluster response protocol parse failure: %s",
+                    rd_kafka_err2str(reply->rkbuf_err));
+        return reply->rkbuf_err;
+}
+
+void rd_kafka_DescribeCluster(rd_kafka_t *rk,
+                              const rd_kafka_AdminOptions_t *options,
+                              rd_kafka_queue_t *rkqu) {
+        rd_kafka_op_t *rko;
+        static const struct rd_kafka_admin_worker_cbs cbs = {
+            rd_kafka_admin_DescribeClusterRequest,
+            rd_kafka_DescribeClusterResponse_parse};
+
+        rko = rd_kafka_admin_request_op_new(
+            rk, RD_KAFKA_OP_DESCRIBECLUSTER,
+            RD_KAFKA_EVENT_DESCRIBECLUSTER_RESULT, &cbs, options, rkqu->rkqu_q);
+
+        rd_kafka_q_enq(rk->rk_ops, rko);
+}
+
+/**@}*/
+
+/**
+ * @name ElectLeaders
+ * @{
+ *
+ *
+ *
+ *
+ */
+
+/**
+ * @brief Creates a new rd_kafka_ElectLeaders_t object with the given
+ *        \p election_type and \p partitions.
+ */
+rd_kafka_ElectLeaders_t *
+rd_kafka_ElectLeaders_new(rd_kafka_ElectionType_t election_type,
+                          rd_kafka_topic_partition_list_t *partitions) {
+
+        rd_kafka_ElectLeaders_t *elect_leaders;
+
+        elect_leaders = rd_calloc(1, sizeof(*elect_leaders));
+        if (partitions)
+                elect_leaders->partitions =
+                    rd_kafka_topic_partition_list_copy(partitions);
+        elect_leaders->election_type = election_type;
+
+        return elect_leaders;
+}
+
+rd_kafka_ElectLeaders_t *
+rd_kafka_ElectLeaders_copy(const rd_kafka_ElectLeaders_t *elect_leaders) {
+        return rd_kafka_ElectLeaders_new(elect_leaders->election_type,
+                                         elect_leaders->partitions);
+}
+
+void rd_kafka_ElectLeaders_destroy(rd_kafka_ElectLeaders_t *elect_leaders) {
+        if (elect_leaders->partitions)
+                rd_kafka_topic_partition_list_destroy(
+                    elect_leaders->partitions);
+        rd_free(elect_leaders);
+}
+
+static void rd_kafka_ElectLeaders_free(void *ptr) {
+        rd_kafka_ElectLeaders_destroy(ptr);
+}
+
+/**
+ * @brief Creates a new rd_kafka_ElectLeadersResult_t object with the given
+ *        \p partitions.
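+ *
+ * An illustrative, non-normative sketch (added here for clarity, not part of
+ * the upstream patch) of driving the public ElectLeaders API that ultimately
+ * produces this result type, assuming an existing `rd_kafka_t *rk` handle:
+ * @code
+ *   rd_kafka_topic_partition_list_t *parts =
+ *       rd_kafka_topic_partition_list_new(1);
+ *   rd_kafka_topic_partition_list_add(parts, "topic-a", 0);
+ *   rd_kafka_ElectLeaders_t *req =
+ *       rd_kafka_ElectLeaders_new(RD_KAFKA_ELECTION_TYPE_PREFERRED, parts);
+ *   rd_kafka_queue_t *rkqu = rd_kafka_queue_new(rk);
+ *
+ *   rd_kafka_ElectLeaders(rk, req, NULL, rkqu); // NULL = default options
+ *   // ... poll rkqu for RD_KAFKA_EVENT_ELECTLEADERS_RESULT and read the
+ *   // per-partition outcomes with rd_kafka_ElectLeaders_result_partitions()
+ *
+ *   rd_kafka_ElectLeaders_destroy(req); // the request is copied internally
+ *   rd_kafka_topic_partition_list_destroy(parts);
+ *   rd_kafka_queue_destroy(rkqu);
+ * @endcode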
+ */ +static rd_kafka_ElectLeadersResult_t * +rd_kafka_ElectLeadersResult_new(rd_list_t *partitions) { + + rd_kafka_ElectLeadersResult_t *result; + result = rd_calloc(1, sizeof(*result)); + rd_list_init_copy(&result->partitions, partitions); + rd_list_copy_to(&result->partitions, partitions, + rd_kafka_topic_partition_result_copy_opaque, NULL); + return result; +} + +static const rd_kafka_topic_partition_result_t ** +rd_kafka_ElectLeadersResult_partitions( + const rd_kafka_ElectLeadersResult_t *result, + size_t *cntp) { + *cntp = rd_list_cnt(&result->partitions); + return (const rd_kafka_topic_partition_result_t **) + result->partitions.rl_elems; +} + +static void +rd_kafka_ElectLeadersResult_destroy(rd_kafka_ElectLeadersResult_t *result) { + rd_list_destroy(&result->partitions); + rd_free(result); +} + +static void rd_kafka_ElectLeadersResult_free(void *ptr) { + rd_kafka_ElectLeadersResult_destroy(ptr); +} + +static const rd_kafka_ElectLeadersResult_t *rd_kafka_ElectLeaders_result_result( + const rd_kafka_ElectLeaders_result_t *result) { + return (const rd_kafka_ElectLeadersResult_t *)rd_list_elem( + &result->rko_u.admin_result.results, 0); +} + +const rd_kafka_topic_partition_result_t ** +rd_kafka_ElectLeaders_result_partitions( + const rd_kafka_ElectLeaders_result_t *result, + size_t *cntp) { + return rd_kafka_ElectLeadersResult_partitions( + rd_kafka_ElectLeaders_result_result(result), cntp); +} + +/** + * @brief Parse ElectLeadersResponse and create ADMIN_RESULT op. + */ +static rd_kafka_resp_err_t +rd_kafka_ElectLeadersResponse_parse(rd_kafka_op_t *rko_req, + rd_kafka_op_t **rko_resultp, + rd_kafka_buf_t *reply, + char *errstr, + size_t errstr_size) { + const int log_decode_errors = LOG_ERR; + rd_kafka_op_t *rko_result = NULL; + rd_kafka_ElectLeadersResult_t *result = NULL; + int16_t top_level_error_code = 0; + int32_t TopicArrayCnt; + int partition_cnt; + rd_list_t partitions_arr; + rd_kafka_ElectLeaders_t *request = + rko_req->rko_u.admin_request.args.rl_elems[0]; + int i; + int j; + + rd_kafka_buf_read_throttle_time(reply); + + if (rd_kafka_buf_ApiVersion(reply) >= 1) { + rd_kafka_buf_read_i16(reply, &top_level_error_code); + } + + if (top_level_error_code) { + rd_kafka_admin_result_fail( + rko_req, top_level_error_code, + "ElectLeaders request failed: %s", + rd_kafka_err2str(top_level_error_code)); + return top_level_error_code; + } + + /* #partitions */ + rd_kafka_buf_read_arraycnt(reply, &TopicArrayCnt, RD_KAFKAP_TOPICS_MAX); + + if (request->partitions) + partition_cnt = request->partitions->cnt; + else + partition_cnt = 1; + rd_list_init(&partitions_arr, partition_cnt, + rd_kafka_topic_partition_result_free); + memset(partitions_arr.rl_elems, 0, + sizeof(*partitions_arr.rl_elems) * partition_cnt); + + for (i = 0; i < TopicArrayCnt; i++) { + rd_kafka_topic_partition_result_t *partition_result; + rd_kafkap_str_t ktopic; + char *topic; + int32_t PartArrayCnt; + + rd_kafka_buf_read_str(reply, &ktopic); + RD_KAFKAP_STR_DUPA(&topic, &ktopic); + + rd_kafka_buf_read_arraycnt(reply, &PartArrayCnt, + RD_KAFKAP_PARTITIONS_MAX); + + for (j = 0; j < PartArrayCnt; j++) { + int32_t partition; + int16_t partition_error_code; + rd_kafkap_str_t partition_error_msg; + char *partition_errstr; + int orig_pos; + + rd_kafka_buf_read_i32(reply, &partition); + rd_kafka_buf_read_i16(reply, &partition_error_code); + rd_kafka_buf_read_str(reply, &partition_error_msg); + + rd_kafka_buf_skip_tags(reply); + + if (RD_KAFKAP_STR_IS_NULL(&partition_error_msg) || + RD_KAFKAP_STR_LEN(&partition_error_msg) == 0) 
+ partition_errstr = (char *)rd_kafka_err2str( + partition_error_code); + else + RD_KAFKAP_STR_DUPA(&partition_errstr, + &partition_error_msg); + + partition_result = rd_kafka_topic_partition_result_new( + topic, partition, partition_error_code, + partition_errstr); + + if (request->partitions) { + orig_pos = + rd_kafka_topic_partition_list_find_idx( + request->partitions, topic, partition); + + if (orig_pos == -1) { + rd_kafka_buf_parse_fail( + reply, + "Broker returned partition %s " + "[%" PRId32 + "] that was not " + "included in the original request", + topic, partition); + } + + if (rd_list_elem(&partitions_arr, orig_pos) != + NULL) { + rd_kafka_buf_parse_fail( + reply, + "Broker returned partition %s " + "[%" PRId32 "] multiple times", + topic, partition); + } + + rd_list_set(&partitions_arr, orig_pos, + partition_result); + } else { + rd_list_add(&partitions_arr, partition_result); + } + } + rd_kafka_buf_skip_tags(reply); + } + + rd_kafka_buf_skip_tags(reply); + + result = rd_kafka_ElectLeadersResult_new(&partitions_arr); + + rko_result = rd_kafka_admin_result_new(rko_req); + + rd_list_init(&rko_result->rko_u.admin_result.results, 1, + rd_kafka_ElectLeadersResult_free); + + rd_list_add(&rko_result->rko_u.admin_result.results, result); + + *rko_resultp = rko_result; + + rd_list_destroy(&partitions_arr); + + return RD_KAFKA_RESP_ERR_NO_ERROR; +err_parse: + + rd_list_destroy(&partitions_arr); + + if (rko_result) + rd_kafka_op_destroy(rko_result); + + rd_snprintf(errstr, errstr_size, + "ElectLeaders response protocol parse failure: %s", + rd_kafka_err2str(reply->rkbuf_err)); + + return reply->rkbuf_err; +} + +void rd_kafka_ElectLeaders(rd_kafka_t *rk, + rd_kafka_ElectLeaders_t *elect_leaders, + const rd_kafka_AdminOptions_t *options, + rd_kafka_queue_t *rkqu) { + rd_kafka_op_t *rko; + rd_kafka_topic_partition_list_t *copied_partitions = NULL; + + static const struct rd_kafka_admin_worker_cbs cbs = { + rd_kafka_ElectLeadersRequest, + rd_kafka_ElectLeadersResponse_parse, + }; + + rd_assert(rkqu); + + rko = rd_kafka_admin_request_op_new(rk, RD_KAFKA_OP_ELECTLEADERS, + RD_KAFKA_EVENT_ELECTLEADERS_RESULT, + &cbs, options, rkqu->rkqu_q); + + if (elect_leaders->partitions) { + /* Duplicate topic partitions should not be present in the list + */ + copied_partitions = rd_kafka_topic_partition_list_copy( + elect_leaders->partitions); + if (rd_kafka_topic_partition_list_has_duplicates( + copied_partitions, rd_false /* check partition*/)) { + rd_kafka_admin_result_fail( + rko, RD_KAFKA_RESP_ERR__INVALID_ARG, + "Duplicate partitions specified"); + rd_kafka_admin_common_worker_destroy( + rk, rko, rd_true /*destroy*/); + rd_kafka_topic_partition_list_destroy( + copied_partitions); + return; + } + } + + rd_list_init(&rko->rko_u.admin_request.args, 1, + rd_kafka_ElectLeaders_free); + + rd_list_add(&rko->rko_u.admin_request.args, + rd_kafka_ElectLeaders_copy(elect_leaders)); + + rd_kafka_q_enq(rk->rk_ops, rko); + if (copied_partitions) + rd_kafka_topic_partition_list_destroy(copied_partitions); +} + +/**@}*/ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_admin.h b/src/third_party/librdkafka/dist/src/rdkafka_admin.h index 62fe9e87a38..c84849ea666 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_admin.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_admin.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without
@@ -31,10 +32,23 @@
 #include "rdstring.h"
+#include "rdmap.h"
 #include "rdkafka_error.h"
 #include "rdkafka_confval.h"
+#if WITH_SSL && OPENSSL_VERSION_NUMBER >= 0x10101000L
+#include <openssl/evp.h>
+#endif
+#if WITH_SSL
+typedef struct rd_kafka_broker_s rd_kafka_broker_t;
+extern int rd_kafka_ssl_hmac(rd_kafka_broker_t *rkb,
+                             const EVP_MD *evp,
+                             const rd_chariov_t *in,
+                             const rd_chariov_t *salt,
+                             int itcnt,
+                             rd_chariov_t *out);
+#endif
 /**
  * @brief Common AdminOptions type used for all admin APIs.
@@ -69,15 +83,9 @@ struct rd_kafka_AdminOptions_s {
                                     *   CreateTopics
                                     *   CreatePartitions
                                     *   AlterConfigs
+                                    *   IncrementalAlterConfigs
                                     */
-        rd_kafka_confval_t incremental; /**< BOOL: Incremental rather than
-                                         *   absolute application
-                                         *   of config.
-                                         *   Valid for:
-                                         *   AlterConfigs
-                                         */
-
         rd_kafka_confval_t broker; /**< INT: Explicitly override
                                     *   broker id to send
                                     *   requests to.
@@ -91,6 +99,14 @@
                                    *   Valid for:
                                    *   ListConsumerGroupOffsets
                                    */
+        rd_kafka_confval_t
+            include_authorized_operations; /**< BOOL: Whether broker should
+                                            *   return authorized operations.
+                                            *   Valid for:
+                                            *   DescribeConsumerGroups
+                                            *   DescribeCluster
+                                            *   DescribeTopics
+                                            */
         rd_kafka_confval_t
             match_consumer_group_states; /**< PTR: list of consumer group states
@@ -98,6 +114,19 @@
                                          *   to query for.
                                          *   Valid for: ListConsumerGroups.
                                          */
+        rd_kafka_confval_t
+            match_consumer_group_types; /**< PTR: list of consumer group types
+                                         *   to query for.
+                                         *   Valid for: ListConsumerGroups.
+                                         */
+
+        rd_kafka_confval_t
+            isolation_level; /**< INT: Isolation level to use for
+                              *   ListOffsets queries.
+                              *   Defaults to
+                              *   RD_KAFKA_ISOLATION_LEVEL_READ_UNCOMMITTED.
+                              */
+
         rd_kafka_confval_t opaque; /**< PTR: Application opaque.
                                     *   Valid for all. */
 };
@@ -188,13 +217,6 @@
  * @{
  */
-/* KIP-248 */
-typedef enum rd_kafka_AlterOperation_t {
-        RD_KAFKA_ALTER_OP_ADD    = 0,
-        RD_KAFKA_ALTER_OP_SET    = 1,
-        RD_KAFKA_ALTER_OP_DELETE = 2,
-} rd_kafka_AlterOperation_t;
-
 struct rd_kafka_ConfigEntry_s {
         rd_strtup_t *kv; /**< Name/Value pair */
@@ -202,8 +224,9 @@ struct rd_kafka_ConfigEntry_s {
         /* Attributes: this is a struct for easy copying */
         struct {
-                rd_kafka_AlterOperation_t operation; /**< Operation */
-                rd_kafka_ConfigSource_t source;      /**< Config source */
+                /** Operation type, used for IncrementalAlterConfigs */
+                rd_kafka_AlterConfigOpType_t op_type;
+                rd_kafka_ConfigSource_t source; /**< Config source */
                 rd_bool_t is_readonly; /**< Value is read-only (on broker) */
                 rd_bool_t is_default;  /**< Value is at its default */
                 rd_bool_t is_sensitive; /**< Value is sensitive */
@@ -250,12 +273,48 @@ struct rd_kafka_AlterConfigs_result_s {
         rd_list_t resources; /**< Type (rd_kafka_ConfigResource_t *) */
 };
+struct rd_kafka_IncrementalAlterConfigs_result_s {
+        rd_list_t resources; /**< Type (rd_kafka_ConfigResource_t *) */
+};
+
 struct rd_kafka_ConfigResource_result_s {
         rd_list_t resources; /**< Type (struct rd_kafka_ConfigResource *):
                               *   List of config resources, sans config
                               *   but with response error values. */
 };
+/**
+ * @brief Resource type specific to the Config APIs.
+ */
+typedef enum rd_kafka_ConfigResourceType_t {
+        RD_KAFKA_CONFIG_RESOURCE_UNKNOWN = 0,
+        RD_KAFKA_CONFIG_RESOURCE_TOPIC   = 2,
+        RD_KAFKA_CONFIG_RESOURCE_BROKER  = 4,
+        RD_KAFKA_CONFIG_RESOURCE_GROUP   = 32,
+} rd_kafka_ConfigResourceType_t;
+
+/**
+ * @brief Maps `rd_kafka_ResourceType_t` to `rd_kafka_ConfigResourceType_t`
+ *        for the Config APIs.
+ *        We are incorrectly using `rd_kafka_ResourceType_t` in both the
+ *        Config APIs and the ACL APIs, so we need this function to map the
+ *        resource type internally to `rd_kafka_ConfigResourceType_t`.
+ *        For example, the enum value for `GROUP` is 32 in the Config APIs,
+ *        but it is 3 in the ACL APIs.
+ */
+rd_kafka_ConfigResourceType_t
+rd_kafka_ResourceType_to_ConfigResourceType(rd_kafka_ResourceType_t restype);
+
+/**
+ * @brief Maps `rd_kafka_ConfigResourceType_t` to `rd_kafka_ResourceType_t`
+ *        for the Config APIs. We are incorrectly using
+ *        `rd_kafka_ResourceType_t` in both the Config APIs and the ACL APIs,
+ *        so we need this function to map the `rd_kafka_ConfigResourceType_t`
+ *        internally to `rd_kafka_ResourceType_t`. For example, the enum
+ *        value for `GROUP` is 32 in the Config APIs, but it is 3 in the
+ *        ACL APIs.
+ */
+rd_kafka_ResourceType_t rd_kafka_ConfigResourceType_to_ResourceType(
+    rd_kafka_ConfigResourceType_t config_resource_type);
+
+
 /**@}*/
@@ -298,6 +357,47 @@ struct rd_kafka_DeleteRecords_s {
 /**@}*/
+/**
+ * @name ListConsumerGroupOffsets
+ * @{
+ */
+
+/**
+ * @brief ListConsumerGroupOffsets result
+ */
+struct rd_kafka_ListConsumerGroupOffsets_result_s {
+        rd_list_t groups; /**< Type (rd_kafka_group_result_t *) */
+};
+
+struct rd_kafka_ListConsumerGroupOffsets_s {
+        char *group_id; /**< Points to data */
+        rd_kafka_topic_partition_list_t *partitions;
+        char data[1]; /**< The group id is allocated along with
+                       *   the struct here. */
+};
+
+/**@}*/
+
+/**
+ * @name AlterConsumerGroupOffsets
+ * @{
+ */
+
+/**
+ * @brief AlterConsumerGroupOffsets result
+ */
+struct rd_kafka_AlterConsumerGroupOffsets_result_s {
+        rd_list_t groups; /**< Type (rd_kafka_group_result_t *) */
+};
+
+struct rd_kafka_AlterConsumerGroupOffsets_s {
+        char *group_id; /**< Points to data */
+        rd_kafka_topic_partition_list_t *partitions;
+        char data[1]; /**< The group id is allocated along with
+                       *   the struct here. */
+};
+
+/**@}*/
 /**
  * @name DeleteConsumerGroupOffsets
@@ -320,6 +420,24 @@ struct rd_kafka_DeleteConsumerGroupOffsets_s {
 /**@}*/
+/**
+ * @name ListOffsets
+ * @{
+ */
+
+/**
+ * @struct ListOffsets result about a single partition
+ */
+struct rd_kafka_ListOffsetsResultInfo_s {
+        rd_kafka_topic_partition_t *topic_partition;
+        int64_t timestamp;
+};
+
+rd_kafka_ListOffsetsResultInfo_t *
+rd_kafka_ListOffsetsResultInfo_new(rd_kafka_topic_partition_t *rktpar,
+                                   rd_ts_t timestamp);
+/**@}*/
+
 /**
  * @name CreateAcls
  * @{
@@ -357,50 +475,6 @@ struct rd_kafka_DeleteAcls_result_response_s {
 /**@}*/
-
-
-/**
- * @name AlterConsumerGroupOffsets
- * @{
- */
-
-/**
- * @brief AlterConsumerGroupOffsets result
- */
-struct rd_kafka_AlterConsumerGroupOffsets_result_s {
-        rd_list_t groups; /**< Type (rd_kafka_group_result_t *) */
-};
-
-struct rd_kafka_AlterConsumerGroupOffsets_s {
-        char *group_id; /**< Points to data */
-        rd_kafka_topic_partition_list_t *partitions;
-        char data[1]; /**< The group id is allocated along with
-                       *   the struct here. */
-};
-
-/**@}*/
-
-
-/**
- * @name ListConsumerGroupOffsets
- * @{
- */
-
-/**
- * @brief ListConsumerGroupOffsets result
- */
-struct rd_kafka_ListConsumerGroupOffsets_result_s {
-        rd_list_t groups; /**< Type (rd_kafka_group_result_t *) */
-};
-
-struct rd_kafka_ListConsumerGroupOffsets_s {
-        char *group_id; /**< Points to data */
-        rd_kafka_topic_partition_list_t *partitions;
-        char data[1]; /**< The group id is allocated along with
-                       *   the struct here.
*/
-};
-
-/**@}*/
-
 /**
  * @name ListConsumerGroups
  * @{
  */
@@ -414,6 +488,7 @@ struct rd_kafka_ConsumerGroupListing_s {
         /** Is it a simple consumer group? That means empty protocol_type. */
         rd_bool_t is_simple_consumer_group;
         rd_kafka_consumer_group_state_t state; /**< Consumer group state. */
+        rd_kafka_consumer_group_type_t type;   /**< Consumer group type. */
 };
@@ -452,6 +527,9 @@ struct rd_kafka_MemberDescription_s {
         char *group_instance_id;                /**< Group instance id */
         char *host;                             /**< Group member host */
         rd_kafka_MemberAssignment_t assignment; /**< Member assignment */
+        rd_kafka_MemberAssignment_t
+            *target_assignment; /**< Target assignment. `NULL` for `classic`
+                                     protocol */
 };
 /**
@@ -471,12 +549,113 @@ struct rd_kafka_ConsumerGroupDescription_s {
         char *partition_assignor;
         /** Consumer group state. */
         rd_kafka_consumer_group_state_t state;
+        /** Consumer group type. */
+        rd_kafka_consumer_group_type_t type;
         /** Consumer group coordinator. */
         rd_kafka_Node_t *coordinator;
+        /** Count of operations allowed for the group. -1 indicates
+         *  operations not requested. */
+        int authorized_operations_cnt;
+        /** Operations allowed for the group. May be NULL if operations were
+         *  not requested */
+        rd_kafka_AclOperation_t *authorized_operations;
         /** Group specific error. */
         rd_kafka_error_t *error;
 };
 /**@}*/
+/**
+ * @name DescribeTopics
+ * @{
+ */
+
+/**
+ * @brief TopicCollection contains a list of topics.
+ *
+ */
+struct rd_kafka_TopicCollection_s {
+        char **topics;     /**< List of topic names. */
+        size_t topics_cnt; /**< Count of topic names. */
+};
+
+/**
+ * @brief TopicPartition result type in DescribeTopics result.
+ *
+ */
+struct rd_kafka_TopicPartitionInfo_s {
+        int partition;              /**< Partition id. */
+        rd_kafka_Node_t *leader;    /**< Leader of the partition. */
+        size_t isr_cnt;             /**< Count of insync replicas. */
+        rd_kafka_Node_t **isr;      /**< List of in sync replica nodes. */
+        size_t replica_cnt;         /**< Count of partition replicas. */
+        rd_kafka_Node_t **replicas; /**< List of replica nodes. */
+};
+
+/**
+ * @struct DescribeTopics result
+ */
+struct rd_kafka_TopicDescription_s {
+        char *topic;              /**< Topic name */
+        rd_kafka_Uuid_t topic_id; /**< Topic Id */
+        int partition_cnt;        /**< Number of partitions in \p partitions */
+        rd_bool_t is_internal;    /**< Is the topic internal to Kafka? */
+        rd_kafka_TopicPartitionInfo_t **partitions; /**< Partitions */
+        rd_kafka_error_t *error; /**< Topic error reported by broker */
+        int authorized_operations_cnt; /**< Count of operations allowed for
+                                        *   topic. -1 indicates operations not
+                                        *   requested. */
+        rd_kafka_AclOperation_t
+            *authorized_operations; /**< Operations allowed for topic. May be
+                                     *   NULL if operations were not
+                                     *   requested */
+};
+
+/**@}*/
+
+/**
+ * @name DescribeCluster
+ * @{
+ */
+/**
+ * @struct DescribeCluster result - internal type.
+ */
+typedef struct rd_kafka_ClusterDescription_s {
+        char *cluster_id;            /**< Cluster id */
+        rd_kafka_Node_t *controller; /**< Current controller. */
+        size_t node_cnt;             /**< Count of brokers in the cluster. */
+        rd_kafka_Node_t **nodes;     /**< Brokers in the cluster. */
+        int authorized_operations_cnt; /**< Count of operations allowed for
+                                        *   cluster. -1 indicates operations
+                                        *   not requested. */
+        rd_kafka_AclOperation_t
+            *authorized_operations; /**< Operations allowed for cluster.
May be + * NULL if operations were not requested */ + +} rd_kafka_ClusterDescription_t; + +/**@}*/ + +/** + * @name ElectLeaders + * @{ + */ + +/** + * @struct ElectLeaders request object + */ +struct rd_kafka_ElectLeaders_s { + rd_kafka_ElectionType_t election_type; /*Election Type*/ + rd_kafka_topic_partition_list_t + *partitions; /*TopicPartitions for election*/ +}; + +/** + * @struct ElectLeaders result object + */ +typedef struct rd_kafka_ElectLeadersResult_s { + rd_list_t partitions; /**< Type (rd_kafka_topic_partition_result_t *) */ +} rd_kafka_ElectLeadersResult_t; + +/**@}*/ + #endif /* _RDKAFKA_ADMIN_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_assignment.c b/src/third_party/librdkafka/dist/src/rdkafka_assignment.c index 85c275aad35..6d1f01913f9 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_assignment.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_assignment.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2020 Magnus Edenhill + * Copyright (c) 2020-2022, Magnus Edenhill + * 2023 Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -135,7 +136,9 @@ rd_kafka_assignment_apply_offsets(rd_kafka_t *rk, rd_kafka_topic_partition_t *rktpar; RD_KAFKA_TPLIST_FOREACH(rktpar, offsets) { - rd_kafka_toppar_t *rktp = rktpar->_private; /* May be NULL */ + /* May be NULL, borrow ref. */ + rd_kafka_toppar_t *rktp = + rd_kafka_topic_partition_toppar(rk, rktpar); if (!rd_kafka_topic_partition_list_del( rk->rk_consumer.assignment.queried, rktpar->topic, @@ -150,8 +153,30 @@ rd_kafka_assignment_apply_offsets(rd_kafka_t *rk, continue; } - if (err == RD_KAFKA_RESP_ERR_UNSTABLE_OFFSET_COMMIT || - rktpar->err == RD_KAFKA_RESP_ERR_UNSTABLE_OFFSET_COMMIT) { + if (err == RD_KAFKA_RESP_ERR_STALE_MEMBER_EPOCH || + rktpar->err == RD_KAFKA_RESP_ERR_STALE_MEMBER_EPOCH) { + rd_kafka_topic_partition_t *rktpar_copy; + + rd_kafka_dbg(rk, CGRP, "OFFSETFETCH", + "Adding %s [%" PRId32 + "] back to pending " + "list because of stale member epoch", + rktpar->topic, rktpar->partition); + + rktpar_copy = rd_kafka_topic_partition_list_add_copy( + rk->rk_consumer.assignment.pending, rktpar); + /* Need to reset offset to STORED to query for + * the committed offset again. If the offset is + * kept INVALID then auto.offset.reset will be + * triggered. + * + * Not necessary if err is UNSTABLE_OFFSET_COMMIT + * because the buffer is retried there. */ + rktpar_copy->offset = RD_KAFKA_OFFSET_STORED; + + } else if (err == RD_KAFKA_RESP_ERR_UNSTABLE_OFFSET_COMMIT || + rktpar->err == + RD_KAFKA_RESP_ERR_UNSTABLE_OFFSET_COMMIT) { /* Ongoing transactions are blocking offset retrieval. * This is typically retried from the OffsetFetch * handler but we can come here if the assignment @@ -207,7 +232,9 @@ rd_kafka_assignment_apply_offsets(rd_kafka_t *rk, /* Do nothing for request-level errors (err is set). */ } - if (offsets->cnt > 0) + /* In case of stale member epoch we retry to serve the + * assignment only after a successful ConsumerGroupHeartbeat. 
*/ + if (offsets->cnt > 0 && err != RD_KAFKA_RESP_ERR_STALE_MEMBER_EPOCH) rd_kafka_assignment_serve(rk); } @@ -271,18 +298,32 @@ static void rd_kafka_assignment_handle_OffsetFetch(rd_kafka_t *rk, return; } - - if (err) { - rd_kafka_dbg(rk, CGRP, "OFFSET", - "Offset fetch error for %d partition(s): %s", - offsets->cnt, rd_kafka_err2str(err)); - rd_kafka_consumer_err( - rk->rk_consumer.q, rd_kafka_broker_id(rkb), err, 0, NULL, - NULL, RD_KAFKA_OFFSET_INVALID, - "Failed to fetch committed offsets for " - "%d partition(s) in group \"%s\": %s", - offsets->cnt, rk->rk_group_id->str, rd_kafka_err2str(err)); + switch (err) { + case RD_KAFKA_RESP_ERR_STALE_MEMBER_EPOCH: + rk->rk_cgrp->rkcg_consumer_flags |= + RD_KAFKA_CGRP_CONSUMER_F_SERVE_PENDING; + rd_kafka_cgrp_consumer_expedite_next_heartbeat( + rk->rk_cgrp, + "OffsetFetch error: Stale member epoch"); + break; + case RD_KAFKA_RESP_ERR_UNKNOWN_MEMBER_ID: + rd_kafka_cgrp_consumer_expedite_next_heartbeat( + rk->rk_cgrp, "OffsetFetch error: Unknown member"); + break; + default: + rd_kafka_dbg( + rk, CGRP, "OFFSET", + "Offset fetch error for %d partition(s): %s", + offsets->cnt, rd_kafka_err2str(err)); + rd_kafka_consumer_err( + rk->rk_consumer.q, rd_kafka_broker_id(rkb), err, 0, + NULL, NULL, RD_KAFKA_OFFSET_INVALID, + "Failed to fetch committed offsets for " + "%d partition(s) in group \"%s\": %s", + offsets->cnt, rk->rk_group_id->str, + rd_kafka_err2str(err)); + } } /* Apply the fetched offsets to the assignment */ @@ -302,7 +343,9 @@ static int rd_kafka_assignment_serve_removals(rd_kafka_t *rk) { int valid_offsets = 0; RD_KAFKA_TPLIST_FOREACH(rktpar, rk->rk_consumer.assignment.removed) { - rd_kafka_toppar_t *rktp = rktpar->_private; /* Borrow ref */ + rd_kafka_toppar_t *rktp = + rd_kafka_topic_partition_ensure_toppar( + rk, rktpar, rd_true); /* Borrow ref */ int was_pending, was_queried; /* Remove partition from pending and querying lists, @@ -333,17 +376,21 @@ static int rd_kafka_assignment_serve_removals(rd_kafka_t *rk) { rd_kafka_toppar_lock(rktp); - /* Save the currently stored offset on .removed + /* Save the currently stored offset and epoch on .removed * so it will be committed below. */ - rktpar->offset = rktp->rktp_stored_offset; + rd_kafka_topic_partition_set_from_fetch_pos( + rktpar, rktp->rktp_stored_pos); + rd_kafka_topic_partition_set_metadata_from_rktp_stored(rktpar, + rktp); valid_offsets += !RD_KAFKA_OFFSET_IS_LOGICAL(rktpar->offset); /* Reset the stored offset to invalid so that * a manual offset-less commit() or the auto-committer * will not commit a stored offset from a previous * assignment (issue #2782). 
*/ - rd_kafka_offset_store0(rktp, RD_KAFKA_OFFSET_INVALID, rd_true, - RD_DONT_LOCK); + rd_kafka_offset_store0( + rktp, RD_KAFKA_FETCH_POS(RD_KAFKA_OFFSET_INVALID, -1), NULL, + 0, rd_true, RD_DONT_LOCK); /* Partition is no longer desired */ rd_kafka_toppar_desired_del(rktp); @@ -422,7 +469,9 @@ static int rd_kafka_assignment_serve_pending(rd_kafka_t *rk) { for (i = rk->rk_consumer.assignment.pending->cnt - 1; i >= 0; i--) { rd_kafka_topic_partition_t *rktpar = &rk->rk_consumer.assignment.pending->elems[i]; - rd_kafka_toppar_t *rktp = rktpar->_private; /* Borrow ref */ + /* Borrow ref */ + rd_kafka_toppar_t *rktp = + rd_kafka_topic_partition_ensure_toppar(rk, rktpar, rd_true); rd_assert(!rktp->rktp_started); @@ -443,9 +492,11 @@ static int rd_kafka_assignment_serve_pending(rd_kafka_t *rk) { rd_kafka_dbg(rk, CGRP, "SRVPEND", "Starting pending assigned partition " - "%s [%" PRId32 "] at offset %s", + "%s [%" PRId32 "] at %s", rktpar->topic, rktpar->partition, - rd_kafka_offset2str(rktpar->offset)); + rd_kafka_fetch_pos2str( + rd_kafka_topic_partition_get_fetch_pos( + rktpar))); /* Reset the (lib) pause flag which may have been set by * the cgrp when scheduling the rebalance callback. */ @@ -457,9 +508,10 @@ static int rd_kafka_assignment_serve_pending(rd_kafka_t *rk) { rktp->rktp_started = rd_true; rk->rk_consumer.assignment.started_cnt++; - rd_kafka_toppar_op_fetch_start(rktp, rktpar->offset, - rk->rk_consumer.q, - RD_KAFKA_NO_REPLYQ); + rd_kafka_toppar_op_fetch_start( + rktp, + rd_kafka_topic_partition_get_fetch_pos(rktpar), + rk->rk_consumer.q, RD_KAFKA_NO_REPLYQ); } else if (can_query_offsets) { @@ -529,7 +581,8 @@ static int rd_kafka_assignment_serve_pending(rd_kafka_t *rk) { partitions_to_query->cnt); rd_kafka_OffsetFetchRequest( - coord, rk->rk_group_id->str, partitions_to_query, + coord, rk->rk_group_id->str, partitions_to_query, rd_false, + -1, NULL, rk->rk_conf.isolation_level == RD_KAFKA_READ_COMMITTED /*require_stable_offsets*/, 0, /* Timeout */ @@ -733,8 +786,9 @@ rd_kafka_assignment_add(rd_kafka_t *rk, /* Reset the stored offset to INVALID to avoid the race * condition described in rdkafka_offset.h */ - rd_kafka_offset_store0(rktp, RD_KAFKA_OFFSET_INVALID, - rd_true /* force */, RD_DONT_LOCK); + rd_kafka_offset_store0( + rktp, RD_KAFKA_FETCH_POS(RD_KAFKA_OFFSET_INVALID, -1), NULL, + 0, rd_true /* force */, RD_DONT_LOCK); rd_kafka_toppar_unlock(rktp); } diff --git a/src/third_party/librdkafka/dist/src/rdkafka_assignment.h b/src/third_party/librdkafka/dist/src/rdkafka_assignment.h index fa51bb10c30..1f73c4ede8b 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_assignment.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_assignment.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2020 Magnus Edenhill + * Copyright (c) 2020-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_assignor.c b/src/third_party/librdkafka/dist/src/rdkafka_assignor.c index dfd1c775f3e..465568c41da 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_assignor.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_assignor.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2015 Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill + * 2023 Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -59,6 +60,9 @@ void rd_kafka_group_member_clear(rd_kafka_group_member_t *rkgm) { if (rkgm->rkgm_member_metadata) rd_kafkap_bytes_destroy(rkgm->rkgm_member_metadata); + if (rkgm->rkgm_rack_id) + rd_kafkap_str_destroy(rkgm->rkgm_rack_id); + memset(rkgm, 0, sizeof(*rkgm)); } @@ -106,7 +110,9 @@ rd_kafkap_bytes_t *rd_kafka_consumer_protocol_member_metadata_new( const rd_list_t *topics, const void *userdata, size_t userdata_size, - const rd_kafka_topic_partition_list_t *owned_partitions) { + const rd_kafka_topic_partition_list_t *owned_partitions, + int generation, + const rd_kafkap_str_t *rack_id) { rd_kafka_buf_t *rkbuf; rd_kafkap_bytes_t *kbytes; @@ -124,12 +130,14 @@ rd_kafkap_bytes_t *rd_kafka_consumer_protocol_member_metadata_new( * OwnedPartitions => [Topic Partitions] // added in v1 * Topic => string * Partitions => [int32] + * GenerationId => int32 // added in v2 + * RackId => string // added in v3 */ rkbuf = rd_kafka_buf_new(1, 100 + (topic_cnt * 100) + userdata_size); /* Version */ - rd_kafka_buf_write_i16(rkbuf, 1); + rd_kafka_buf_write_i16(rkbuf, 3); rd_kafka_buf_write_i32(rkbuf, topic_cnt); RD_LIST_FOREACH(tinfo, topics, i) rd_kafka_buf_write_str(rkbuf, tinfo->topic, -1); @@ -144,13 +152,22 @@ rd_kafkap_bytes_t *rd_kafka_consumer_protocol_member_metadata_new( /* If there are no owned partitions, this is specified as an * empty array, not NULL. */ rd_kafka_buf_write_i32(rkbuf, 0); /* Topic count */ - else + else { + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; rd_kafka_buf_write_topic_partitions( rkbuf, owned_partitions, rd_false /*don't skip invalid offsets*/, - rd_false /*any offset*/, rd_false /*don't write offsets*/, - rd_false /*don't write epoch*/, - rd_false /*don't write metadata*/); + rd_false /*any offset*/, rd_false /*don't use topic id*/, + rd_true /*use topic name*/, fields); + } + + /* Following data is ignored by consumer version < 2 */ + rd_kafka_buf_write_i32(rkbuf, generation); + + /* Following data is ignored by consumer version < 3 */ + rd_kafka_buf_write_kstr(rkbuf, rack_id); /* Get binary buffer and allocate a new Kafka Bytes with a copy. */ rd_slice_init_full(&rkbuf->rkbuf_reader, &rkbuf->rkbuf_buf); @@ -168,9 +185,13 @@ rd_kafkap_bytes_t *rd_kafka_assignor_get_metadata_with_empty_userdata( const rd_kafka_assignor_t *rkas, void *assignor_state, const rd_list_t *topics, - const rd_kafka_topic_partition_list_t *owned_partitions) { - return rd_kafka_consumer_protocol_member_metadata_new(topics, NULL, 0, - owned_partitions); + const rd_kafka_topic_partition_list_t *owned_partitions, + const rd_kafkap_str_t *rack_id) { + /* Generation was earlier populated inside userData, and older versions + * of clients still expect that. 
So, in case the userData is empty, we
+ * set the explicit generation field to the default value, -1. */
+        return rd_kafka_consumer_protocol_member_metadata_new(
+            topics, NULL, 0, owned_partitions, -1 /* generation */, rack_id);
 }
@@ -242,6 +263,8 @@ rd_kafka_member_subscriptions_map(rd_kafka_cgrp_t *rkcg,
                                   int member_cnt) {
         int ti;
         rd_kafka_assignor_topic_t *eligible_topic = NULL;
+        rd_kafka_metadata_internal_t *mdi =
+            rd_kafka_metadata_get_internal(metadata);
         rd_list_init(eligible_topics, RD_MIN(metadata->topic_cnt, 10),
                      (void *)rd_kafka_assignor_topic_destroy);
@@ -283,7 +306,8 @@
                         continue;
                 }
-                eligible_topic->metadata = &metadata->topics[ti];
+                eligible_topic->metadata          = &metadata->topics[ti];
+                eligible_topic->metadata_internal = &mdi->topics[ti];
                 rd_list_add(eligible_topics, eligible_topic);
                 eligible_topic = NULL;
         }
@@ -483,7 +507,8 @@ rd_kafka_resp_err_t rd_kafka_assignor_add(
         const struct rd_kafka_assignor_s *rkas,
         void *assignor_state,
         const rd_list_t *topics,
-        const rd_kafka_topic_partition_list_t *owned_partitions),
+        const rd_kafka_topic_partition_list_t *owned_partitions,
+        const rd_kafkap_str_t *rack_id),
     void (*on_assignment_cb)(const struct rd_kafka_assignor_s *rkas,
                              void **assignor_state,
                              const rd_kafka_topic_partition_list_t *assignment,
@@ -634,6 +659,676 @@ void rd_kafka_assignors_term(rd_kafka_t *rk) {
         rd_list_destroy(&rk->rk_conf.partition_assignors);
 }
+/**
+ * @brief Computes whether rack-aware assignment needs to be used, or not.
+ */
+rd_bool_t
+rd_kafka_use_rack_aware_assignment(rd_kafka_assignor_topic_t **topics,
+                                   size_t topic_cnt,
+                                   const rd_kafka_metadata_internal_t *mdi) {
+        /* Computing needs_rack_aware_assignment requires the evaluation of
+           three criteria:
+
+           1. At least one of the members has a non-null rack.
+           2. At least one common rack exists between members and partitions.
+           3. There is a partition which doesn't have replicas on all possible
+           racks, or in other words, all partitions don't have replicas on all
+           racks. Note that 'all racks' here means racks across all replicas of
+           all partitions, not including consumer racks. Also note that 'all
+           racks' are computed per-topic for the range assignor, and across
+           topics for the sticky assignor.
+        */
+
+        int i;
+        size_t t;
+        rd_kafka_group_member_t *member;
+        rd_list_t *all_consumer_racks  = NULL; /* Contained Type: char* */
+        rd_list_t *all_partition_racks = NULL; /* Contained Type: char* */
+        char *rack_id                  = NULL;
+        rd_bool_t needs_rack_aware_assignment = rd_true; /* assume true */
+
+        /* Criteria 1 */
+        /* We don't copy racks, so the free function is NULL. */
+        all_consumer_racks = rd_list_new(0, NULL);
+
+        for (t = 0; t < topic_cnt; t++) {
+                RD_LIST_FOREACH(member, &topics[t]->members, i) {
+                        if (member->rkgm_rack_id &&
+                            RD_KAFKAP_STR_LEN(member->rkgm_rack_id)) {
+                                /* Repetitions are fine; we will dedup them
+                                 * later. */
+                                rd_list_add(
+                                    all_consumer_racks,
+                                    /* The const qualifier has to be discarded
+                                       because of how rd_list_t and
+                                       rd_kafkap_str_t are defined, but we
+                                       never modify items in
+                                       all_consumer_racks. */
+                                    (char *)member->rkgm_rack_id->str);
+                        }
+                }
+        }
+        if (rd_list_cnt(all_consumer_racks) == 0) {
+                needs_rack_aware_assignment = rd_false;
+                goto done;
+        }
+
+
+        /* Criteria 2 */
+        /* We don't copy racks, so the free function is NULL.
*/ + all_partition_racks = rd_list_new(0, NULL); + + for (t = 0; t < topic_cnt; t++) { + const int partition_cnt = topics[t]->metadata->partition_cnt; + for (i = 0; i < partition_cnt; i++) { + size_t j; + for (j = 0; j < topics[t] + ->metadata_internal->partitions[i] + .racks_cnt; + j++) { + char *rack = + topics[t] + ->metadata_internal->partitions[i] + .racks[j]; + rd_list_add(all_partition_racks, rack); + } + } + } + + /* If there are no partition racks, Criteria 2 cannot possibly be met. + */ + if (rd_list_cnt(all_partition_racks) == 0) { + needs_rack_aware_assignment = rd_false; + goto done; + } + + /* Sort and dedup the racks. */ + rd_list_deduplicate(&all_consumer_racks, rd_strcmp2); + rd_list_deduplicate(&all_partition_racks, rd_strcmp2); + + + /* Iterate through each list in order, and see if there's anything in + * common */ + RD_LIST_FOREACH(rack_id, all_consumer_racks, i) { + /* Break if there's even a single match. */ + if (rd_list_find(all_partition_racks, rack_id, rd_strcmp2)) { + break; + } + } + if (i == rd_list_cnt(all_consumer_racks)) { + needs_rack_aware_assignment = rd_false; + goto done; + } + + /* Criteria 3 */ + for (t = 0; t < topic_cnt; t++) { + const int partition_cnt = topics[t]->metadata->partition_cnt; + for (i = 0; i < partition_cnt; i++) { + /* Since partition_racks[i] is a subset of + * all_partition_racks, and both of them are deduped, + * the same size indicates that they're equal. */ + if ((size_t)(rd_list_cnt(all_partition_racks)) != + topics[t] + ->metadata_internal->partitions[i] + .racks_cnt) { + break; + } + } + if (i < partition_cnt) { + /* Break outer loop if inner loop was broken. */ + break; + } + } + + /* Implies that all partitions have replicas on all racks. */ + if (t == topic_cnt) + needs_rack_aware_assignment = rd_false; + +done: + RD_IF_FREE(all_consumer_racks, rd_list_destroy); + RD_IF_FREE(all_partition_racks, rd_list_destroy); + + return needs_rack_aware_assignment; +} + + +/* Helper to populate the racks for brokers in the metadata for unit tests. + * Passing num_broker_racks = 0 will return NULL racks. */ +void ut_populate_internal_broker_metadata(rd_kafka_metadata_internal_t *mdi, + int num_broker_racks, + rd_kafkap_str_t *all_racks[], + size_t all_racks_cnt) { + int i; + + rd_assert(num_broker_racks < (int)all_racks_cnt); + + for (i = 0; i < mdi->metadata.broker_cnt; i++) { + mdi->brokers[i].id = i; + /* Cast from const to non-const. We don't intend to modify it, + * but unfortunately neither implementation of rd_kafkap_str_t + * or rd_kafka_metadata_broker_internal_t can be changed. So, + * this cast is used - in unit tests only. */ + mdi->brokers[i].rack_id = + (char *)(num_broker_racks + ? all_racks[i % num_broker_racks]->str + : NULL); + } +} + +/* Helper to populate the deduplicated racks inside each partition. It's assumed + * that `mdi->brokers` is set, maybe using + * `ut_populate_internal_broker_metadata`. 
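+ *
+ * A hedged, illustrative sketch (not part of the upstream patch) of the
+ * order in which these unit-test helpers are meant to be combined; `md` and
+ * `all_racks` are assumed to be prepared by the surrounding test:
+ *
+ *   rd_kafka_metadata_internal_t *mdi = rd_kafka_metadata_get_internal(md);
+ *   // Assign two racks round-robin across brokers, then derive the
+ *   // per-partition deduplicated rack lists:
+ *   ut_populate_internal_broker_metadata(mdi, 2, all_racks,
+ *                                        RD_ARRAYSIZE(all_racks));
+ *   ut_populate_internal_topic_metadata(mdi);
+ *   // ... run the assignor under test ...
+ *   ut_destroy_metadata(md);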
*/ +void ut_populate_internal_topic_metadata(rd_kafka_metadata_internal_t *mdi) { + int ti; + rd_kafka_metadata_broker_internal_t *brokers_internal; + size_t broker_cnt; + + rd_assert(mdi->brokers); + + brokers_internal = mdi->brokers; + broker_cnt = mdi->metadata.broker_cnt; + + for (ti = 0; ti < mdi->metadata.topic_cnt; ti++) { + int i; + rd_kafka_metadata_topic_t *mdt = &mdi->metadata.topics[ti]; + rd_kafka_metadata_topic_internal_t *mdti = &mdi->topics[ti]; + + for (i = 0; i < mdt->partition_cnt; i++) { + int j; + rd_kafka_metadata_partition_t *partition = + &mdt->partitions[i]; + rd_kafka_metadata_partition_internal_t + *partition_internal = &mdti->partitions[i]; + + rd_list_t *curr_list; + char *rack; + + if (partition->replica_cnt == 0) + continue; + + curr_list = rd_list_new( + 0, NULL); /* use a list for de-duplication */ + for (j = 0; j < partition->replica_cnt; j++) { + rd_kafka_metadata_broker_internal_t key = { + .id = partition->replicas[j]}; + rd_kafka_metadata_broker_internal_t *broker = + bsearch( + &key, brokers_internal, broker_cnt, + sizeof( + rd_kafka_metadata_broker_internal_t), + rd_kafka_metadata_broker_internal_cmp); + if (!broker || !broker->rack_id) + continue; + rd_list_add(curr_list, broker->rack_id); + } + rd_list_deduplicate(&curr_list, rd_strcmp2); + + partition_internal->racks_cnt = rd_list_cnt(curr_list); + partition_internal->racks = rd_malloc( + sizeof(char *) * partition_internal->racks_cnt); + RD_LIST_FOREACH(rack, curr_list, j) { + partition_internal->racks[j] = + rack; /* no duplication */ + } + rd_list_destroy(curr_list); + } + } +} + +/* Helper to destroy test metadata. Destroying the metadata has some additional + * steps in case of tests. */ +void ut_destroy_metadata(rd_kafka_metadata_t *md) { + int ti; + rd_kafka_metadata_internal_t *mdi = rd_kafka_metadata_get_internal(md); + + for (ti = 0; ti < md->topic_cnt; ti++) { + int i; + rd_kafka_metadata_topic_t *mdt = &md->topics[ti]; + rd_kafka_metadata_topic_internal_t *mdti = &mdi->topics[ti]; + + for (i = 0; mdti && i < mdt->partition_cnt; i++) { + rd_free(mdti->partitions[i].racks); + } + } + + rd_kafka_metadata_destroy(md); +} + + +/** + * @brief Set a member's owned partitions based on its assignment. + * + * For use between assignor_run() calls. This is mimicking a consumer receiving + * its new assignment and including it in the next rebalance as its + * owned-partitions. + */ +void ut_set_owned(rd_kafka_group_member_t *rkgm) { + if (rkgm->rkgm_owned) + rd_kafka_topic_partition_list_destroy(rkgm->rkgm_owned); + + rkgm->rkgm_owned = + rd_kafka_topic_partition_list_copy(rkgm->rkgm_assignment); +} + + +void ut_print_toppar_list(const rd_kafka_topic_partition_list_t *partitions) { + int i; + + for (i = 0; i < partitions->cnt; i++) + RD_UT_SAY(" %s [%" PRId32 "]", partitions->elems[i].topic, + partitions->elems[i].partition); +} + + +/* Implementation for ut_init_member and ut_init_member_with_rackv. */ +static void ut_init_member_internal(rd_kafka_group_member_t *rkgm, + const char *member_id, + const rd_kafkap_str_t *rack_id, + va_list ap) { + const char *topic; + + memset(rkgm, 0, sizeof(*rkgm)); + + rkgm->rkgm_member_id = rd_kafkap_str_new(member_id, -1); + rkgm->rkgm_group_instance_id = rd_kafkap_str_new(member_id, -1); + rkgm->rkgm_rack_id = rack_id ?
rd_kafkap_str_copy(rack_id) : NULL; + + rd_list_init(&rkgm->rkgm_eligible, 0, NULL); + + rkgm->rkgm_subscription = rd_kafka_topic_partition_list_new(4); + + while ((topic = va_arg(ap, const char *))) + rd_kafka_topic_partition_list_add(rkgm->rkgm_subscription, + topic, RD_KAFKA_PARTITION_UA); + + rkgm->rkgm_assignment = + rd_kafka_topic_partition_list_new(rkgm->rkgm_subscription->size); + + rkgm->rkgm_generation = 1; +} + +/** + * @brief Initialize group member struct for testing. + * + * va-args is a NULL-terminated list of (const char *) topics. + * + * Use rd_kafka_group_member_clear() to free fields. + */ +void ut_init_member(rd_kafka_group_member_t *rkgm, const char *member_id, ...) { + va_list ap; + va_start(ap, member_id); + ut_init_member_internal(rkgm, member_id, NULL, ap); + va_end(ap); +} + +/** + * @brief Initialize group member struct for testing with a rackid. + * + * va-args is a NULL-terminated list of (const char *) topics. + * + * Use rd_kafka_group_member_clear() to free fields. + */ +void ut_init_member_with_rackv(rd_kafka_group_member_t *rkgm, + const char *member_id, + const rd_kafkap_str_t *rack_id, + ...) { + va_list ap; + va_start(ap, rack_id); + ut_init_member_internal(rkgm, member_id, rack_id, ap); + va_end(ap); +} + +/** + * @brief Initialize group member struct for testing with a rackid. + * + * Topics that the member is subscribed to are specified in an array with the + * size specified separately. + * + * Use rd_kafka_group_member_clear() to free fields. + */ +void ut_init_member_with_rack(rd_kafka_group_member_t *rkgm, + const char *member_id, + const rd_kafkap_str_t *rack_id, + char *topics[], + size_t topic_cnt) { + size_t i; + + memset(rkgm, 0, sizeof(*rkgm)); + + rkgm->rkgm_member_id = rd_kafkap_str_new(member_id, -1); + rkgm->rkgm_group_instance_id = rd_kafkap_str_new(member_id, -1); + rkgm->rkgm_rack_id = rack_id ? rd_kafkap_str_copy(rack_id) : NULL; + rd_list_init(&rkgm->rkgm_eligible, 0, NULL); + + rkgm->rkgm_subscription = rd_kafka_topic_partition_list_new(4); + + for (i = 0; i < topic_cnt; i++) { + rd_kafka_topic_partition_list_add( + rkgm->rkgm_subscription, topics[i], RD_KAFKA_PARTITION_UA); + } + rkgm->rkgm_assignment = + rd_kafka_topic_partition_list_new(rkgm->rkgm_subscription->size); +} + +/** + * @brief Verify that member's assignment matches the expected partitions. + * + * The va-list is a NULL-terminated list of (const char *topic, int partition) + * tuples. + * + * @returns 0 on success, else raises a unittest error and returns 1. + */ +int verifyAssignment0(const char *function, + int line, + rd_kafka_group_member_t *rkgm, + ...) 
{ + va_list ap; + int cnt = 0; + const char *topic; + int fails = 0; + + va_start(ap, rkgm); + while ((topic = va_arg(ap, const char *))) { + int partition = va_arg(ap, int); + cnt++; + + if (!rd_kafka_topic_partition_list_find(rkgm->rkgm_assignment, + topic, partition)) { + RD_UT_WARN( + "%s:%d: Expected %s [%d] not found in %s's " + "assignment (%d partition(s))", + function, line, topic, partition, + rkgm->rkgm_member_id->str, + rkgm->rkgm_assignment->cnt); + fails++; + } + } + va_end(ap); + + if (cnt != rkgm->rkgm_assignment->cnt) { + RD_UT_WARN( + "%s:%d: " + "Expected %d assigned partition(s) for %s, not %d", + function, line, cnt, rkgm->rkgm_member_id->str, + rkgm->rkgm_assignment->cnt); + fails++; + } + + if (fails) + ut_print_toppar_list(rkgm->rkgm_assignment); + + RD_UT_ASSERT(!fails, "%s:%d: See previous errors", function, line); + + return 0; +} + +/** + * @brief Verify that all members' assignment matches the expected partitions. + * + * The va-list is a list of (const char *topic, int partition) + * tuples, and NULL to demarcate different members' assignment. + * + * @returns 0 on success, else raises a unittest error and returns 1. + */ +int verifyMultipleAssignment0(const char *function, + int line, + rd_kafka_group_member_t *rkgms, + size_t member_cnt, + ...) { + va_list ap; + const char *topic; + int fails = 0; + size_t i = 0; + + if (member_cnt == 0) { + return 0; + } + + va_start(ap, member_cnt); + for (i = 0; i < member_cnt; i++) { + rd_kafka_group_member_t *rkgm = &rkgms[i]; + int cnt = 0; + int local_fails = 0; + + while ((topic = va_arg(ap, const char *))) { + int partition = va_arg(ap, int); + cnt++; + + if (!rd_kafka_topic_partition_list_find( + rkgm->rkgm_assignment, topic, partition)) { + RD_UT_WARN( + "%s:%d: Expected %s [%d] not found in %s's " + "assignment (%d partition(s))", + function, line, topic, partition, + rkgm->rkgm_member_id->str, + rkgm->rkgm_assignment->cnt); + local_fails++; + } + } + + if (cnt != rkgm->rkgm_assignment->cnt) { + RD_UT_WARN( + "%s:%d: " + "Expected %d assigned partition(s) for %s, not %d", + function, line, cnt, rkgm->rkgm_member_id->str, + rkgm->rkgm_assignment->cnt); + fails++; + } + + if (local_fails) + ut_print_toppar_list(rkgm->rkgm_assignment); + fails += local_fails; + } + va_end(ap); + + RD_UT_ASSERT(!fails, "%s:%d: See previous errors", function, line); + + return 0; +} + + +#define verifyNumPartitionsWithRackMismatchPartition(rktpar, metadata, \ + increase) \ + do { \ + if (!rktpar) \ + break; \ + int i; \ + rd_bool_t noneMatch = rd_true; \ + rd_kafka_metadata_internal_t *metadata_internal = \ + rd_kafka_metadata_get_internal(metadata); \ + \ + for (i = 0; i < metadata->topics[j].partitions[k].replica_cnt; \ + i++) { \ + int32_t replica_id = \ + metadata->topics[j].partitions[k].replicas[i]; \ + rd_kafka_metadata_broker_internal_t *broker; \ + rd_kafka_metadata_broker_internal_find( \ + metadata_internal, replica_id, broker); \ + \ + if (broker && !strcmp(rack_id, broker->rack_id)) { \ + noneMatch = rd_false; \ + break; \ + } \ + } \ + \ + if (noneMatch) \ + increase++; \ + } while (0); + +/** + * @brief Verify number of partitions with rack mismatch. 
+ */ +int verifyNumPartitionsWithRackMismatch0(const char *function, + int line, + rd_kafka_metadata_t *metadata, + rd_kafka_group_member_t *rkgms, + size_t member_cnt, + int expectedNumMismatch) { + size_t i; + int j, k; + + int numMismatched = 0; + for (i = 0; i < member_cnt; i++) { + rd_kafka_group_member_t *rkgm = &rkgms[i]; + const char *rack_id = rkgm->rkgm_rack_id->str; + if (rack_id) { + for (j = 0; j < metadata->topic_cnt; j++) { + for (k = 0; + k < metadata->topics[j].partition_cnt; + k++) { + rd_kafka_topic_partition_t *rktpar = + rd_kafka_topic_partition_list_find( + rkgm->rkgm_assignment, + metadata->topics[j].topic, k); + verifyNumPartitionsWithRackMismatchPartition( + rktpar, metadata, numMismatched); + } + } + } + } + + RD_UT_ASSERT(expectedNumMismatch == numMismatched, + "%s:%d: Expected %d mismatches, got %d", function, line, + expectedNumMismatch, numMismatched); + + return 0; +} + + +int verifyValidityAndBalance0(const char *func, + int line, + rd_kafka_group_member_t *members, + size_t member_cnt, + const rd_kafka_metadata_t *metadata) { + int fails = 0; + int i; + rd_bool_t verbose = rd_false; /* Enable for troubleshooting */ + + RD_UT_SAY("%s:%d: verifying assignment for %d member(s):", func, line, + (int)member_cnt); + + for (i = 0; i < (int)member_cnt; i++) { + const char *consumer = members[i].rkgm_member_id->str; + const rd_kafka_topic_partition_list_t *partitions = + members[i].rkgm_assignment; + int p, j; + + if (verbose) + RD_UT_SAY( + "%s:%d: " + "consumer \"%s\", %d subscribed topic(s), " + "%d assigned partition(s):", + func, line, consumer, + members[i].rkgm_subscription->cnt, partitions->cnt); + + for (p = 0; p < partitions->cnt; p++) { + const rd_kafka_topic_partition_t *partition = + &partitions->elems[p]; + + if (verbose) + RD_UT_SAY("%s:%d: %s [%" PRId32 "]", func, + line, partition->topic, + partition->partition); + + if (!rd_kafka_topic_partition_list_find( + members[i].rkgm_subscription, partition->topic, + RD_KAFKA_PARTITION_UA)) { + RD_UT_WARN("%s [%" PRId32 + "] is assigned to " + "%s but it is not subscribed to " + "that topic", + partition->topic, + partition->partition, consumer); + fails++; + } + } + + /* Update the member's owned partitions to match + * the assignment. */ + ut_set_owned(&members[i]); + + if (i == (int)member_cnt - 1) + continue; + + for (j = i + 1; j < (int)member_cnt; j++) { + const char *otherConsumer = + members[j].rkgm_member_id->str; + const rd_kafka_topic_partition_list_t *otherPartitions = + members[j].rkgm_assignment; + rd_bool_t balanced = + abs(partitions->cnt - otherPartitions->cnt) <= 1; + + for (p = 0; p < partitions->cnt; p++) { + const rd_kafka_topic_partition_t *partition = + &partitions->elems[p]; + + if (rd_kafka_topic_partition_list_find( + otherPartitions, partition->topic, + partition->partition)) { + RD_UT_WARN( + "Consumer %s and %s are both " + "assigned %s [%" PRId32 "]", + consumer, otherConsumer, + partition->topic, + partition->partition); + fails++; + } + + + /* If assignment is imbalanced and this topic + * is also subscribed by the other consumer + * it means the assignment strategy failed to + * properly balance the partitions. 
*/ + if (!balanced && + rd_kafka_topic_partition_list_find_topic_by_name( + otherPartitions, partition->topic)) { + RD_UT_WARN( + "Some %s partition(s) can be " + "moved from " + "%s (%d partition(s)) to " + "%s (%d partition(s)) to " + "achieve a better balance", + partition->topic, consumer, + partitions->cnt, otherConsumer, + otherPartitions->cnt); + fails++; + } + } + } + } + + RD_UT_ASSERT(!fails, "%s:%d: See %d previous errors", func, line, + fails); + + return 0; +} + +/** + * @brief Checks that all assigned partitions are fully balanced. + * + * Only works for symmetrical subscriptions. + */ +int isFullyBalanced0(const char *function, + int line, + const rd_kafka_group_member_t *members, + size_t member_cnt) { + int min_assignment = INT_MAX; + int max_assignment = -1; + size_t i; + + for (i = 0; i < member_cnt; i++) { + int size = members[i].rkgm_assignment->cnt; + if (size < min_assignment) + min_assignment = size; + if (size > max_assignment) + max_assignment = size; + } + + RD_UT_ASSERT(max_assignment - min_assignment <= 1, + "%s:%d: Assignment not balanced: min %d, max %d", function, + line, min_assignment, max_assignment); + + return 0; +} /** @@ -879,6 +1574,7 @@ static int ut_assignors(void) { /* Run through test cases */ for (i = 0; tests[i].name; i++) { int ie, it, im; + rd_kafka_metadata_internal_t metadata_internal; rd_kafka_metadata_t metadata; rd_kafka_group_member_t *members; @@ -886,14 +1582,38 @@ static int ut_assignors(void) { metadata.topic_cnt = tests[i].topic_cnt; metadata.topics = rd_alloca(sizeof(*metadata.topics) * metadata.topic_cnt); + metadata_internal.topics = rd_alloca( + sizeof(*metadata_internal.topics) * metadata.topic_cnt); + memset(metadata.topics, 0, sizeof(*metadata.topics) * metadata.topic_cnt); + memset(metadata_internal.topics, 0, + sizeof(*metadata_internal.topics) * metadata.topic_cnt); + for (it = 0; it < metadata.topic_cnt; it++) { + int pt; metadata.topics[it].topic = (char *)tests[i].topics[it].name; metadata.topics[it].partition_cnt = tests[i].topics[it].partition_cnt; - metadata.topics[it].partitions = NULL; /* Not used */ + metadata.topics[it].partitions = + rd_alloca(metadata.topics[it].partition_cnt * + sizeof(rd_kafka_metadata_partition_t)); + metadata_internal.topics[it].partitions = rd_alloca( + metadata.topics[it].partition_cnt * + sizeof(rd_kafka_metadata_partition_internal_t)); + for (pt = 0; pt < metadata.topics[it].partition_cnt; + pt++) { + metadata.topics[it].partitions[pt].id = pt; + metadata.topics[it].partitions[pt].replica_cnt = + 0; + metadata_internal.topics[it] + .partitions[pt] + .racks_cnt = 0; + metadata_internal.topics[it] + .partitions[pt] + .racks = NULL; + } } /* Create members */ @@ -944,9 +1664,12 @@ static int ut_assignors(void) { } /* Run assignor */ - err = rd_kafka_assignor_run( - rk->rk_cgrp, rkas, &metadata, members, - tests[i].member_cnt, errstr, sizeof(errstr)); + metadata_internal.metadata = metadata; + err = rd_kafka_assignor_run( + rk->rk_cgrp, rkas, + (rd_kafka_metadata_t *)(&metadata_internal), + members, tests[i].member_cnt, errstr, + sizeof(errstr)); RD_UT_ASSERT(!err, "Assignor case %s for %s failed: %s", tests[i].name, diff --git a/src/third_party/librdkafka/dist/src/rdkafka_assignor.h b/src/third_party/librdkafka/dist/src/rdkafka_assignor.h index b90e7dc980d..6797e70b118 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_assignor.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_assignor.h @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2015 
Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill + * 2023 Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,7 +29,7 @@ #ifndef _RDKAFKA_ASSIGNOR_H_ #define _RDKAFKA_ASSIGNOR_H_ - +#include "rdkafka_metadata.h" /*! * Enumerates the different rebalance protocol types. @@ -69,6 +70,8 @@ typedef struct rd_kafka_group_member_s { rd_kafkap_bytes_t *rkgm_member_metadata; /** Group generation id. */ int rkgm_generation; + /** Member rack id. */ + rd_kafkap_str_t *rkgm_rack_id; } rd_kafka_group_member_t; @@ -78,13 +81,13 @@ int rd_kafka_group_member_find_subscription(rd_kafka_t *rk, const rd_kafka_group_member_t *rkgm, const char *topic); - /** * Structure to hold metadata for a single topic and all its * subscribing members. */ typedef struct rd_kafka_assignor_topic_s { const rd_kafka_metadata_topic_t *metadata; + const rd_kafka_metadata_topic_internal_t *metadata_internal; rd_list_t members; /* rd_kafka_group_member_t * */ } rd_kafka_assignor_topic_t; @@ -120,7 +123,8 @@ typedef struct rd_kafka_assignor_s { const struct rd_kafka_assignor_s *rkas, void *assignor_state, const rd_list_t *topics, - const rd_kafka_topic_partition_list_t *owned_partitions); + const rd_kafka_topic_partition_list_t *owned_partitions, + const rd_kafkap_str_t *rack_id); void (*rkas_on_assignment_cb)( const struct rd_kafka_assignor_s *rkas, @@ -158,7 +162,8 @@ rd_kafka_resp_err_t rd_kafka_assignor_add( const struct rd_kafka_assignor_s *rkas, void *assignor_state, const rd_list_t *topics, - const rd_kafka_topic_partition_list_t *owned_partitions), + const rd_kafka_topic_partition_list_t *owned_partitions, + const rd_kafkap_str_t *rack_id), void (*on_assignment_cb)(const struct rd_kafka_assignor_s *rkas, void **assignor_state, const rd_kafka_topic_partition_list_t *assignment, @@ -172,13 +177,16 @@ rd_kafkap_bytes_t *rd_kafka_consumer_protocol_member_metadata_new( const rd_list_t *topics, const void *userdata, size_t userdata_size, - const rd_kafka_topic_partition_list_t *owned_partitions); + const rd_kafka_topic_partition_list_t *owned_partitions, + int generation, + const rd_kafkap_str_t *rack_id); rd_kafkap_bytes_t *rd_kafka_assignor_get_metadata_with_empty_userdata( const rd_kafka_assignor_t *rkas, void *assignor_state, const rd_list_t *topics, - const rd_kafka_topic_partition_list_t *owned_partitions); + const rd_kafka_topic_partition_list_t *owned_partitions, + const rd_kafkap_str_t *rack_id); void rd_kafka_assignor_update_subscription( @@ -208,5 +216,187 @@ void rd_kafka_group_member_clear(rd_kafka_group_member_t *rkgm); rd_kafka_resp_err_t rd_kafka_range_assignor_init(rd_kafka_t *rk); rd_kafka_resp_err_t rd_kafka_roundrobin_assignor_init(rd_kafka_t *rk); rd_kafka_resp_err_t rd_kafka_sticky_assignor_init(rd_kafka_t *rk); +rd_bool_t +rd_kafka_use_rack_aware_assignment(rd_kafka_assignor_topic_t **topics, + size_t topic_cnt, + const rd_kafka_metadata_internal_t *mdi); + +/** + * @name Common unit test functions, macros, and enums to use across assignors. 
+ * + * + * + */ + +/* Tests can be parametrized to contain either only broker racks, only consumer + * racks or both.*/ +typedef enum { + RD_KAFKA_RANGE_ASSIGNOR_UT_NO_BROKER_RACK = 0, + RD_KAFKA_RANGE_ASSIGNOR_UT_NO_CONSUMER_RACK = 1, + RD_KAFKA_RANGE_ASSIGNOR_UT_BROKER_AND_CONSUMER_RACK = 2, + RD_KAFKA_RANGE_ASSIGNOR_UT_CONFIG_CNT = 3, +} rd_kafka_assignor_ut_rack_config_t; + + +void ut_populate_internal_broker_metadata(rd_kafka_metadata_internal_t *mdi, + int num_broker_racks, + rd_kafkap_str_t *all_racks[], + size_t all_racks_cnt); + +void ut_populate_internal_topic_metadata(rd_kafka_metadata_internal_t *mdi); + +void ut_destroy_metadata(rd_kafka_metadata_t *md); + +void ut_set_owned(rd_kafka_group_member_t *rkgm); + +void ut_print_toppar_list(const rd_kafka_topic_partition_list_t *partitions); + +void ut_init_member(rd_kafka_group_member_t *rkgm, const char *member_id, ...); + +void ut_init_member_with_rackv(rd_kafka_group_member_t *rkgm, + const char *member_id, + const rd_kafkap_str_t *rack_id, + ...); + +void ut_init_member_with_rack(rd_kafka_group_member_t *rkgm, + const char *member_id, + const rd_kafkap_str_t *rack_id, + char *topics[], + size_t topic_cnt); + +int verifyAssignment0(const char *function, + int line, + rd_kafka_group_member_t *rkgm, + ...); + +int verifyMultipleAssignment0(const char *function, + int line, + rd_kafka_group_member_t *rkgms, + size_t member_cnt, + ...); + +int verifyNumPartitionsWithRackMismatch0(const char *function, + int line, + rd_kafka_metadata_t *metadata, + rd_kafka_group_member_t *rkgms, + size_t member_cnt, + int expectedNumMismatch); + +#define verifyAssignment(rkgm, ...) \ + do { \ + if (verifyAssignment0(__FUNCTION__, __LINE__, rkgm, \ + __VA_ARGS__)) \ + return 1; \ + } while (0) + +#define verifyMultipleAssignment(rkgms, member_cnt, ...) \ + do { \ + if (verifyMultipleAssignment0(__FUNCTION__, __LINE__, rkgms, \ + member_cnt, __VA_ARGS__)) \ + return 1; \ + } while (0) + +#define verifyNumPartitionsWithRackMismatch(metadata, rkgms, member_cnt, \ + expectedNumMismatch) \ + do { \ + if (verifyNumPartitionsWithRackMismatch0( \ + __FUNCTION__, __LINE__, metadata, rkgms, member_cnt, \ + expectedNumMismatch)) \ + return 1; \ + } while (0) + +int verifyValidityAndBalance0(const char *func, + int line, + rd_kafka_group_member_t *members, + size_t member_cnt, + const rd_kafka_metadata_t *metadata); + +#define verifyValidityAndBalance(members, member_cnt, metadata) \ + do { \ + if (verifyValidityAndBalance0(__FUNCTION__, __LINE__, members, \ + member_cnt, metadata)) \ + return 1; \ + } while (0) + +int isFullyBalanced0(const char *function, + int line, + const rd_kafka_group_member_t *members, + size_t member_cnt); + +#define isFullyBalanced(members, member_cnt) \ + do { \ + if (isFullyBalanced0(__FUNCTION__, __LINE__, members, \ + member_cnt)) \ + return 1; \ + } while (0) + +/* Helper macro to initialize a consumer with or without a rack depending on the + * value of parametrization. */ +#define ut_initMemberConditionalRack(member_ptr, member_id, rack, \ + parametrization, ...) \ + do { \ + if (parametrization == \ + RD_KAFKA_RANGE_ASSIGNOR_UT_NO_CONSUMER_RACK) { \ + ut_init_member(member_ptr, member_id, __VA_ARGS__); \ + } else { \ + ut_init_member_with_rackv(member_ptr, member_id, rack, \ + __VA_ARGS__); \ + } \ + } while (0) + +/* Helper macro to initialize rd_kafka_metadata_t* with or without replicas + * depending on the value of parametrization. This accepts variadic arguments + * for topics. 
*/ +#define ut_initMetadataConditionalRack(metadataPtr, replication_factor, \ + num_broker_racks, all_racks, \ + all_racks_cnt, parametrization, ...) \ + do { \ + int num_brokers = num_broker_racks > 0 \ + ? replication_factor * num_broker_racks \ + : replication_factor; \ + if (parametrization == \ + RD_KAFKA_RANGE_ASSIGNOR_UT_NO_BROKER_RACK) { \ + *(metadataPtr) = \ + rd_kafka_metadata_new_topic_mockv(__VA_ARGS__); \ + } else { \ + *(metadataPtr) = \ + rd_kafka_metadata_new_topic_with_partition_replicas_mockv( \ + replication_factor, num_brokers, __VA_ARGS__); \ + ut_populate_internal_broker_metadata( \ + rd_kafka_metadata_get_internal(*(metadataPtr)), \ + num_broker_racks, all_racks, all_racks_cnt); \ + ut_populate_internal_topic_metadata( \ + rd_kafka_metadata_get_internal(*(metadataPtr))); \ + } \ + } while (0) + + +/* Helper macro to initialize rd_kafka_metadata_t* with or without replicas + * depending on the value of parametrization. This accepts a list of topics, + * rather than being variadic. + */ +#define ut_initMetadataConditionalRack0( \ + metadataPtr, replication_factor, num_broker_racks, all_racks, \ + all_racks_cnt, parametrization, topics, topic_cnt) \ + do { \ + int num_brokers = num_broker_racks > 0 \ + ? replication_factor * num_broker_racks \ + : replication_factor; \ + if (parametrization == \ + RD_KAFKA_RANGE_ASSIGNOR_UT_NO_BROKER_RACK) { \ + *(metadataPtr) = rd_kafka_metadata_new_topic_mock( \ + topics, topic_cnt, -1, 0); \ + } else { \ + *(metadataPtr) = rd_kafka_metadata_new_topic_mock( \ + topics, topic_cnt, replication_factor, \ + num_brokers); \ + ut_populate_internal_broker_metadata( \ + rd_kafka_metadata_get_internal(*(metadataPtr)), \ + num_broker_racks, all_racks, all_racks_cnt); \ + ut_populate_internal_topic_metadata( \ + rd_kafka_metadata_get_internal(*(metadataPtr))); \ + } \ + } while (0) + #endif /* _RDKAFKA_ASSIGNOR_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_aux.c b/src/third_party/librdkafka/dist/src/rdkafka_aux.c index 753f03d6782..7d5ccb5b2f1 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_aux.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_aux.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill + * 2023 Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -234,19 +235,60 @@ void rd_kafka_acl_result_free(void *ptr) { * @return A new allocated Node object. * Use rd_kafka_Node_destroy() to free when done. */ -rd_kafka_Node_t *rd_kafka_Node_new(int id, +rd_kafka_Node_t *rd_kafka_Node_new(int32_t id, const char *host, uint16_t port, - const char *rack_id) { + const char *rack) { rd_kafka_Node_t *ret = rd_calloc(1, sizeof(*ret)); ret->id = id; ret->port = port; ret->host = rd_strdup(host); - if (rack_id != NULL) - ret->rack_id = rd_strdup(rack_id); + if (rack != NULL) + ret->rack = rd_strdup(rack); return ret; } +/** + * @brief Create a new Node object given a node id, and use broker information + * to populate other fields. + * + * @return A new allocated Node object. + * Use rd_kafka_Node_destroy() to free when done. + * @remark The \p brokers_sorted and \p brokers_internal arrays are assumed to + * be sorted by id.
+ */ +rd_kafka_Node_t *rd_kafka_Node_new_from_brokers( + int32_t id, + const struct rd_kafka_metadata_broker *brokers_sorted, + const rd_kafka_metadata_broker_internal_t *brokers_internal, + int broker_cnt) { + rd_kafka_Node_t *node = rd_calloc(1, sizeof(*node)); + struct rd_kafka_metadata_broker key_sorted = {.id = id}; + rd_kafka_metadata_broker_internal_t key_internal = {.id = id}; + + struct rd_kafka_metadata_broker *broker = + bsearch(&key_sorted, brokers_sorted, broker_cnt, + sizeof(struct rd_kafka_metadata_broker), + rd_kafka_metadata_broker_cmp); + + rd_kafka_metadata_broker_internal_t *broker_internal = + bsearch(&key_internal, brokers_internal, broker_cnt, + sizeof(rd_kafka_metadata_broker_internal_t), + rd_kafka_metadata_broker_internal_cmp); + + node->id = id; + + if (!broker) + return node; + + node->host = rd_strdup(broker->host); + node->port = broker->port; + if (broker_internal && broker_internal->rack_id) + node->rack = rd_strdup(broker_internal->rack_id); + + return node; +} + /** * @brief Copy \p src Node object * @@ -255,16 +297,26 @@ rd_kafka_Node_t *rd_kafka_Node_new(int id, * Use rd_kafka_Node_destroy() to free when done. */ rd_kafka_Node_t *rd_kafka_Node_copy(const rd_kafka_Node_t *src) { - return rd_kafka_Node_new(src->id, src->host, src->port, src->rack_id); + return rd_kafka_Node_new(src->id, src->host, src->port, src->rack); } void rd_kafka_Node_destroy(rd_kafka_Node_t *node) { rd_free(node->host); - if (node->rack_id) - rd_free(node->rack_id); + if (node->rack) + rd_free(node->rack); rd_free(node); } +/** + * @brief Same as rd_kafka_Node_destroy, but for use as callback which accepts + * (void *) arguments. + * + * @param node + */ +void rd_kafka_Node_free(void *node) { + rd_kafka_Node_destroy((rd_kafka_Node_t *)node); +} + int rd_kafka_Node_id(const rd_kafka_Node_t *node) { return node->id; } @@ -276,3 +328,82 @@ const char *rd_kafka_Node_host(const rd_kafka_Node_t *node) { uint16_t rd_kafka_Node_port(const rd_kafka_Node_t *node) { return node->port; } + +const char *rd_kafka_Node_rack(const rd_kafka_Node_t *node) { + return node->rack; +} + +/** + * @brief Creates a new rd_kafka_topic_partition_result_t object. + */ + +rd_kafka_topic_partition_result_t * +rd_kafka_topic_partition_result_new(const char *topic, + int32_t partition, + rd_kafka_resp_err_t err, + const char *errstr) { + + rd_kafka_topic_partition_result_t *new_result; + + new_result = rd_calloc(1, sizeof(*new_result)); + new_result->topic_partition = + rd_kafka_topic_partition_new(topic, partition); + new_result->topic_partition->err = err; + new_result->error = rd_kafka_error_new(err, "%s", errstr); + + return new_result; +} + +const rd_kafka_topic_partition_t *rd_kafka_topic_partition_result_partition( + const rd_kafka_topic_partition_result_t *partition_result) { + return partition_result->topic_partition; +} + +const rd_kafka_error_t *rd_kafka_topic_partition_result_error( + const rd_kafka_topic_partition_result_t *partition_result) { + return partition_result->error; +} + +/** + * @brief Destroys the rd_kafka_topic_partition_result_t object. + */ +void rd_kafka_topic_partition_result_destroy( + rd_kafka_topic_partition_result_t *partition_result) { + rd_kafka_topic_partition_destroy(partition_result->topic_partition); + rd_kafka_error_destroy(partition_result->error); + rd_free(partition_result); +} + +/** + * @brief Destroys the array of rd_kafka_topic_partition_result_t objects. 
+ */ +void rd_kafka_topic_partition_result_destroy_array( + rd_kafka_topic_partition_result_t **partition_results, + int32_t partition_results_cnt) { + int32_t i; + for (i = 0; i < partition_results_cnt; i++) { + rd_kafka_topic_partition_result_destroy(partition_results[i]); + } +} + +rd_kafka_topic_partition_result_t *rd_kafka_topic_partition_result_copy( + const rd_kafka_topic_partition_result_t *src) { + return rd_kafka_topic_partition_result_new( + src->topic_partition->topic, src->topic_partition->partition, + src->topic_partition->err, src->error->errstr); +} + +void *rd_kafka_topic_partition_result_copy_opaque(const void *src, + void *opaque) { + return rd_kafka_topic_partition_result_copy( + (const rd_kafka_topic_partition_result_t *)src); +} + +/** + * @brief Frees the memory allocated for a + * topic partition result object by calling + * its destroy function. + */ +void rd_kafka_topic_partition_result_free(void *ptr) { + rd_kafka_topic_partition_result_destroy(ptr); +} diff --git a/src/third_party/librdkafka/dist/src/rdkafka_aux.h b/src/third_party/librdkafka/dist/src/rdkafka_aux.h index ccf18e91e7c..340fcf708d6 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_aux.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_aux.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill + * 2023 Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -107,14 +108,67 @@ typedef struct rd_kafka_Node_s { int id; /*< Node id */ char *host; /*< Node host */ uint16_t port; /*< Node port */ - char *rack_id; /*< (optional) Node rack id */ + char *rack; /*< (optional) Node rack id */ } rd_kafka_Node_t; -rd_kafka_Node_t * -rd_kafka_Node_new(int id, const char *host, uint16_t port, const char *rack_id); +rd_kafka_Node_t *rd_kafka_Node_new(int32_t id, + const char *host, + uint16_t port, + const char *rack_id); + +rd_kafka_Node_t *rd_kafka_Node_new_from_brokers( + int32_t id, + const struct rd_kafka_metadata_broker *brokers_sorted, + const rd_kafka_metadata_broker_internal_t *brokers_internal, + int broker_cnt); rd_kafka_Node_t *rd_kafka_Node_copy(const rd_kafka_Node_t *src); void rd_kafka_Node_destroy(rd_kafka_Node_t *node); +void rd_kafka_Node_free(void *node); + +/** + * @brief Represents a topic partition result. + * + * @remark Public Type + */ +struct rd_kafka_topic_partition_result_s { + rd_kafka_topic_partition_t *topic_partition; + rd_kafka_error_t *error; +}; + +/** + * @brief Create a new rd_kafka_topic_partition_result_t object. + * + * @param topic The topic name. + * @param partition The partition number. + * @param err The error code. + * @param errstr The error string. + * + * @returns a newly allocated rd_kafka_topic_partition_result_t object. + * Use rd_kafka_topic_partition_result_destroy() to free object when + * done. 
+ */ +rd_kafka_topic_partition_result_t * +rd_kafka_topic_partition_result_new(const char *topic, + int32_t partition, + rd_kafka_resp_err_t err, + const char *errstr); + +rd_kafka_topic_partition_result_t *rd_kafka_topic_partition_result_copy( + const rd_kafka_topic_partition_result_t *src); + +void *rd_kafka_topic_partition_result_copy_opaque(const void *src, + void *opaque); + +void rd_kafka_topic_partition_result_destroy( + rd_kafka_topic_partition_result_t *partition_result); + +void rd_kafka_topic_partition_result_destroy_array( + rd_kafka_topic_partition_result_t **partition_results, + int32_t partition_results_cnt); + +void rd_kafka_topic_partition_result_free(void *ptr); + #endif /* _RDKAFKA_AUX_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_background.c b/src/third_party/librdkafka/dist/src/rdkafka_background.c index c69ec1767dd..a9c96606c0d 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_background.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_background.c @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_broker.c b/src/third_party/librdkafka/dist/src/rdkafka_broker.c index a32d08d24d0..48a67a5cb0a 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_broker.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_broker.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023 Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -49,6 +50,7 @@ #include #include "rd.h" +#include "rdaddr.h" #include "rdkafka_int.h" #include "rdkafka_msg.h" #include "rdkafka_msgset.h" @@ -56,6 +58,7 @@ #include "rdkafka_partition.h" #include "rdkafka_broker.h" #include "rdkafka_offset.h" +#include "rdkafka_telemetry.h" #include "rdkafka_transport.h" #include "rdkafka_proto.h" #include "rdkafka_buf.h" @@ -79,9 +82,9 @@ static const int rd_kafka_max_block_ms = 1000; const char *rd_kafka_broker_state_names[] = { - "INIT", "DOWN", "TRY_CONNECT", "CONNECT", "SSL_HANDSHAKE", - "AUTH_LEGACY", "UP", "UPDATE", "APIVERSION_QUERY", "AUTH_HANDSHAKE", - "AUTH_REQ"}; + "INIT", "DOWN", "TRY_CONNECT", "CONNECT", + "SSL_HANDSHAKE", "AUTH_LEGACY", "UP", "APIVERSION_QUERY", + "AUTH_HANDSHAKE", "AUTH_REQ", "REAUTH"}; const char *rd_kafka_secproto_names[] = { [RD_KAFKA_PROTO_PLAINTEXT] = "plaintext", @@ -108,7 +111,6 @@ rd_kafka_broker_needs_persistent_connection(rd_kafka_broker_t *rkb) { rd_atomic32_get(&rkb->rkb_persistconn.coord); } - /** * @returns > 0 if a connection to this broker is needed, else 0. * @locality broker thread @@ -116,7 +118,7 @@ rd_kafka_broker_needs_persistent_connection(rd_kafka_broker_t *rkb) { */ static RD_INLINE int rd_kafka_broker_needs_connection(rd_kafka_broker_t *rkb) { return rkb->rkb_state == RD_KAFKA_BROKER_STATE_INIT && - !rd_kafka_terminating(rkb->rkb_rk) && + !rd_kafka_broker_or_instance_terminating(rkb) && !rd_kafka_fatal_error_code(rkb->rkb_rk) && (!rkb->rkb_rk->rk_conf.sparse_connections || rd_kafka_broker_needs_persistent_connection(rkb)); @@ -232,31 +234,37 @@ static void rd_kafka_broker_features_set(rd_kafka_broker_t *rkb, int features) { rd_kafka_features2str(rkb->rkb_features)); } - /** * @brief Check and return supported ApiVersion for \p ApiKey. 
* * @returns the highest supported ApiVersion in the specified range (inclusive) * or -1 if the ApiKey is not supported or no matching ApiVersion. * The current feature set is also returned in \p featuresp - * @locks none + * + * @remark Same as rd_kafka_broker_ApiVersion_supported except for locking. + * + * @locks rd_kafka_broker_lock() if do_lock is rd_false + * @locks_acquired rd_kafka_broker_lock() if do_lock is rd_true * @locality any */ -int16_t rd_kafka_broker_ApiVersion_supported(rd_kafka_broker_t *rkb, - int16_t ApiKey, - int16_t minver, - int16_t maxver, - int *featuresp) { +int16_t rd_kafka_broker_ApiVersion_supported0(rd_kafka_broker_t *rkb, + int16_t ApiKey, + int16_t minver, + int16_t maxver, + int *featuresp, + rd_bool_t do_lock) { struct rd_kafka_ApiVersion skel = {.ApiKey = ApiKey}; struct rd_kafka_ApiVersion ret = RD_ZERO_INIT, *retp; - rd_kafka_broker_lock(rkb); + if (do_lock) + rd_kafka_broker_lock(rkb); if (featuresp) *featuresp = rkb->rkb_features; if (rkb->rkb_features & RD_KAFKA_FEATURE_UNITTEST) { /* For unit tests let the broker support everything. */ - rd_kafka_broker_unlock(rkb); + if (do_lock) + rd_kafka_broker_unlock(rkb); return maxver; } @@ -265,7 +273,9 @@ int16_t rd_kafka_broker_ApiVersion_supported(rd_kafka_broker_t *rkb, sizeof(*rkb->rkb_ApiVersions), rd_kafka_ApiVersion_key_cmp); if (retp) ret = *retp; - rd_kafka_broker_unlock(rkb); + + if (do_lock) + rd_kafka_broker_unlock(rkb); if (!retp) return -1; @@ -281,6 +291,24 @@ int16_t rd_kafka_broker_ApiVersion_supported(rd_kafka_broker_t *rkb, return maxver; } +/** + * @brief Check and return supported ApiVersion for \p ApiKey. + * + * @returns the highest supported ApiVersion in the specified range (inclusive) + * or -1 if the ApiKey is not supported or no matching ApiVersion. + * The current feature set is also returned in \p featuresp + * @locks none + * @locks_acquired rd_kafka_broker_lock() + * @locality any + */ +int16_t rd_kafka_broker_ApiVersion_supported(rd_kafka_broker_t *rkb, + int16_t ApiKey, + int16_t minver, + int16_t maxver, + int *featuresp) { + return rd_kafka_broker_ApiVersion_supported0( + rkb, ApiKey, minver, maxver, featuresp, rd_true /* do_lock */); +} /** * @brief Set broker state. @@ -302,9 +330,10 @@ void rd_kafka_broker_set_state(rd_kafka_broker_t *rkb, int state) { rd_kafka_broker_state_names[rkb->rkb_state], rd_kafka_broker_state_names[state]); - if (rkb->rkb_source == RD_KAFKA_INTERNAL) { + if (rkb->rkb_source == RD_KAFKA_INTERNAL || + RD_KAFKA_BROKER_IS_LOGICAL(rkb)) { /* no-op */ - } else if (state == RD_KAFKA_BROKER_STATE_DOWN && + } else if (rd_kafka_broker_state_is_down(state) && !rkb->rkb_down_reported) { /* Propagate ALL_BROKERS_DOWN event if all brokers are * now down, unless we're terminating. 
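The all-brokers-down accounting changed in the next hunk compares the down-counter against the number of non-logical brokers, and reaching that threshold now also triggers a rebootstrap. A minimal single-threaded sketch of that bookkeeping (plain ints instead of rd_atomic32, hypothetical names, not the library's API):

#include <stdbool.h>

/* Simplified view of the client's broker counters: logical brokers
 * (bootstrap/coordinator aliases) are excluded from the "all down"
 * calculation, matching broker_cnt - logical_broker_cnt below. */
struct client_counters {
        int broker_cnt;         /* all brokers, including logical ones */
        int logical_broker_cnt; /* logical (aliased) brokers */
        int broker_down_cnt;    /* real brokers currently down */
};

/* Record one real broker going down; returns true when this was the
 * last one up, i.e. the point where the client would raise
 * ALL_BROKERS_DOWN and, with this change, trigger a rebootstrap. */
static bool broker_went_down(struct client_counters *c) {
        return ++c->broker_down_cnt ==
               c->broker_cnt - c->logical_broker_cnt;
}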
@@ -313,16 +342,17 @@ void rd_kafka_broker_set_state(rd_kafka_broker_t *rkb, int state) { if (rd_atomic32_add(&rkb->rkb_rk->rk_broker_down_cnt, 1) == rd_atomic32_get(&rkb->rkb_rk->rk_broker_cnt) - rd_atomic32_get( - &rkb->rkb_rk->rk_broker_addrless_cnt) && - !rd_kafka_broker_is_addrless(rkb) && - !rd_kafka_terminating(rkb->rkb_rk)) + &rkb->rkb_rk->rk_logical_broker_cnt) && + !rd_kafka_terminating(rkb->rkb_rk)) { + rd_kafka_rebootstrap(rkb->rkb_rk); rd_kafka_op_err( rkb->rkb_rk, RD_KAFKA_RESP_ERR__ALL_BROKERS_DOWN, "%i/%i brokers are down", rd_atomic32_get(&rkb->rkb_rk->rk_broker_down_cnt), rd_atomic32_get(&rkb->rkb_rk->rk_broker_cnt) - rd_atomic32_get( - &rkb->rkb_rk->rk_broker_addrless_cnt)); + &rkb->rkb_rk->rk_logical_broker_cnt)); + } rkb->rkb_down_reported = 1; } else if (rd_kafka_broker_state_is_up(state) && @@ -335,24 +365,20 @@ void rd_kafka_broker_set_state(rd_kafka_broker_t *rkb, int state) { if (rd_kafka_broker_state_is_up(state) && !rd_kafka_broker_state_is_up(rkb->rkb_state)) { /* ~Down(!Up) -> Up */ - rd_atomic32_add(&rkb->rkb_rk->rk_broker_up_cnt, 1); + if (!RD_KAFKA_BROKER_IS_LOGICAL(rkb)) + rd_atomic32_add(&rkb->rkb_rk->rk_broker_up_cnt, + 1); trigger_monitors = rd_true; - if (RD_KAFKA_BROKER_IS_LOGICAL(rkb)) - rd_atomic32_add( - &rkb->rkb_rk->rk_logical_broker_up_cnt, 1); - } else if (rd_kafka_broker_state_is_up(rkb->rkb_state) && !rd_kafka_broker_state_is_up(state)) { /* Up -> ~Down(!Up) */ - rd_atomic32_sub(&rkb->rkb_rk->rk_broker_up_cnt, 1); + if (!RD_KAFKA_BROKER_IS_LOGICAL(rkb)) + rd_atomic32_sub(&rkb->rkb_rk->rk_broker_up_cnt, + 1); trigger_monitors = rd_true; - - if (RD_KAFKA_BROKER_IS_LOGICAL(rkb)) - rd_atomic32_sub( - &rkb->rkb_rk->rk_logical_broker_up_cnt, 1); } /* If the connection or connection attempt failed and there @@ -554,6 +580,7 @@ void rd_kafka_broker_fail(rd_kafka_broker_t *rkb, va_list ap; rd_kafka_bufq_t tmpq_waitresp, tmpq; int old_state; + rd_kafka_toppar_t *rktp; rd_kafka_assert(rkb->rkb_rk, thrd_is_current(rkb->rkb_thread)); @@ -572,6 +599,8 @@ void rd_kafka_broker_fail(rd_kafka_broker_t *rkb, rkb->rkb_recv_buf = NULL; } + rkb->rkb_reauth_in_progress = rd_false; + va_start(ap, fmt); rd_kafka_broker_set_error(rkb, level, err, fmt, ap); va_end(ap); @@ -590,6 +619,11 @@ void rd_kafka_broker_fail(rd_kafka_broker_t *rkb, old_state = rkb->rkb_state; rd_kafka_broker_set_state(rkb, RD_KAFKA_BROKER_STATE_DOWN); + /* Stop any pending reauth timer, since a teardown/reconnect will + * require a new timer. */ + rd_kafka_timer_stop(&rkb->rkb_rk->rk_timers, &rkb->rkb_sasl_reauth_tmr, + 1 /*lock*/); + /* Unlock broker since a requeue will try to lock it. */ rd_kafka_broker_unlock(rkb); @@ -641,9 +675,46 @@ void rd_kafka_broker_fail(rd_kafka_broker_t *rkb, rd_kafka_bufq_dump(rkb, "BRKOUTBUFS", &rkb->rkb_outbufs); } + /* If this broker acts as the preferred (follower) replica for any + * partition, delegate the partition back to the leader. */ + TAILQ_FOREACH(rktp, &rkb->rkb_toppars, rktp_rkblink) { + rd_kafka_toppar_lock(rktp); + if (unlikely(rktp->rktp_broker != rkb)) { + /* Currently migrating away from this + * broker, skip. */ + rd_kafka_toppar_unlock(rktp); + continue; + } + rd_kafka_toppar_unlock(rktp); + + if (rktp->rktp_leader_id != rktp->rktp_broker_id) { + rd_kafka_toppar_delegate_to_leader(rktp); + } else if (rd_kafka_broker_termination_in_progress(rkb)) { + /* Remove `rktp_broker` and `rktp_leader` + * references in `rktp`, even if this broker + * is still the leader, to allow it to be + * decommissioned.
*/ + rd_kafka_toppar_undelegate(rktp); + rd_kafka_toppar_forget_leader(rktp); + } + } + + /* If the broker is the preferred telemetry broker, remove it. */ + /* TODO(milind): check if this is right. */ + mtx_lock(&rkb->rkb_rk->rk_telemetry.lock); + if (rkb->rkb_rk->rk_telemetry.preferred_broker == rkb) { + rd_kafka_dbg(rkb->rkb_rk, TELEMETRY, "TELBRKLOST", + "Lost telemetry broker %s due to state change", + rkb->rkb_name); + rd_kafka_broker_destroy( + rkb->rkb_rk->rk_telemetry.preferred_broker); + rkb->rkb_rk->rk_telemetry.preferred_broker = NULL; + } + mtx_unlock(&rkb->rkb_rk->rk_telemetry.lock); /* Query for topic leaders to quickly pick up on failover. */ if (err != RD_KAFKA_RESP_ERR__DESTROY && + err != RD_KAFKA_RESP_ERR__DESTROY_BROKER && old_state >= RD_KAFKA_BROKER_STATE_UP) rd_kafka_metadata_refresh_known_topics( rkb->rkb_rk, NULL, rd_true /*force*/, "broker down"); @@ -915,11 +986,22 @@ static void rd_kafka_broker_timeout_scan(rd_kafka_broker_t *rkb, rd_ts_t now) { char rttinfo[32]; /* Print average RTT (if avail) to help diagnose. */ rd_avg_calc(&rkb->rkb_avg_rtt, now); + rd_avg_calc( + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_rtt, + now); if (rkb->rkb_avg_rtt.ra_v.avg) rd_snprintf(rttinfo, sizeof(rttinfo), " (average rtt %.3fms)", (float)(rkb->rkb_avg_rtt.ra_v.avg / 1000.0f)); + else if (rkb->rkb_telemetry.rd_avg_current.rkb_avg_rtt + .ra_v.avg) + rd_snprintf( + rttinfo, sizeof(rttinfo), + " (average rtt %.3fms)", + (float)(rkb->rkb_telemetry.rd_avg_current + .rkb_avg_rtt.ra_v.avg / + 1000.0f)); else rttinfo[0] = 0; rd_kafka_broker_fail(rkb, LOG_ERR, @@ -1312,26 +1394,27 @@ void rd_kafka_brokers_broadcast_state_change(rd_kafka_t *rk) { * @locks rd_kafka_*lock() MUST be held * @locality any */ -static rd_kafka_broker_t * -rd_kafka_broker_random0(const char *func, - int line, - rd_kafka_t *rk, - rd_bool_t is_up, - int state, - int *filtered_cnt, - int (*filter)(rd_kafka_broker_t *rk, void *opaque), - void *opaque) { +rd_kafka_broker_t *rd_kafka_broker_random0(const char *func, + int line, + rd_kafka_t *rk, + rd_bool_t is_up, + int state, + int *filtered_cnt, + int (*filter)(rd_kafka_broker_t *rk, + void *opaque), + void *opaque) { rd_kafka_broker_t *rkb, *good = NULL; int cnt = 0; int fcnt = 0; TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { - if (RD_KAFKA_BROKER_IS_LOGICAL(rkb)) + if (rd_kafka_broker_or_instance_terminating(rkb) || + RD_KAFKA_BROKER_IS_LOGICAL(rkb)) continue; rd_kafka_broker_lock(rkb); if ((is_up && rd_kafka_broker_state_is_up(rkb->rkb_state)) || - (!is_up && (int)rkb->rkb_state == state)) { + (!is_up && (state == -1 || (int)rkb->rkb_state == state))) { if (filter && filter(rkb, opaque)) { /* Filtered out */ fcnt++; @@ -1355,11 +1438,6 @@ rd_kafka_broker_random0(const char *func, return good; } -#define rd_kafka_broker_random(rk, state, filter, opaque) \ - rd_kafka_broker_random0(__FUNCTION__, __LINE__, rk, rd_false, state, \ - NULL, filter, opaque) - - /** * @returns the broker (with refcnt increased) with the highest weight based * on the provided weighing function.
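rd_kafka_broker_random0() above selects a uniformly random eligible broker in a single pass over the broker list (the selection step itself is elided between these hunks, but is consistent with the cnt counter shown). A sketch of one standard way to do such a single-pass pick, reservoir sampling with k = 1, using hypothetical plain-list types:

#include <stdlib.h>

struct broker {
        struct broker *next;
        int usable; /* stand-in for the state and filter checks */
};

/* Return a uniformly random usable broker in one pass, or NULL.
 * The i-th eligible candidate replaces the current pick with
 * probability 1/i, which yields a uniform choice overall without
 * knowing the list length up front. */
static struct broker *pick_random_broker(struct broker *head) {
        struct broker *pick = NULL;
        int cnt = 0;
        struct broker *b;

        for (b = head; b; b = b->next) {
                if (!b->usable)
                        continue;
                if (rand() % ++cnt == 0)
                        pick = b;
        }
        return pick;
}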
@@ -1387,6 +1465,8 @@ rd_kafka_broker_weighted(rd_kafka_t *rk, TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { int weight; + if (rd_kafka_broker_or_instance_terminating(rkb)) + continue; rd_kafka_broker_lock(rkb); if (features && (rkb->rkb_features & features) != features) @@ -1440,12 +1520,11 @@ rd_kafka_broker_weighted(rd_kafka_t *rk, static int rd_kafka_broker_weight_usable(rd_kafka_broker_t *rkb) { int weight = 0; - if (!rd_kafka_broker_state_is_up(rkb->rkb_state)) + if (!rd_kafka_broker_state_is_up(rkb->rkb_state) || + RD_KAFKA_BROKER_IS_LOGICAL(rkb)) return 0; - weight += - 2000 * (rkb->rkb_nodeid != -1 && !RD_KAFKA_BROKER_IS_LOGICAL(rkb)); - weight += 10 * !RD_KAFKA_BROKER_IS_LOGICAL(rkb); + weight += 2000; if (likely(!rd_atomic32_get(&rkb->rkb_blocking_request_cnt))) { rd_ts_t tx_last = rd_atomic64_get(&rkb->rkb_c.ts_send); @@ -1454,6 +1533,9 @@ static int rd_kafka_broker_weight_usable(rd_kafka_broker_t *rkb) { 1000000); weight += 1; /* is not blocking */ + if (rkb->rkb_source == RD_KAFKA_LEARNED) + /* Prefer learned brokers */ + weight += 1000; /* Prefer least idle broker (based on last 10 minutes use) */ if (idle < 0) @@ -1668,6 +1750,9 @@ rd_list_t *rd_kafka_brokers_get_nodeids_async(rd_kafka_t *rk, } i = 0; TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { + if (rd_kafka_broker_or_instance_terminating(rkb)) + continue; + rd_kafka_broker_lock(rkb); if (rkb->rkb_nodeid != -1 && !RD_KAFKA_BROKER_IS_LOGICAL(rkb)) { @@ -1799,6 +1884,32 @@ static rd_kafka_buf_t *rd_kafka_waitresp_find(rd_kafka_broker_t *rkb, /* Convert ts_sent to RTT */ rkbuf->rkbuf_ts_sent = now - rkbuf->rkbuf_ts_sent; rd_avg_add(&rkb->rkb_avg_rtt, rkbuf->rkbuf_ts_sent); + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_rtt, + rkbuf->rkbuf_ts_sent); + + switch (rkbuf->rkbuf_reqhdr.ApiKey) { + case RD_KAFKAP_Fetch: + if (rkb->rkb_rk->rk_type == RD_KAFKA_CONSUMER) + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current + .rkb_avg_fetch_latency, + rkbuf->rkbuf_ts_sent); + break; + case RD_KAFKAP_OffsetCommit: + if (rkb->rkb_rk->rk_type == RD_KAFKA_CONSUMER) + rd_avg_add( + &rkb->rkb_rk->rk_telemetry.rd_avg_current + .rk_avg_commit_latency, + rkbuf->rkbuf_ts_sent); + break; + case RD_KAFKAP_Produce: + if (rkb->rkb_rk->rk_type == RD_KAFKA_PRODUCER) + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current + .rkb_avg_produce_latency, + rkbuf->rkbuf_ts_sent); + break; + default: + break; + } if (rkbuf->rkbuf_flags & RD_KAFKA_OP_F_BLOCKING && rd_atomic32_sub(&rkb->rkb_blocking_request_cnt, 1) == 1) @@ -1817,7 +1928,7 @@ static rd_kafka_buf_t *rd_kafka_waitresp_find(rd_kafka_broker_t *rkb, */ static int rd_kafka_req_response(rd_kafka_broker_t *rkb, rd_kafka_buf_t *rkbuf) { - rd_kafka_buf_t *req; + rd_kafka_buf_t *req = NULL; int log_decode_errors = LOG_ERR; rd_kafka_assert(rkb->rkb_rk, thrd_is_current(rkb->rkb_thread)); @@ -2110,7 +2221,7 @@ rd_kafka_broker_reconnect_backoff(const rd_kafka_broker_t *rkb, rd_ts_t now) { static int rd_ut_reconnect_backoff(void) { rd_kafka_broker_t rkb = RD_ZERO_INIT; rd_kafka_conf_t conf = {.reconnect_backoff_ms = 10, - .reconnect_backoff_max_ms = 90}; + .reconnect_backoff_max_ms = 90}; rd_ts_t now = 1000000; int backoff; @@ -2219,8 +2330,10 @@ static int rd_kafka_broker_connect(rd_kafka_broker_t *rkb) { * @locality Broker thread */ void rd_kafka_broker_connect_up(rd_kafka_broker_t *rkb) { + int features; - rkb->rkb_max_inflight = rkb->rkb_rk->rk_conf.max_inflight; + rkb->rkb_max_inflight = rkb->rkb_rk->rk_conf.max_inflight; + rkb->rkb_reauth_in_progress = rd_false; rd_kafka_broker_lock(rkb); 
rd_kafka_broker_set_state(rkb, RD_KAFKA_BROKER_STATE_UP); @@ -2233,6 +2346,19 @@ void rd_kafka_broker_connect_up(rd_kafka_broker_t *rkb) { NULL, rkb, rd_false /*dont force*/, "connected") == RD_KAFKA_RESP_ERR__UNKNOWN_TOPIC) rd_kafka_metadata_refresh_brokers(NULL, rkb, "connected"); + + if (rd_kafka_broker_ApiVersion_supported( + rkb, RD_KAFKAP_GetTelemetrySubscriptions, 0, 0, &features) != + -1 && + rkb->rkb_source == RD_KAFKA_LEARNED && + rkb->rkb_rk->rk_conf.enable_metrics_push) { + rd_kafka_t *rk = rkb->rkb_rk; + rd_kafka_op_t *rko = + rd_kafka_op_new(RD_KAFKA_OP_SET_TELEMETRY_BROKER); + rd_kafka_broker_keep(rkb); + rko->rko_u.telemetry_broker.rkb = rkb; + rd_kafka_q_enq(rk->rk_ops, rko); + } } @@ -2258,7 +2384,7 @@ static void rd_kafka_broker_handle_SaslHandshake(rd_kafka_t *rk, char *mechs = "(n/a)"; size_t msz, mof = 0; - if (err == RD_KAFKA_RESP_ERR__DESTROY) + if (rd_kafka_broker_is_any_err_destroy(err)) return; if (err) @@ -2437,7 +2563,7 @@ static void rd_kafka_broker_handle_ApiVersion(rd_kafka_t *rk, size_t api_cnt = 0; int16_t retry_ApiVersion = -1; - if (err == RD_KAFKA_RESP_ERR__DESTROY) + if (rd_kafka_broker_is_any_err_destroy(err)) return; err = rd_kafka_handle_ApiVersion(rk, rkb, err, rkbuf, request, &apis, @@ -2771,6 +2897,10 @@ int rd_kafka_send(rd_kafka_broker_t *rkb) { /* Add to outbuf_latency averager */ rd_avg_add(&rkb->rkb_avg_outbuf_latency, rkbuf->rkbuf_ts_sent - rkbuf->rkbuf_ts_enq); + rd_avg_add( + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_outbuf_latency, + rkbuf->rkbuf_ts_sent - rkbuf->rkbuf_ts_enq); + if (rkbuf->rkbuf_flags & RD_KAFKA_OP_F_BLOCKING && rd_atomic32_add(&rkb->rkb_blocking_request_cnt, 1) == 1) @@ -2796,6 +2926,7 @@ int rd_kafka_send(rd_kafka_broker_t *rkb) { */ void rd_kafka_broker_buf_retry(rd_kafka_broker_t *rkb, rd_kafka_buf_t *rkbuf) { + int64_t backoff = 0; /* Restore original replyq since replyq.q will have been NULLed * by buf_callback()/replyq_enq(). */ if (!rkbuf->rkbuf_replyq.q && rkbuf->rkbuf_orig_replyq.q) { @@ -2823,9 +2954,24 @@ void rd_kafka_broker_buf_retry(rd_kafka_broker_t *rkb, rd_kafka_buf_t *rkbuf) { rkb->rkb_rk->rk_conf.retry_backoff_ms); rd_atomic64_add(&rkb->rkb_c.tx_retries, 1); + /* In some cases, failed Produce requests do not increment the retry + * count, see rd_kafka_handle_Produce_error. */ + if (rkbuf->rkbuf_retries > 0) + backoff = (1 << (rkbuf->rkbuf_retries - 1)) * + (rkb->rkb_rk->rk_conf.retry_backoff_ms); + else + backoff = rkb->rkb_rk->rk_conf.retry_backoff_ms; - rkbuf->rkbuf_ts_retry = - rd_clock() + (rkb->rkb_rk->rk_conf.retry_backoff_ms * 1000); + /* We are multiplying by 10 as (backoff_ms * percent * 1000)/100 -> + * backoff_ms * jitter * 10 */ + backoff = rd_jitter(100 - RD_KAFKA_RETRY_JITTER_PERCENT, + 100 + RD_KAFKA_RETRY_JITTER_PERCENT) * + backoff * 10; + + if (backoff > rkb->rkb_rk->rk_conf.retry_backoff_max_ms * 1000) + backoff = rkb->rkb_rk->rk_conf.retry_backoff_max_ms * 1000; + + rkbuf->rkbuf_ts_retry = rd_clock() + backoff; /* Precaution: time out the request if it hasn't moved from the * retry queue within the retry interval (such as when the broker is * down). 
*/ @@ -2878,9 +3024,10 @@ static void rd_kafka_broker_retry_bufs_move(rd_kafka_broker_t *rkb, * To avoid extra iterations, the \p err and \p status are set on * the message as they are popped off the OP_DR msgq in rd_kafka_poll() et.al */ -void rd_kafka_dr_msgq(rd_kafka_topic_t *rkt, - rd_kafka_msgq_t *rkmq, - rd_kafka_resp_err_t err) { +void rd_kafka_dr_msgq0(rd_kafka_topic_t *rkt, + rd_kafka_msgq_t *rkmq, + rd_kafka_resp_err_t err, + const rd_kafka_Produce_result_t *presult) { rd_kafka_t *rk = rkt->rkt_rk; if (unlikely(rd_kafka_msgq_len(rkmq) == 0)) @@ -2891,7 +3038,11 @@ void rd_kafka_dr_msgq(rd_kafka_topic_t *rkt, rd_kafka_msgq_len(rkmq)); /* Call on_acknowledgement() interceptors */ - rd_kafka_interceptors_on_acknowledgement_queue(rk, rkmq, err); + rd_kafka_interceptors_on_acknowledgement_queue( + rk, rkmq, + (presult && presult->record_errors_cnt > 1) + ? RD_KAFKA_RESP_ERR_NO_ERROR + : err); if (rk->rk_drmode != RD_KAFKA_DR_MODE_NONE && (!rk->rk_conf.dr_err_only || err)) { @@ -2901,6 +3052,9 @@ void rd_kafka_dr_msgq(rd_kafka_topic_t *rkt, rko = rd_kafka_op_new(RD_KAFKA_OP_DR); rko->rko_err = err; rko->rko_u.dr.rkt = rd_kafka_topic_keep(rkt); + if (presult) + rko->rko_u.dr.presult = + rd_kafka_Produce_result_copy(presult); rd_kafka_msgq_init(&rko->rko_u.dr.msgq); /* Move all messages to op's msgq */ @@ -2961,53 +3115,6 @@ void rd_kafka_dr_implicit_ack(rd_kafka_broker_t *rkb, rd_kafka_dr_msgq(rktp->rktp_rkt, &acked, RD_KAFKA_RESP_ERR_NO_ERROR); } - - -/** - * @brief Map existing partitions to this broker using the - * toppar's leader_id. Only undelegated partitions - * matching this broker are mapped. - * - * @locks none - * @locality any - */ -static void rd_kafka_broker_map_partitions(rd_kafka_broker_t *rkb) { - rd_kafka_t *rk = rkb->rkb_rk; - rd_kafka_topic_t *rkt; - int cnt = 0; - - if (rkb->rkb_nodeid == -1 || RD_KAFKA_BROKER_IS_LOGICAL(rkb)) - return; - - rd_kafka_rdlock(rk); - TAILQ_FOREACH(rkt, &rk->rk_topics, rkt_link) { - int i; - - rd_kafka_topic_wrlock(rkt); - for (i = 0; i < rkt->rkt_partition_cnt; i++) { - rd_kafka_toppar_t *rktp = rkt->rkt_p[i]; - - /* Only map undelegated partitions matching this - * broker*/ - rd_kafka_toppar_lock(rktp); - if (rktp->rktp_leader_id == rkb->rkb_nodeid && - !(rktp->rktp_broker && rktp->rktp_next_broker)) { - rd_kafka_toppar_broker_update( - rktp, rktp->rktp_leader_id, rkb, - "broker node information updated"); - cnt++; - } - rd_kafka_toppar_unlock(rktp); - } - rd_kafka_topic_wrunlock(rkt); - } - rd_kafka_rdunlock(rk); - - rd_rkb_dbg(rkb, TOPIC | RD_KAFKA_DBG_BROKER, "LEADER", - "Mapped %d partition(s) to broker", cnt); -} - - /** * @brief Broker id comparator */ @@ -3048,6 +3155,10 @@ static void rd_kafka_broker_prepare_destroy(rd_kafka_broker_t *rkb) { rd_kafka_broker_monitor_del(&rkb->rkb_coord_monitor); } +static rd_kafka_resp_err_t rd_kafka_broker_destroy_error(rd_kafka_t *rk) { + return rd_kafka_terminating(rk) ? 
RD_KAFKA_RESP_ERR__DESTROY + : RD_KAFKA_RESP_ERR__DESTROY_BROKER; +} /** * @brief Serve a broker op (an op posted by another thread to be handled by @@ -3067,7 +3178,7 @@ rd_kafka_broker_op_serve(rd_kafka_broker_t *rkb, rd_kafka_op_t *rko) { switch (rko->rko_type) { case RD_KAFKA_OP_NODE_UPDATE: { - enum { _UPD_NAME = 0x1, _UPD_ID = 0x2 } updated = 0; + rd_bool_t updated = rd_false; char brokername[RD_KAFKA_NODENAME_SIZE]; /* Need kafka_wrlock for updating rk_broker_by_id */ @@ -3081,31 +3192,7 @@ rd_kafka_broker_op_serve(rd_kafka_broker_t *rkb, rd_kafka_op_t *rko) { rd_strlcpy(rkb->rkb_nodename, rko->rko_u.node.nodename, sizeof(rkb->rkb_nodename)); rkb->rkb_nodename_epoch++; - updated |= _UPD_NAME; - } - - if (rko->rko_u.node.nodeid != -1 && - !RD_KAFKA_BROKER_IS_LOGICAL(rkb) && - rko->rko_u.node.nodeid != rkb->rkb_nodeid) { - int32_t old_nodeid = rkb->rkb_nodeid; - rd_rkb_dbg(rkb, BROKER, "UPDATE", - "NodeId changed from %" PRId32 - " to %" PRId32, - rkb->rkb_nodeid, rko->rko_u.node.nodeid); - - rkb->rkb_nodeid = rko->rko_u.node.nodeid; - - /* Update system thread name */ - rd_kafka_set_thread_sysname("rdk:broker%" PRId32, - rkb->rkb_nodeid); - - /* Update broker_by_id sorted list */ - if (old_nodeid == -1) - rd_list_add(&rkb->rkb_rk->rk_broker_by_id, rkb); - rd_list_sort(&rkb->rkb_rk->rk_broker_by_id, - rd_kafka_broker_cmp_by_id); - - updated |= _UPD_ID; + updated = rd_true; } rd_kafka_mk_brokername(brokername, sizeof(brokername), @@ -3124,22 +3211,10 @@ rd_kafka_broker_op_serve(rd_kafka_broker_t *rkb, rd_kafka_op_t *rko) { rd_kafka_broker_unlock(rkb); rd_kafka_wrunlock(rkb->rkb_rk); - if (updated & _UPD_NAME) + if (updated) { rd_kafka_broker_fail(rkb, LOG_DEBUG, RD_KAFKA_RESP_ERR__TRANSPORT, "Broker hostname updated"); - else if (updated & _UPD_ID) { - /* Map existing partitions to this broker. */ - rd_kafka_broker_map_partitions(rkb); - - /* If broker is currently in state up we need - * to trigger a state change so it exits its - * state&type based .._serve() loop. */ - rd_kafka_broker_lock(rkb); - if (rkb->rkb_state == RD_KAFKA_BROKER_STATE_UP) - rd_kafka_broker_set_state( - rkb, RD_KAFKA_BROKER_STATE_UPDATE); - rd_kafka_broker_unlock(rkb); } rd_kafka_brokers_broadcast_state_change(rkb->rkb_rk); @@ -3168,17 +3243,20 @@ rd_kafka_broker_op_serve(rd_kafka_broker_t *rkb, rd_kafka_op_t *rko) { rd_kafka_toppar_lock(rktp); /* Abort join if instance is terminating */ - if (rd_kafka_terminating(rkb->rkb_rk) || + if (rd_kafka_broker_or_instance_terminating(rkb) || (rktp->rktp_flags & RD_KAFKA_TOPPAR_F_REMOVE)) { - rd_rkb_dbg(rkb, BROKER | RD_KAFKA_DBG_TOPIC, "TOPBRK", - "Topic %s [%" PRId32 - "]: not joining broker: " - "%s", - rktp->rktp_rkt->rkt_topic->str, - rktp->rktp_partition, - rd_kafka_terminating(rkb->rkb_rk) - ? "instance is terminating" - : "partition removed"); + rd_rkb_dbg( + rkb, BROKER | RD_KAFKA_DBG_TOPIC, "TOPBRK", + "Topic %s [%" PRId32 + "]: not joining broker: " + "%s", + rktp->rktp_rkt->rkt_topic->str, + rktp->rktp_partition, + rd_kafka_terminating(rkb->rkb_rk) + ? "instance is terminating" + : rd_kafka_broker_termination_in_progress(rkb) + ? 
"broker is terminating" + : "partition removed"); rd_kafka_broker_destroy(rktp->rktp_next_broker); rktp->rktp_next_broker = NULL; @@ -3246,6 +3324,8 @@ rd_kafka_broker_op_serve(rd_kafka_broker_t *rkb, rd_kafka_op_t *rko) { "finish before producing to " "new leader"); } + } else if (rkb->rkb_rk->rk_type == RD_KAFKA_CONSUMER) { + rktp->rktp_ts_fetch_backoff = 0; } rd_kafka_broker_destroy(rktp->rktp_next_broker); @@ -3356,6 +3436,11 @@ rd_kafka_broker_op_serve(rd_kafka_broker_t *rkb, rd_kafka_op_t *rko) { : (topic_err ? topic_err : RD_KAFKA_RESP_ERR__UNKNOWN_PARTITION)); + + if (rkb->rkb_rk->rk_type == RD_KAFKA_CONSUMER) { + rd_kafka_toppar_purge_internal_fetch_queue_maybe( + rktp); + } } rd_kafka_toppar_unlock(rktp); @@ -3380,10 +3465,13 @@ rd_kafka_broker_op_serve(rd_kafka_broker_t *rkb, rd_kafka_op_t *rko) { * and trigger a state change. * This makes sure any eonce dependent on state changes * are triggered. */ - rd_kafka_broker_fail(rkb, LOG_DEBUG, RD_KAFKA_RESP_ERR__DESTROY, - "Client is terminating"); + rd_kafka_broker_fail(rkb, LOG_DEBUG, + rd_kafka_broker_destroy_error(rkb->rkb_rk), + "Decommissioning this broker"); rd_kafka_broker_prepare_destroy(rkb); + /* Release main thread reference here */ + rd_kafka_broker_destroy(rkb); wakeup = rd_true; break; @@ -3434,6 +3522,20 @@ rd_kafka_broker_op_serve(rd_kafka_broker_t *rkb, rd_kafka_op_t *rko) { wakeup = rd_true; break; + case RD_KAFKA_OP_SASL_REAUTH: + rd_rkb_dbg(rkb, BROKER, "REAUTH", "Received REAUTH op"); + + /* We don't need a lock for rkb_max_inflight. It's changed only + * on the broker thread. */ + rkb->rkb_max_inflight = 1; + + rd_kafka_broker_lock(rkb); + rd_kafka_broker_set_state(rkb, RD_KAFKA_BROKER_STATE_REAUTH); + rd_kafka_broker_unlock(rkb); + + wakeup = rd_true; + break; + default: rd_kafka_assert(rkb->rkb_rk, !*"unhandled op type"); break; @@ -3487,7 +3589,7 @@ rd_kafka_broker_ops_io_serve(rd_kafka_broker_t *rkb, rd_ts_t abs_timeout) { rd_ts_t now; rd_bool_t wakeup; - if (unlikely(rd_kafka_terminating(rkb->rkb_rk))) + if (unlikely(rd_kafka_broker_or_instance_terminating(rkb))) abs_timeout = rd_clock() + 1000; else if (unlikely(rd_kafka_broker_needs_connection(rkb))) abs_timeout = RD_POLL_NOWAIT; @@ -4326,7 +4428,7 @@ static RD_INLINE void rd_kafka_broker_idle_check(rd_kafka_broker_t *rkb) { static void rd_kafka_broker_serve(rd_kafka_broker_t *rkb, int timeout_ms) { rd_ts_t abs_timeout; - if (unlikely(rd_kafka_terminating(rkb->rkb_rk) || + if (unlikely(rd_kafka_broker_or_instance_terminating(rkb) || timeout_ms == RD_POLL_NOWAIT)) timeout_ms = 1; else if (timeout_ms == RD_POLL_INFINITE) @@ -4378,6 +4480,7 @@ rd_kafka_broker_addresses_exhausted(const rd_kafka_broker_t *rkb) { static int rd_kafka_broker_thread_main(void *arg) { rd_kafka_broker_t *rkb = arg; rd_kafka_t *rk = rkb->rkb_rk; + rd_kafka_op_t *terminate_op; rd_kafka_set_thread_name("%s", rkb->rkb_name); rd_kafka_set_thread_sysname("rdk:broker%" PRId32, rkb->rkb_nodeid); @@ -4444,8 +4547,11 @@ static int rd_kafka_broker_thread_main(void *arg) { break; } - if (unlikely(rd_kafka_terminating(rkb->rkb_rk))) + if (unlikely( + rd_kafka_broker_or_instance_terminating(rkb))) { rd_kafka_broker_serve(rkb, 1000); + break; + } if (!rd_kafka_sasl_ready(rkb->rkb_rk)) { /* SASL provider not yet ready. 
*/ @@ -4511,8 +4617,15 @@ static int rd_kafka_broker_thread_main(void *arg) { rd_kafka_broker_addresses_exhausted(rkb)) rd_kafka_broker_update_reconnect_backoff( rkb, &rkb->rkb_rk->rk_conf, rd_clock()); + /* If we haven't made progress from the last state, and + * if we have exceeded + * socket_connection_setup_timeout_ms, then error out. + * Don't error out in case this is a reauth, for which + * socket_connection_setup_timeout_ms is not + * applicable. */ else if ( rkb->rkb_state == orig_state && + !rkb->rkb_reauth_in_progress && rd_clock() >= (rkb->rkb_ts_connect + (rd_ts_t)rk->rk_conf @@ -4527,30 +4640,37 @@ static int rd_kafka_broker_thread_main(void *arg) { break; - case RD_KAFKA_BROKER_STATE_UPDATE: - /* FALLTHRU */ + case RD_KAFKA_BROKER_STATE_REAUTH: + /* Since we've already authenticated once, the provider + * should be ready. */ + rd_assert(rd_kafka_sasl_ready(rkb->rkb_rk)); + + /* Since we aren't disconnecting, the transport isn't + * destroyed, and as a consequence, some of the SASL + * state leaks unless we destroy it before the reauth. + */ + rd_kafka_sasl_close(rkb->rkb_transport); + + rkb->rkb_reauth_in_progress = rd_true; + + rd_kafka_broker_connect_auth(rkb); + break; + case RD_KAFKA_BROKER_STATE_UP: rd_kafka_broker_serve(rkb, rd_kafka_max_block_ms); - - if (rkb->rkb_state == RD_KAFKA_BROKER_STATE_UPDATE) { - rd_kafka_broker_lock(rkb); - rd_kafka_broker_set_state( - rkb, RD_KAFKA_BROKER_STATE_UP); - rd_kafka_broker_unlock(rkb); - } break; } - if (rd_kafka_terminating(rkb->rkb_rk)) { + if (rd_kafka_broker_or_instance_terminating(rkb)) { /* Handle is terminating: fail the send+retry queue * to speed up termination, otherwise we'll * need to wait for request timeouts. */ r = rd_kafka_broker_bufq_timeout_scan( rkb, 0, &rkb->rkb_outbufs, NULL, -1, - RD_KAFKA_RESP_ERR__DESTROY, 0, NULL, 0); + rd_kafka_broker_destroy_error(rk), 0, NULL, 0); r += rd_kafka_broker_bufq_timeout_scan( rkb, 0, &rkb->rkb_retrybufs, NULL, -1, - RD_KAFKA_RESP_ERR__DESTROY, 0, NULL, 0); + rd_kafka_broker_destroy_error(rk), 0, NULL, 0); rd_rkb_dbg( rkb, BROKER, "TERMINATE", "Handle is terminating in state %s: " @@ -4567,28 +4687,45 @@ static int rd_kafka_broker_thread_main(void *arg) { } } - if (rkb->rkb_source != RD_KAFKA_INTERNAL) { - rd_kafka_wrlock(rkb->rkb_rk); - TAILQ_REMOVE(&rkb->rkb_rk->rk_brokers, rkb, rkb_link); - if (rkb->rkb_nodeid != -1 && !RD_KAFKA_BROKER_IS_LOGICAL(rkb)) - rd_list_remove(&rkb->rkb_rk->rk_broker_by_id, rkb); - (void)rd_atomic32_sub(&rkb->rkb_rk->rk_broker_cnt, 1); - rd_kafka_wrunlock(rkb->rkb_rk); - } - - rd_kafka_broker_fail(rkb, LOG_DEBUG, RD_KAFKA_RESP_ERR__DESTROY, - "Broker handle is terminating"); - /* Disable and drain ops queue. * Simply purging the ops queue risks leaving dangling references * for ops such as PARTITION_JOIN/PARTITION_LEAVE where the broker * reference is not maintained in the rko (but in rktp_next_leader). - * #1596 */ + * #1596. + * Do this before failing the broker to make sure no buffers + * are enqueued after that. 
*/ rd_kafka_q_disable(rkb->rkb_ops); while (rd_kafka_broker_ops_serve(rkb, RD_POLL_NOWAIT)) ; - rd_kafka_broker_destroy(rkb); + rd_kafka_broker_fail(rkb, LOG_DEBUG, rd_kafka_broker_destroy_error(rk), + "Broker handle is terminating"); + + rd_rkb_dbg(rkb, BROKER, "TERMINATE", + "Handle terminates in state %s: " + "%d refcnts (%p), %d toppar(s), " + "%d active toppar(s), " + "%d outbufs, %d waitresps, %d retrybufs", + rd_kafka_broker_state_names[rkb->rkb_state], + rd_refcnt_get(&rkb->rkb_refcnt), &rkb->rkb_refcnt, + rkb->rkb_toppar_cnt, rkb->rkb_active_toppar_cnt, + (int)rd_kafka_bufq_cnt(&rkb->rkb_outbufs), + (int)rd_kafka_bufq_cnt(&rkb->rkb_waitresps), + (int)rd_kafka_bufq_cnt(&rkb->rkb_retrybufs)); + + rd_dassert(rkb->rkb_state == RD_KAFKA_BROKER_STATE_DOWN); + if (rkb->rkb_source != RD_KAFKA_INTERNAL) { + rd_kafka_wrlock(rkb->rkb_rk); + TAILQ_REMOVE(&rkb->rkb_rk->rk_brokers, rkb, rkb_link); + + if (RD_KAFKA_BROKER_IS_LOGICAL(rkb)) { + rd_atomic32_sub(&rkb->rkb_rk->rk_logical_broker_cnt, 1); + } else if (rkb->rkb_down_reported) { + rd_atomic32_sub(&rkb->rkb_rk->rk_broker_down_cnt, 1); + } + rd_atomic32_sub(&rkb->rkb_rk->rk_broker_cnt, 1); + rd_kafka_wrunlock(rkb->rkb_rk); + } #if WITH_SSL /* Remove OpenSSL per-thread error state to avoid memory leaks */ @@ -4604,6 +4741,15 @@ static int rd_kafka_broker_thread_main(void *arg) { rd_atomic32_sub(&rd_kafka_thread_cnt_curr, 1); + terminate_op = rd_kafka_op_new(RD_KAFKA_OP_TERMINATE); + terminate_op->rko_u.terminated.rkb = rkb; + terminate_op->rko_u.terminated.cb = + rd_kafka_decommissioned_broker_thread_join; + rd_kafka_q_enq(rk->rk_ops, terminate_op); + + /* Release broker thread reference here and call destroy final. */ + rd_kafka_broker_destroy(rkb); + return 0; } @@ -4648,6 +4794,27 @@ void rd_kafka_broker_destroy_final(rd_kafka_broker_t *rkb) { rd_avg_destroy(&rkb->rkb_avg_outbuf_latency); rd_avg_destroy(&rkb->rkb_avg_rtt); rd_avg_destroy(&rkb->rkb_avg_throttle); + rd_avg_destroy(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_rtt); + rd_avg_destroy(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_rtt); + rd_avg_destroy(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_throttle); + rd_avg_destroy(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_throttle); + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_outbuf_latency); + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_outbuf_latency); + + if (rkb->rkb_rk->rk_type == RD_KAFKA_CONSUMER) { + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_fetch_latency); + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_fetch_latency); + } else if (rkb->rkb_rk->rk_type == RD_KAFKA_PRODUCER) { + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_produce_latency); + rd_avg_destroy(&rkb->rkb_telemetry.rd_avg_rollover + .rkb_avg_produce_latency); + } + mtx_lock(&rkb->rkb_logname_lock); rd_free(rkb->rkb_logname); @@ -4655,6 +4822,9 @@ void rd_kafka_broker_destroy_final(rd_kafka_broker_t *rkb) { mtx_unlock(&rkb->rkb_logname_lock); mtx_destroy(&rkb->rkb_logname_lock); + rd_kafka_timer_stop(&rkb->rkb_rk->rk_timers, &rkb->rkb_sasl_reauth_tmr, + 1 /*lock*/); + mtx_destroy(&rkb->rkb_lock); rd_refcnt_destroy(&rkb->rkb_refcnt); @@ -4732,18 +4902,56 @@ rd_kafka_broker_t *rd_kafka_broker_add(rd_kafka_t *rk, rd_kafka_bufq_init(&rkb->rkb_retrybufs); rkb->rkb_ops = rd_kafka_q_new(rk); rd_avg_init(&rkb->rkb_avg_int_latency, RD_AVG_GAUGE, 0, 100 * 1000, 2, - rk->rk_conf.stats_interval_ms ? 
1 : 0); + rk->rk_conf.stats_interval_ms); rd_avg_init(&rkb->rkb_avg_outbuf_latency, RD_AVG_GAUGE, 0, 100 * 1000, - 2, rk->rk_conf.stats_interval_ms ? 1 : 0); + 2, rk->rk_conf.stats_interval_ms); rd_avg_init(&rkb->rkb_avg_rtt, RD_AVG_GAUGE, 0, 500 * 1000, 2, - rk->rk_conf.stats_interval_ms ? 1 : 0); + rk->rk_conf.stats_interval_ms); rd_avg_init(&rkb->rkb_avg_throttle, RD_AVG_GAUGE, 0, 5000 * 1000, 2, - rk->rk_conf.stats_interval_ms ? 1 : 0); + rk->rk_conf.stats_interval_ms); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_rtt, + RD_AVG_GAUGE, 0, 500 * 1000, 2, + rk->rk_conf.enable_metrics_push); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_rtt, + RD_AVG_GAUGE, 0, 500 * 1000, 2, + rk->rk_conf.enable_metrics_push); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_throttle, + RD_AVG_GAUGE, 0, 5000 * 1000, 2, + rk->rk_conf.enable_metrics_push); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_throttle, + RD_AVG_GAUGE, 0, 5000 * 1000, 2, + rk->rk_conf.enable_metrics_push); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_outbuf_latency, + RD_AVG_GAUGE, 0, 100 * 1000, 2, + rk->rk_conf.enable_metrics_push); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_outbuf_latency, + RD_AVG_GAUGE, 0, 100 * 1000, 2, + rk->rk_conf.enable_metrics_push); + + if (rk->rk_type == RD_KAFKA_CONSUMER) { + rd_avg_init( + &rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_fetch_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, + rk->rk_conf.enable_metrics_push); + rd_avg_init( + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_fetch_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, + rk->rk_conf.enable_metrics_push); + } else if (rk->rk_type == RD_KAFKA_PRODUCER) { + rd_avg_init( + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_produce_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init( + &rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_produce_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + } + rd_refcnt_init(&rkb->rkb_refcnt, 0); rd_kafka_broker_keep(rkb); /* rk_broker's refcount */ rkb->rkb_reconnect_backoff_ms = rk->rk_conf.reconnect_backoff_ms; rd_atomic32_init(&rkb->rkb_persistconn.coord, 0); + rd_atomic32_init(&rkb->termination_in_progress, 0); rd_atomic64_init(&rkb->rkb_c.ts_send, 0); rd_atomic64_init(&rkb->rkb_c.ts_recv, 0); @@ -4912,7 +5120,7 @@ rd_kafka_broker_t *rd_kafka_broker_add_logical(rd_kafka_t *rk, rd_assert(rkb && *"failed to create broker thread"); rd_kafka_wrunlock(rk); - rd_atomic32_add(&rk->rk_broker_addrless_cnt, 1); + rd_atomic32_add(&rk->rk_logical_broker_cnt, 1); rd_dassert(RD_KAFKA_BROKER_IS_LOGICAL(rkb)); rd_kafka_broker_keep(rkb); @@ -4971,14 +5179,6 @@ void rd_kafka_broker_set_nodename(rd_kafka_broker_t *rkb, rkb->rkb_nodename_epoch++; changed = rd_true; } - - if (rkb->rkb_nodeid != nodeid) { - rd_rkb_dbg(rkb, BROKER, "NODEID", - "Broker nodeid changed from %" PRId32 " to %" PRId32, - rkb->rkb_nodeid, nodeid); - rkb->rkb_nodeid = nodeid; - } - rd_kafka_broker_unlock(rkb); /* Update the log name to include (or exclude) the nodeid. 
@@ -4991,11 +5191,6 @@ void rd_kafka_broker_set_nodename(rd_kafka_broker_t *rkb, if (!changed) return; - if (!rd_kafka_broker_is_addrless(rkb)) - rd_atomic32_sub(&rkb->rkb_rk->rk_broker_addrless_cnt, 1); - else - rd_atomic32_add(&rkb->rkb_rk->rk_broker_addrless_cnt, 1); - /* Trigger a disconnect & reconnect */ rd_kafka_broker_schedule_connection(rkb); } @@ -5062,11 +5257,12 @@ static rd_kafka_broker_t *rd_kafka_broker_find(rd_kafka_t *rk, rd_kafka_mk_nodename(nodename, sizeof(nodename), name, port); TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { - if (RD_KAFKA_BROKER_IS_LOGICAL(rkb)) + if (rd_kafka_broker_or_instance_terminating(rkb) || + RD_KAFKA_BROKER_IS_LOGICAL(rkb)) continue; rd_kafka_broker_lock(rkb); - if (!rd_kafka_terminating(rk) && rkb->rkb_proto == proto && + if (rkb->rkb_proto == proto && !strcmp(rkb->rkb_nodename, nodename)) { rd_kafka_broker_keep(rkb); rd_kafka_broker_unlock(rkb); @@ -5191,6 +5387,31 @@ static int rd_kafka_broker_name_parse(rd_kafka_t *rk, return 0; } +/** + * @brief Add a broker from a string of type "[proto://]host[:port]" to the list + * of brokers. *cnt is increased by one if a broker was added, else not. + */ +static void rd_kafka_find_or_add_broker(rd_kafka_t *rk, + rd_kafka_secproto_t proto, + const char *host, + uint16_t port, + int *cnt) { + rd_kafka_broker_t *rkb = NULL; + + if ((rkb = rd_kafka_broker_find(rk, proto, host, port)) && + rkb->rkb_source == RD_KAFKA_CONFIGURED) { + (*cnt)++; + } else if (rd_kafka_broker_add(rk, RD_KAFKA_CONFIGURED, proto, host, + port, RD_KAFKA_NODEID_UA) != NULL) + (*cnt)++; + + /* If rd_kafka_broker_find returned a broker, its + * reference needs to be released. + * See issue #193 */ + if (rkb) + rd_kafka_broker_destroy(rkb); +} + /** * @brief Adds a (csv list of) broker(s). * Returns the number of brokers successfully added. * * @locality any thread * @locks none */ -int rd_kafka_brokers_add0(rd_kafka_t *rk, const char *brokerlist) { +int rd_kafka_brokers_add0(rd_kafka_t *rk, + const char *brokerlist, + rd_bool_t is_bootstrap_server_list) { char *s_copy = rd_strdup(brokerlist); char *s = s_copy; int cnt = 0; - rd_kafka_broker_t *rkb; - int pre_cnt = rd_atomic32_get(&rk->rk_broker_cnt); + int pre_cnt = rd_atomic32_get(&rk->rk_broker_cnt); + rd_sockaddr_inx_t *sinx; + rd_sockaddr_list_t *sockaddr_list; /* Parse comma-separated list of brokers. 
*/ while (*s) { uint16_t port; const char *host; + const char *err_str; + const char *resolved_FQDN; rd_kafka_secproto_t proto; if (*s == ',' || *s == ' ') { @@ -5221,20 +5447,43 @@ int rd_kafka_brokers_add0(rd_kafka_t *rk, const char *brokerlist) { break; rd_kafka_wrlock(rk); + if (is_bootstrap_server_list && + rk->rk_conf.client_dns_lookup == + RD_KAFKA_RESOLVE_CANONICAL_BOOTSTRAP_SERVERS_ONLY) { + rd_kafka_dbg(rk, ALL, "INIT", + "Canonicalizing bootstrap broker %s:%d", + host, port); + sockaddr_list = rd_getaddrinfo( + host, RD_KAFKA_PORT_STR, AI_ADDRCONFIG, + rk->rk_conf.broker_addr_family, SOCK_STREAM, + IPPROTO_TCP, rk->rk_conf.resolve_cb, + rk->rk_conf.opaque, &err_str); - if ((rkb = rd_kafka_broker_find(rk, proto, host, port)) && - rkb->rkb_source == RD_KAFKA_CONFIGURED) { - cnt++; - } else if (rd_kafka_broker_add(rk, RD_KAFKA_CONFIGURED, proto, - host, port, - RD_KAFKA_NODEID_UA) != NULL) - cnt++; + if (!sockaddr_list) { + rd_kafka_log(rk, LOG_WARNING, "BROKER", + "Failed to resolve '%s': %s", host, + err_str); + rd_kafka_wrunlock(rk); + continue; + } - /* If rd_kafka_broker_find returned a broker its - * reference needs to be released - * See issue #193 */ - if (rkb) - rd_kafka_broker_destroy(rkb); + RD_SOCKADDR_LIST_FOREACH(sinx, sockaddr_list) { + resolved_FQDN = rd_sockaddr2str( + sinx, RD_SOCKADDR2STR_F_RESOLVE); + rd_kafka_dbg( + rk, ALL, "INIT", + "Adding broker with resolved hostname %s", + resolved_FQDN); + + rd_kafka_find_or_add_broker( + rk, proto, resolved_FQDN, port, &cnt); + }; + + rd_sockaddr_list_destroy(sockaddr_list); + } else { + rd_kafka_find_or_add_broker(rk, proto, host, port, + &cnt); + } rd_kafka_wrunlock(rk); } @@ -5256,7 +5505,10 @@ int rd_kafka_brokers_add0(rd_kafka_t *rk, const char *brokerlist) { int rd_kafka_brokers_add(rd_kafka_t *rk, const char *brokerlist) { - return rd_kafka_brokers_add0(rk, brokerlist); + rd_kafka_wrlock(rk); + rd_list_add(&rk->additional_brokerlists, rd_strdup(brokerlist)); + rd_kafka_wrunlock(rk); + return rd_kafka_brokers_add0(rk, brokerlist, rd_false); } @@ -5294,12 +5546,6 @@ void rd_kafka_broker_update(rd_kafka_t *rk, * the hostname. */ if (strcmp(rkb->rkb_nodename, nodename)) needs_update = 1; - } else if ((rkb = rd_kafka_broker_find(rk, proto, mdb->host, - mdb->port))) { - /* Broker matched by hostname (but not by nodeid), - * update the nodeid. */ - needs_update = 1; - } else if ((rkb = rd_kafka_broker_add(rk, RD_KAFKA_LEARNED, proto, mdb->host, mdb->port, mdb->id))) { rd_kafka_broker_keep(rkb); @@ -5314,7 +5560,6 @@ void rd_kafka_broker_update(rd_kafka_t *rk, rko = rd_kafka_op_new(RD_KAFKA_OP_NODE_UPDATE); rd_strlcpy(rko->rko_u.node.nodename, nodename, sizeof(rko->rko_u.node.nodename)); - rko->rko_u.node.nodeid = mdb->id; /* Perform a blocking op request so that all * broker-related state, such as the rk broker list, * is up to date by the time this call returns. @@ -5335,24 +5580,12 @@ void rd_kafka_broker_update(rd_kafka_t *rk, * @returns the broker id, or RD_KAFKA_NODEID_UA if \p rkb is NULL. 
* * @locality any - * @locks_required none - * @locks_acquired rkb_lock */ int32_t rd_kafka_broker_id(rd_kafka_broker_t *rkb) { - int32_t broker_id; - if (unlikely(!rkb)) return RD_KAFKA_NODEID_UA; - /* Avoid locking if already on the broker thread */ - if (thrd_is_current(rkb->rkb_thread)) - return rkb->rkb_nodeid; - - rd_kafka_broker_lock(rkb); - broker_id = rkb->rkb_nodeid; - rd_kafka_broker_unlock(rkb); - - return broker_id; + return rkb->rkb_nodeid; } @@ -5437,6 +5670,36 @@ static int rd_kafka_broker_filter_never_connected(rd_kafka_broker_t *rkb, return rd_atomic32_get(&rkb->rkb_c.connects); } +/** + * @brief Filter out brokers that aren't learned ones. + */ +static int rd_kafka_broker_filter_learned(rd_kafka_broker_t *rkb, + void *opaque) { + return rkb->rkb_source != RD_KAFKA_LEARNED; +} + +/** + * @brief Filter out brokers that aren't learned ones or + * that have at least one connection attempt. + */ +static int +rd_kafka_broker_filter_learned_never_connected(rd_kafka_broker_t *rkb, + void *opaque) { + return rd_atomic32_get(&rkb->rkb_c.connects) || + rkb->rkb_source != RD_KAFKA_LEARNED; +} + +static void rd_kafka_connect_any_timer_cb(rd_kafka_timers_t *rkts, void *arg) { + const char *reason = (const char *)arg; + rd_kafka_t *rk = rkts->rkts_rk; + if (rd_kafka_terminating(rk)) + return; + + /* Acquire the read lock for `rd_kafka_connect_any` */ + rd_kafka_rdlock(rk); + rd_kafka_connect_any(rk, reason); + rd_kafka_rdunlock(rk); +} /** * @brief Sparse connections: @@ -5456,15 +5719,14 @@ void rd_kafka_connect_any(rd_kafka_t *rk, const char *reason) { * a specific purpose (group coordinator) and their connections * should not be reused for other purposes. * rd_kafka_broker_random() will not return LOGICAL brokers. */ - if (rd_atomic32_get(&rk->rk_broker_up_cnt) - - rd_atomic32_get(&rk->rk_logical_broker_up_cnt) > - 0 || + if (rd_atomic32_get(&rk->rk_broker_up_cnt) > 0 || rd_atomic32_get(&rk->rk_broker_cnt) - - rd_atomic32_get(&rk->rk_broker_addrless_cnt) == + rd_atomic32_get(&rk->rk_logical_broker_cnt) == 0) return; mtx_lock(&rk->rk_suppress.sparse_connect_lock); + suppr = rd_interval(&rk->rk_suppress.sparse_connect_random, rk->rk_conf.sparse_connect_intvl * 1000, 0); mtx_unlock(&rk->rk_suppress.sparse_connect_lock); @@ -5474,23 +5736,78 @@ void rd_kafka_connect_any(rd_kafka_t *rk, const char *reason) { "Not selecting any broker for cluster connection: " "still suppressed for %" PRId64 "ms: %s", -suppr / 1000, reason); + /* Retry after interval + 1ms has passed */ + rd_kafka_timer_start_oneshot( + &rk->rk_timers, &rk->rk_suppress.sparse_connect_random_tmr, + rd_false /* don't restart */, 1000LL - suppr, + rd_kafka_connect_any_timer_cb, (void *)reason); return; } - /* First pass: only match brokers never connected to, + /* In case there are no learned brokers never connected to, + * 90% of the time select a learned broker in INIT state. + * + * This avoids problems after re-bootstrapping that cause + * the bootstrap brokers to always be preferred + * given there are learned brokers that already connected and + * caused ALL_BROKERS_DOWN. + * + * If that happens, those learned brokers + * that already connected are never selected unless + * they disappear and re-appear as new brokers with 0 connects, + * so we have to assign a higher probability to them. + * + * Additionally, we cannot always prefer the learned + * brokers as their address could have changed and we need to + * connect to the bootstrap brokers to know that. 
+ * KIP-1102 `metadata.recovery.rebootstrap.trigger.ms` would + * be triggered in this case after 5 minutes, + * but that's a long time to wait. + */ + + /* First pass: only match learned brokers never connected to + * in state INIT, to try to exhaust + * the available brokers so that an + * ERR_ALL_BROKERS_DOWN error can be raised. */ + rkb = rd_kafka_broker_random( + rk, RD_KAFKA_BROKER_STATE_INIT, + rd_kafka_broker_filter_learned_never_connected, NULL); + +#if ENABLE_DEVEL == 1 + if (rkb) + rd_dassert(rkb->rkb_source == RD_KAFKA_LEARNED); +#endif + + if (!rkb && rd_jitter(0, 9) > 0) { /* 0.9 probability */ + /* Second pass: only match learned brokers + * in state INIT. */ + rkb = rd_kafka_broker_random(rk, RD_KAFKA_BROKER_STATE_INIT, + rd_kafka_broker_filter_learned, + NULL); + +#if ENABLE_DEVEL == 1 + if (rkb) + rd_dassert(rkb->rkb_source == RD_KAFKA_LEARNED); +#endif + } + + /* Third pass: only match brokers never connected to, * to try to exhaust the available brokers - * so that an ERR_ALL_BROKERS_DOWN error can be raised. */ - rkb = rd_kafka_broker_random(rk, RD_KAFKA_BROKER_STATE_INIT, - rd_kafka_broker_filter_never_connected, - NULL); - /* Second pass: match any non-connected/non-connecting broker. */ + * so that an ERR_ALL_BROKERS_DOWN error + * can be raised. */ + if (!rkb) + rkb = rd_kafka_broker_random( + rk, RD_KAFKA_BROKER_STATE_INIT, + rd_kafka_broker_filter_never_connected, NULL); + + /* Fourth pass: match any non-connected/non-connecting broker. */ if (!rkb) rkb = rd_kafka_broker_random(rk, RD_KAFKA_BROKER_STATE_INIT, NULL, NULL); if (!rkb) { /* No brokers matched: - * this happens if there are brokers in > INIT state, + * this happens if all brokers are in > INIT state, * in which case they're already connecting. */ rd_kafka_dbg(rk, BROKER | RD_KAFKA_DBG_GENERIC, "CONNECT", @@ -5696,7 +6013,6 @@ void rd_kafka_broker_active_toppar_del(rd_kafka_broker_t *rkb, */ void rd_kafka_broker_schedule_connection(rd_kafka_broker_t *rkb) { rd_kafka_op_t *rko; - rko = rd_kafka_op_new(RD_KAFKA_OP_CONNECT); rd_kafka_op_set_prio(rko, RD_KAFKA_PRIO_FLASH); rd_kafka_q_enq(rkb->rkb_ops, rko); @@ -5834,6 +6150,122 @@ void rd_kafka_broker_monitor_del(rd_kafka_broker_monitor_t *rkbmon) { rd_kafka_broker_destroy(rkb); } +/** + * @brief Starts the reauth timer for this broker. + * If connections_max_reauth_ms=0, then no timer is set. + * + * @locks none + * @locality broker thread + */ +void rd_kafka_broker_start_reauth_timer(rd_kafka_broker_t *rkb, + int64_t connections_max_reauth_ms) { + /* The timer should not already be started; if it is, we're about to + * schedule an extra reauth, but this shouldn't be a cause for failure + * in production use cases, so clear the timer. */ + if (rd_kafka_timer_is_started(&rkb->rkb_rk->rk_timers, + &rkb->rkb_sasl_reauth_tmr)) + rd_kafka_timer_stop(&rkb->rkb_rk->rk_timers, + &rkb->rkb_sasl_reauth_tmr, 1 /*lock*/); + + if (connections_max_reauth_ms == 0) + return; + + rd_kafka_timer_start_oneshot( + &rkb->rkb_rk->rk_timers, &rkb->rkb_sasl_reauth_tmr, rd_false, + connections_max_reauth_ms * 900 /* 90% of the value, in microseconds */, + rd_kafka_broker_start_reauth_cb, (void *)rkb); +} + +/** + * @brief Starts the reauth process for the broker rkb. 
+ * + * @locks none + * @locality main thread + */ +void rd_kafka_broker_start_reauth_cb(rd_kafka_timers_t *rkts, void *_rkb) { + rd_kafka_op_t *rko = NULL; + rd_kafka_broker_t *rkb = (rd_kafka_broker_t *)_rkb; + rd_dassert(rkb); + rko = rd_kafka_op_new(RD_KAFKA_OP_SASL_REAUTH); + rd_kafka_q_enq(rkb->rkb_ops, rko); +} + +int32_t *rd_kafka_brokers_learned_ids(rd_kafka_t *rk, size_t *cntp) { + rd_kafka_broker_t *rkb; + int32_t *ids, *p; + int32_t i; + + *cntp = 0; + rd_kafka_rdlock(rk); + ids = malloc(sizeof(*ids) * rd_list_cnt(&rk->rk_broker_by_id)); + p = ids; + RD_LIST_FOREACH(rkb, &rk->rk_broker_by_id, i) { + *p++ = rkb->rkb_nodeid; + (*cntp)++; + } + rd_kafka_rdunlock(rk); + + return ids; +} + +/** + * @brief Decommission a broker. + * + * @param rk Client instance. + * @param rkb Broker to decommission. + * @param wait_thrds Add the broker's thread to this list if not NULL. + * + * @locks rd_kafka_wrlock() is dropped and reacquired. + * + * Broker threads hold a refcount and detect when it reaches 1, and then + * decommission themselves. Callers can wait for this to happen by calling + * thrd_join() on elements of \p wait_thrds. Callers are responsible for + * managing the creation and destruction of \p wait_thrds, which can be NULL. + */ +void rd_kafka_broker_decommission(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + rd_list_t *wait_thrds) { + + if (rd_atomic32_get(&rkb->termination_in_progress) > 0) + return; + + rd_atomic32_add(&rkb->termination_in_progress, 1); + + /* Add broker's thread to wait_thrds list for later joining */ + if (wait_thrds) { + thrd_t *thrd = rd_malloc(sizeof(*thrd)); + *thrd = rkb->rkb_thread; + + rd_list_add(wait_thrds, thrd); + } + + rd_list_remove(&rk->rk_broker_by_id, rkb); + rd_kafka_wrunlock(rk); + + rd_kafka_dbg(rk, BROKER, "DESTROY", "Sending TERMINATE to %s", + rd_kafka_broker_name(rkb)); + +#ifndef _WIN32 + /* Interrupt IO threads to speed up termination. */ + if (rk->rk_conf.term_sig) + pthread_kill(rkb->rkb_thread, rk->rk_conf.term_sig); +#endif + + if (rk->rk_cgrp && rk->rk_cgrp->rkcg_curr_coord && + rk->rk_cgrp->rkcg_curr_coord == rkb) + /* If we're decommissioning the current coordinator handle, + * mark it as dead and decrease its reference count. */ + rd_kafka_cgrp_coord_dead(rk->rk_cgrp, + RD_KAFKA_RESP_ERR__DESTROY_BROKER, + "Group coordinator decommissioned"); + /* Send op to trigger queue/io wake-up. + * Broker thread will destroy this thread reference. + * WARNING: This is the last time we can read from rkb in this thread! */ + rd_kafka_q_enq(rkb->rkb_ops, rd_kafka_op_new(RD_KAFKA_OP_TERMINATE)); + + rd_kafka_wrlock(rk); +} + /** * @name Unit tests * @{ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_broker.h b/src/third_party/librdkafka/dist/src/rdkafka_broker.h index 1e454d4d718..5e8ea82f290 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_broker.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_broker.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012,2013 Magnus Edenhill + * Copyright (c) 2012,2022, Magnus Edenhill + * 2023 Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -50,10 +51,10 @@ typedef enum { /* Any state >= STATE_UP means the Kafka protocol layer * is operational (to some degree). 
*/ RD_KAFKA_BROKER_STATE_UP, - RD_KAFKA_BROKER_STATE_UPDATE, RD_KAFKA_BROKER_STATE_APIVERSION_QUERY, RD_KAFKA_BROKER_STATE_AUTH_HANDSHAKE, RD_KAFKA_BROKER_STATE_AUTH_REQ, + RD_KAFKA_BROKER_STATE_REAUTH, } rd_kafka_broker_state_t; /** @@ -80,8 +81,7 @@ typedef struct rd_kafka_broker_monitor_s { struct rd_kafka_broker_s { /* rd_kafka_broker_t */ TAILQ_ENTRY(rd_kafka_broker_s) rkb_link; - int32_t rkb_nodeid; /**< Broker Node Id. - * @locks rkb_lock */ + int32_t rkb_nodeid; /**< Broker Node Id, read only. */ #define RD_KAFKA_NODEID_UA -1 rd_sockaddr_list_t *rkb_rsal; @@ -191,6 +191,40 @@ struct rd_kafka_broker_s { /* rd_kafka_broker_t */ rd_atomic64_t ts_recv; /**< Timestamp of last receive */ } rkb_c; + struct { + struct { + int32_t connects; /**< Connection attempts, + * successful or not. */ + } rkb_historic_c; + + struct { + rd_avg_t rkb_avg_rtt; /* Current RTT avg */ + rd_avg_t rkb_avg_throttle; /* Current throttle avg */ + rd_avg_t + rkb_avg_outbuf_latency; /**< Current latency + * between buf_enq0 + * and writing to socket + */ + rd_avg_t rkb_avg_fetch_latency; /**< Current fetch + * latency avg */ + rd_avg_t rkb_avg_produce_latency; /**< Current produce + * latency avg */ + } rd_avg_current; + + struct { + rd_avg_t rkb_avg_rtt; /**< Rolled over RTT avg */ + rd_avg_t + rkb_avg_throttle; /**< Rolled over throttle avg */ + rd_avg_t rkb_avg_outbuf_latency; /**< Rolled over outbuf + * latency avg */ + rd_avg_t rkb_avg_fetch_latency; /**< Rolled over fetch + * latency avg */ + rd_avg_t + rkb_avg_produce_latency; /**< Rolled over produce + * latency avg */ + } rd_avg_rollover; + } rkb_telemetry; + int rkb_req_timeouts; /* Current value */ thrd_t rkb_thread; @@ -252,6 +286,9 @@ struct rd_kafka_broker_s { /* rd_kafka_broker_t */ /** Absolute time of last connection attempt. */ rd_ts_t rkb_ts_connect; + /** True if a reauthentication is in progress. */ + rd_bool_t rkb_reauth_in_progress; + /**< Persistent connection demand is tracked by * a counter for each type of demand. * The broker thread will maintain a persistent connection @@ -323,6 +360,12 @@ struct rd_kafka_broker_s { /* rd_kafka_broker_t */ rd_kafka_resp_err_t err; /**< Last error code */ int cnt; /**< Number of identical errors */ } rkb_last_err; + + + rd_kafka_timer_t rkb_sasl_reauth_tmr; + + /** > 0 if this broker thread is terminating */ + rd_atomic32_t termination_in_progress; }; #define rd_kafka_broker_keep(rkb) rd_refcnt_add(&(rkb)->rkb_refcnt) @@ -350,12 +393,28 @@ rd_kafka_broker_get_state(rd_kafka_broker_t *rkb) { /** - * @returns true if the broker state is UP or UPDATE + * @returns true if the broker state is UP */ -#define rd_kafka_broker_state_is_up(state) \ - ((state) == RD_KAFKA_BROKER_STATE_UP || \ - (state) == RD_KAFKA_BROKER_STATE_UPDATE) +#define rd_kafka_broker_state_is_up(state) ((state) == RD_KAFKA_BROKER_STATE_UP) +/** + * @returns true if the broker state is DOWN + */ +#define rd_kafka_broker_state_is_down(state) \ + ((state) == RD_KAFKA_BROKER_STATE_DOWN) + +/** + * @returns true if the error is a broker destroy error, because of + * termination or because of decommissioning. + */ +#define rd_kafka_broker_is_any_err_destroy(err) \ + ((err) == RD_KAFKA_RESP_ERR__DESTROY || \ + (err) == RD_KAFKA_RESP_ERR__DESTROY_BROKER) + + +#define rd_kafka_broker_or_instance_terminating(rkb) \ + (rd_kafka_broker_termination_in_progress(rkb) || \ + rd_kafka_terminating((rkb)->rkb_rk)) /** * @returns true if the broker connection is up, else false. 
@@ -368,6 +427,14 @@ rd_kafka_broker_is_up(rd_kafka_broker_t *rkb) { return rd_kafka_broker_state_is_up(state); } +/** + * @returns true if termination of this broker is in progress + * @locality any + */ +static RD_UNUSED RD_INLINE rd_bool_t +rd_kafka_broker_termination_in_progress(rd_kafka_broker_t *rkb) { + return rd_atomic32_get(&rkb->termination_in_progress) > 0; +} /** * @brief Broker comparator @@ -403,6 +470,13 @@ int16_t rd_kafka_broker_ApiVersion_supported(rd_kafka_broker_t *rkb, int16_t maxver, int *featuresp); +int16_t rd_kafka_broker_ApiVersion_supported0(rd_kafka_broker_t *rkb, + int16_t ApiKey, + int16_t minver, + int16_t maxver, + int *featuresp, + rd_bool_t do_lock); + rd_kafka_broker_t *rd_kafka_broker_find_by_nodeid0_fl(const char *func, int line, rd_kafka_t *rk, @@ -461,7 +535,9 @@ rd_kafka_broker_t *rd_kafka_broker_controller_async(rd_kafka_t *rk, int state, rd_kafka_enq_once_t *eonce); -int rd_kafka_brokers_add0(rd_kafka_t *rk, const char *brokerlist); +int rd_kafka_brokers_add0(rd_kafka_t *rk, + const char *brokerlist, + rd_bool_t is_bootstrap_server_list); void rd_kafka_broker_set_state(rd_kafka_broker_t *rkb, int state); void rd_kafka_broker_fail(rd_kafka_broker_t *rkb, @@ -507,9 +583,13 @@ void rd_kafka_broker_connect_done(rd_kafka_broker_t *rkb, const char *errstr); int rd_kafka_send(rd_kafka_broker_t *rkb); int rd_kafka_recv(rd_kafka_broker_t *rkb); -void rd_kafka_dr_msgq(rd_kafka_topic_t *rkt, - rd_kafka_msgq_t *rkmq, - rd_kafka_resp_err_t err); +#define rd_kafka_dr_msgq(rkt, rkmq, err) \ + rd_kafka_dr_msgq0(rkt, rkmq, err, NULL /*no produce result*/) + +void rd_kafka_dr_msgq0(rd_kafka_topic_t *rkt, + rd_kafka_msgq_t *rkmq, + rd_kafka_resp_err_t err, + const rd_kafka_Produce_result_t *presult); void rd_kafka_dr_implicit_ack(rd_kafka_broker_t *rkb, rd_kafka_toppar_t *rktp, @@ -558,6 +638,25 @@ int rd_kafka_brokers_wait_state_change_async(rd_kafka_t *rk, rd_kafka_enq_once_t *eonce); void rd_kafka_brokers_broadcast_state_change(rd_kafka_t *rk); +rd_kafka_broker_t *rd_kafka_broker_random0(const char *func, + int line, + rd_kafka_t *rk, + rd_bool_t is_up, + int state, + int *filtered_cnt, + int (*filter)(rd_kafka_broker_t *rk, + void *opaque), + void *opaque); + +#define rd_kafka_broker_random(rk, state, filter, opaque) \ + rd_kafka_broker_random0(__FUNCTION__, __LINE__, rk, rd_false, state, \ + NULL, filter, opaque) + +#define rd_kafka_broker_random_up(rk, filter, opaque) \ + rd_kafka_broker_random0(__FUNCTION__, __LINE__, rk, rd_true, \ + RD_KAFKA_BROKER_STATE_UP, NULL, filter, \ + opaque) + /** @@ -602,6 +701,15 @@ void rd_kafka_broker_monitor_add(rd_kafka_broker_monitor_t *rkbmon, void rd_kafka_broker_monitor_del(rd_kafka_broker_monitor_t *rkbmon); +void rd_kafka_broker_start_reauth_timer(rd_kafka_broker_t *rkb, + int64_t connections_max_reauth_ms); + +void rd_kafka_broker_start_reauth_cb(rd_kafka_timers_t *rkts, void *rkb); + +void rd_kafka_broker_decommission(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + rd_list_t *wait_thrds); + int unittest_broker(void); #endif /* _RDKAFKA_BROKER_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_buf.c b/src/third_party/librdkafka/dist/src/rdkafka_buf.c index 5a0e131e8b9..012835de088 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_buf.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_buf.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -37,11 +38,10 @@ void rd_kafka_buf_destroy_final(rd_kafka_buf_t *rkbuf) { case RD_KAFKAP_Metadata: if (rkbuf->rkbuf_u.Metadata.topics) rd_list_destroy(rkbuf->rkbuf_u.Metadata.topics); + if (rkbuf->rkbuf_u.Metadata.topic_ids) + rd_list_destroy(rkbuf->rkbuf_u.Metadata.topic_ids); if (rkbuf->rkbuf_u.Metadata.reason) rd_free(rkbuf->rkbuf_u.Metadata.reason); - if (rkbuf->rkbuf_u.Metadata.rko) - rd_kafka_op_reply(rkbuf->rkbuf_u.Metadata.rko, - RD_KAFKA_RESP_ERR__DESTROY); if (rkbuf->rkbuf_u.Metadata.decr) { /* Decrease metadata cache's full_.._sent state. */ mtx_lock(rkbuf->rkbuf_u.Metadata.decr_lock); @@ -120,6 +120,18 @@ rd_kafka_buf_t *rd_kafka_buf_new0(int segcnt, size_t size, int flags) { return rkbuf; } +/** + * @brief Upgrade request header to flexver by writing header tags. + */ +void rd_kafka_buf_upgrade_flexver_request(rd_kafka_buf_t *rkbuf) { + if (likely(!(rkbuf->rkbuf_flags & RD_KAFKA_OP_F_FLEXVER))) { + rkbuf->rkbuf_flags |= RD_KAFKA_OP_F_FLEXVER; + + /* Empty request header tags */ + rd_kafka_buf_write_i8(rkbuf, 0); + } +} + /** * @brief Create new request buffer with the request-header written (will @@ -165,12 +177,7 @@ rd_kafka_buf_t *rd_kafka_buf_new_request0(rd_kafka_broker_t *rkb, rd_kafka_buf_write_kstr(rkbuf, rkb->rkb_rk->rk_client_id); if (is_flexver) { - /* Must set flexver after writing the client id since - * it is still a standard non-compact string. */ - rkbuf->rkbuf_flags |= RD_KAFKA_OP_F_FLEXVER; - - /* Empty request header tags */ - rd_kafka_buf_write_i8(rkbuf, 0); + rd_kafka_buf_upgrade_flexver_request(rkbuf); } return rkbuf; @@ -234,6 +241,12 @@ void rd_kafka_bufq_init(rd_kafka_bufq_t *rkbufq) { rd_atomic32_init(&rkbufq->rkbq_msg_cnt, 0); } +static void rd_kafka_bufq_reset(rd_kafka_bufq_t *rkbufq) { + TAILQ_INIT(&rkbufq->rkbq_bufs); + rd_atomic32_set(&rkbufq->rkbq_cnt, 0); + rd_atomic32_set(&rkbufq->rkbq_msg_cnt, 0); +} + /** * Concat all buffers from 'src' to tail of 'dst' */ @@ -242,7 +255,7 @@ void rd_kafka_bufq_concat(rd_kafka_bufq_t *dst, rd_kafka_bufq_t *src) { (void)rd_atomic32_add(&dst->rkbq_cnt, rd_atomic32_get(&src->rkbq_cnt)); (void)rd_atomic32_add(&dst->rkbq_msg_cnt, rd_atomic32_get(&src->rkbq_msg_cnt)); - rd_kafka_bufq_init(src); + rd_kafka_bufq_reset(src); } /** @@ -383,7 +396,7 @@ int rd_kafka_buf_retry(rd_kafka_broker_t *rkb, rd_kafka_buf_t *rkbuf) { rd_assert(rd_buf_len(&rkbuf->rkbuf_buf) > 0); if (unlikely(!rkb || rkb->rkb_source == RD_KAFKA_INTERNAL || - rd_kafka_terminating(rkb->rkb_rk) || + rd_kafka_broker_or_instance_terminating(rkb) || rkbuf->rkbuf_retries + incr_retry > rkbuf->rkbuf_max_retries)) return 0; diff --git a/src/third_party/librdkafka/dist/src/rdkafka_buf.h b/src/third_party/librdkafka/dist/src/rdkafka_buf.h index 0552d895578..9682d099a05 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_buf.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_buf.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023 Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -48,21 +49,36 @@ typedef struct rd_tmpabuf_s { size_t of; char *buf; int failed; - int assert_on_fail; + rd_bool_t assert_on_fail; } rd_tmpabuf_t; /** - * @brief Allocate new tmpabuf with \p size bytes pre-allocated. + * @brief Initialize new tmpabuf of non-final \p size bytes. 
*/ static RD_UNUSED void -rd_tmpabuf_new(rd_tmpabuf_t *tab, size_t size, int assert_on_fail) { - tab->buf = rd_malloc(size); - tab->size = size; +rd_tmpabuf_new(rd_tmpabuf_t *tab, size_t size, rd_bool_t assert_on_fail) { + tab->buf = NULL; + tab->size = RD_ROUNDUP(size, 8); tab->of = 0; tab->failed = 0; tab->assert_on_fail = assert_on_fail; } +/** + * @brief Add a new allocation of \p _size bytes, + * rounded up to maximum word size, + * for \p _times times. + */ +#define rd_tmpabuf_add_alloc_times(_tab, _size, _times) \ + (_tab)->size += RD_ROUNDUP(_size, 8) * _times + +#define rd_tmpabuf_add_alloc(_tab, _size) \ + rd_tmpabuf_add_alloc_times(_tab, _size, 1) +/** + * @brief Finalize tmpabuf pre-allocating tab->size bytes. + */ +#define rd_tmpabuf_finalize(_tab) (_tab)->buf = rd_malloc((_tab)->size) + /** * @brief Free memory allocated by tmpabuf */ @@ -359,13 +375,19 @@ struct rd_kafka_buf_s { /* rd_kafka_buf_t */ union { struct { - rd_list_t *topics; /* Requested topics (char *) */ - char *reason; /* Textual reason */ - rd_kafka_op_t *rko; /* Originating rko with replyq - * (if any) */ + rd_list_t *topics; /* Requested topics (char *) */ + rd_list_t * + topic_ids; /* Requested topic ids rd_kafka_Uuid_t */ + char *reason; /* Textual reason */ rd_bool_t all_topics; /**< Full/All topics requested */ rd_bool_t cgrp_update; /**< Update cgrp with topic * status from response. */ + int32_t cgrp_subscription_version; + /**< Consumer group subscription version, to + * check before updating cgrp state. */ + rd_bool_t force_racks; /**< Force the returned metadata + * to contain partition to + * rack mapping. */ int *decr; /* Decrement this integer by one * when request is complete: @@ -503,7 +525,7 @@ struct rd_kafka_buf_s { /* rd_kafka_buf_t */ #define rd_kafka_buf_skip_to(rkbuf, pos) \ do { \ size_t __len1 = \ - (size_t)(pos)-rd_slice_offset(&(rkbuf)->rkbuf_reader); \ + (size_t)(pos) - rd_slice_offset(&(rkbuf)->rkbuf_reader); \ if (__len1 && \ !rd_slice_read(&(rkbuf)->rkbuf_reader, NULL, __len1)) \ rd_kafka_buf_check_len(rkbuf, __len1); \ @@ -682,6 +704,10 @@ struct rd_kafka_buf_s { /* rd_kafka_buf_t */ size_t _slen; \ char *_dst; \ rd_kafka_buf_read_str(rkbuf, &_kstr); \ + if (RD_KAFKAP_STR_IS_NULL(&_kstr)) { \ + dst = NULL; \ + break; \ + } \ _slen = RD_KAFKAP_STR_LEN(&_kstr); \ if (!(_dst = rd_tmpabuf_write(tmpabuf, _kstr.str, _slen + 1))) \ rd_kafka_buf_parse_fail( \ @@ -694,21 +720,44 @@ struct rd_kafka_buf_s { /* rd_kafka_buf_t */ } while (0) /** - * Skip a string. + * Skip a string without flexver. */ -#define rd_kafka_buf_skip_str(rkbuf) \ +#define rd_kafka_buf_skip_str_no_flexver(rkbuf) \ do { \ int16_t _slen; \ rd_kafka_buf_read_i16(rkbuf, &_slen); \ rd_kafka_buf_skip(rkbuf, RD_KAFKAP_STR_LEN0(_slen)); \ } while (0) -/* Read Kafka Bytes representation (4+N). - * The 'kbytes' will be updated to point to rkbuf data */ -#define rd_kafka_buf_read_bytes(rkbuf, kbytes) \ +/** + * Skip a string (generic). + */ +#define rd_kafka_buf_skip_str(rkbuf) \ do { \ - int _klen; \ - rd_kafka_buf_read_i32a(rkbuf, _klen); \ + if ((rkbuf)->rkbuf_flags & RD_KAFKA_OP_F_FLEXVER) { \ + uint64_t _uva; \ + rd_kafka_buf_read_uvarint(rkbuf, &_uva); \ + rd_kafka_buf_skip( \ + rkbuf, RD_KAFKAP_STR_LEN0(((int64_t)_uva) - 1)); \ + } else { \ + rd_kafka_buf_skip_str_no_flexver(rkbuf); \ + } \ + } while (0) +/** + * Read Kafka COMPACT_BYTES representation (VARINT+N) or + * standard BYTES representation(4+N). + * The 'kbytes' will be updated to point to rkbuf data. 
+ */ +#define rd_kafka_buf_read_kbytes(rkbuf, kbytes) \ do { \ - int _klen; \ - rd_kafka_buf_read_i32a(rkbuf, _klen); \ + int32_t _klen; \ + if (!(rkbuf->rkbuf_flags & RD_KAFKA_OP_F_FLEXVER)) { \ + rd_kafka_buf_read_i32a(rkbuf, _klen); \ + } else { \ + uint64_t _uva; \ + rd_kafka_buf_read_uvarint(rkbuf, &_uva); \ + _klen = ((int32_t)_uva) - 1; \ + } \ (kbytes)->len = _klen; \ if (RD_KAFKAP_BYTES_IS_NULL(kbytes)) { \ (kbytes)->data = NULL; \ @@ -720,7 +769,6 @@ struct rd_kafka_buf_s { /* rd_kafka_buf_t */ rd_kafka_buf_check_len(rkbuf, _klen); \ } while (0) - /** * @brief Read \p size bytes from buffer, setting \p *ptr to the start * of the memory region. @@ -737,7 +785,7 @@ struct rd_kafka_buf_s { /* rd_kafka_buf_t */ /** * @brief Read varint-length Kafka Bytes representation */ -#define rd_kafka_buf_read_bytes_varint(rkbuf, kbytes) \ +#define rd_kafka_buf_read_kbytes_varint(rkbuf, kbytes) \ do { \ int64_t _len2; \ size_t _r = \ @@ -784,18 +832,62 @@ struct rd_kafka_buf_s { /* rd_kafka_buf_t */ uint64_t _tagtype, _taglen; \ rd_kafka_buf_read_uvarint(rkbuf, &_tagtype); \ rd_kafka_buf_read_uvarint(rkbuf, &_taglen); \ - if (_taglen > 1) \ - rd_kafka_buf_skip(rkbuf, \ - (size_t)(_taglen - 1)); \ + if (_taglen > 0) \ + rd_kafka_buf_skip(rkbuf, (size_t)(_taglen)); \ } \ } while (0) /** - * @brief Write tags at the current position in the buffer. - * @remark Currently always writes empty tags. - * @remark Change to ..write_uvarint() when actual tags are supported. + * @brief Read KIP-482 Tags at the current position in the buffer using + * the `read_tag` function receiving the `opaque` pointer. */ -#define rd_kafka_buf_write_tags(rkbuf) \ +#define rd_kafka_buf_read_tags(rkbuf, read_tag, ...) \ + do { \ + uint64_t _tagcnt; \ + if (!((rkbuf)->rkbuf_flags & RD_KAFKA_OP_F_FLEXVER)) \ + break; \ + rd_kafka_buf_read_uvarint(rkbuf, &_tagcnt); \ + while (_tagcnt-- > 0) { \ + uint64_t _tagtype, _taglen; \ + rd_kafka_buf_read_uvarint(rkbuf, &_tagtype); \ + rd_kafka_buf_read_uvarint(rkbuf, &_taglen); \ + int _read_tag_resp = \ + read_tag(rkbuf, _tagtype, _taglen, __VA_ARGS__); \ + if (_read_tag_resp == -1) \ + goto err_parse; \ + if (!_read_tag_resp && _taglen > 0) \ + rd_kafka_buf_skip(rkbuf, (size_t)(_taglen)); \ + } \ + } while (0) + +/** + * @brief Write \p tagcnt tags at the current position in the buffer. + * Calls \p write_tag to write each one with \p rkbuf, the tagtype + * argument, and the remaining arguments. + */ +#define rd_kafka_buf_write_tags(rkbuf, write_tag, tags, tagcnt, ...) \ + do { \ + uint64_t i; \ + if (!((rkbuf)->rkbuf_flags & RD_KAFKA_OP_F_FLEXVER)) \ + break; \ + rd_kafka_buf_write_uvarint(rkbuf, tagcnt); \ + for (i = 0; i < tagcnt; i++) { \ + size_t of_taglen, prev_buf_len; \ + rd_kafka_buf_write_uvarint(rkbuf, tags[i]); \ + of_taglen = rd_kafka_buf_write_arraycnt_pos(rkbuf); \ + prev_buf_len = (rkbuf)->rkbuf_buf.rbuf_len; \ + write_tag(rkbuf, tags[i], __VA_ARGS__); \ + rd_kafka_buf_finalize_arraycnt( \ + rkbuf, of_taglen, \ + (rkbuf)->rkbuf_buf.rbuf_len - prev_buf_len - 1); \ + } \ + } while (0) + + +/** + * @brief Write empty tags at the current position in the buffer. 
+ */ +#define rd_kafka_buf_write_tags_empty(rkbuf) \ do { \ if (!((rkbuf)->rkbuf_flags & RD_KAFKA_OP_F_FLEXVER)) \ break; \ @@ -815,7 +907,8 @@ struct rd_kafka_buf_s { /* rd_kafka_buf_t */ } else { \ rd_kafka_buf_read_i32(rkbuf, arrcnt); \ } \ - if (*(arrcnt) < 0 || ((maxval) != -1 && *(arrcnt) > (maxval))) \ + if (*(arrcnt) < -1 || \ + ((maxval) != -1 && *(arrcnt) > (maxval))) \ rd_kafka_buf_parse_fail( \ rkbuf, "ApiArrayCnt %" PRId32 " out of range", \ *(arrcnt)); \ @@ -917,6 +1010,7 @@ rd_kafka_buf_t *rd_kafka_buf_new_request0(rd_kafka_broker_t *rkb, #define rd_kafka_buf_new_flexver_request(rkb, ApiKey, segcnt, size, \ is_flexver) \ rd_kafka_buf_new_request0(rkb, ApiKey, segcnt, size, is_flexver) +void rd_kafka_buf_upgrade_flexver_request(rd_kafka_buf_t *rkbuf); rd_kafka_buf_t * rd_kafka_buf_new_shadow(const void *ptr, size_t size, void (*free_cb)(void *)); @@ -1072,9 +1166,57 @@ rd_kafka_buf_update_u32(rd_kafka_buf_t *rkbuf, size_t of, uint32_t v) { } +/** + * @brief Write varint-encoded signed value to buffer. + */ +static RD_INLINE size_t rd_kafka_buf_write_varint(rd_kafka_buf_t *rkbuf, + int64_t v) { + char varint[RD_UVARINT_ENC_SIZEOF(v)]; + size_t sz; + + sz = rd_uvarint_enc_i64(varint, sizeof(varint), v); + + return rd_kafka_buf_write(rkbuf, varint, sz); +} + +/** + * @brief Write varint-encoded unsigned value to buffer. + */ +static RD_INLINE size_t rd_kafka_buf_write_uvarint(rd_kafka_buf_t *rkbuf, + uint64_t v) { + char varint[RD_UVARINT_ENC_SIZEOF(v)]; + size_t sz; + + sz = rd_uvarint_enc_u64(varint, sizeof(varint), v); + + return rd_kafka_buf_write(rkbuf, varint, sz); +} + + + +/** + * @brief Write standard or flexver array count field to buffer. + * Use this when the array count is known beforehand, else use + * rd_kafka_buf_write_arraycnt_pos(). + */ +static RD_INLINE RD_UNUSED size_t +rd_kafka_buf_write_arraycnt(rd_kafka_buf_t *rkbuf, size_t cnt) { + + /* Count must fit in 31-bits minus the per-byte carry-bit */ + rd_assert(cnt + 1 < (size_t)(INT_MAX >> 4)); + + if (!(rkbuf->rkbuf_flags & RD_KAFKA_OP_F_FLEXVER)) + return rd_kafka_buf_write_i32(rkbuf, (int32_t)cnt); + + /* CompactArray has a base of 1, 0 is for Null arrays */ + cnt += 1; + return rd_kafka_buf_write_uvarint(rkbuf, (uint64_t)cnt); +} + + /** * @brief Write array count field to buffer (i32) for later update with - * rd_kafka_buf_update_arraycnt(). + * rd_kafka_buf_finalize_arraycnt(). */ #define rd_kafka_buf_write_arraycnt_pos(rkbuf) rd_kafka_buf_write_i32(rkbuf, 0) @@ -1092,11 +1234,11 @@ * and may thus be costly. */ static RD_INLINE void -rd_kafka_buf_finalize_arraycnt(rd_kafka_buf_t *rkbuf, size_t of, int cnt) { +rd_kafka_buf_finalize_arraycnt(rd_kafka_buf_t *rkbuf, size_t of, size_t cnt) { char buf[sizeof(int32_t)]; size_t sz, r; - rd_assert(cnt >= 0); + rd_assert(cnt < (size_t)INT_MAX); if (!(rkbuf->rkbuf_flags & RD_KAFKA_OP_F_FLEXVER)) { rd_kafka_buf_update_i32(rkbuf, of, (int32_t)cnt); @@ -1108,7 +1250,8 @@ sz = rd_uvarint_enc_u64(buf, sizeof(buf), (uint64_t)cnt); rd_assert(!RD_UVARINT_OVERFLOW(sz)); - + if (cnt < 127) + rd_assert(sz == 1); rd_buf_write_update(&rkbuf->rkbuf_buf, of, buf, sz); if (sz < sizeof(int32_t)) { @@ -1141,34 +1284,6 @@ rd_kafka_buf_update_i64(rd_kafka_buf_t *rkbuf, size_t of, int64_t v) { rd_kafka_buf_update(rkbuf, of, &v, sizeof(v)); } - -/** - * @brief Write varint-encoded signed value to buffer. 
- */ -static RD_INLINE size_t rd_kafka_buf_write_varint(rd_kafka_buf_t *rkbuf, - int64_t v) { - char varint[RD_UVARINT_ENC_SIZEOF(v)]; - size_t sz; - - sz = rd_uvarint_enc_i64(varint, sizeof(varint), v); - - return rd_kafka_buf_write(rkbuf, varint, sz); -} - -/** - * @brief Write varint-encoded unsigned value to buffer. - */ -static RD_INLINE size_t rd_kafka_buf_write_uvarint(rd_kafka_buf_t *rkbuf, - uint64_t v) { - char varint[RD_UVARINT_ENC_SIZEOF(v)]; - size_t sz; - - sz = rd_uvarint_enc_u64(varint, sizeof(varint), v); - - return rd_kafka_buf_write(rkbuf, varint, sz); -} - - /** * @brief Write standard (2-byte header) or KIP-482 COMPACT_STRING to buffer. * @@ -1274,30 +1389,40 @@ static RD_INLINE void rd_kafka_buf_push_kstr(rd_kafka_buf_t *rkbuf, static RD_INLINE size_t rd_kafka_buf_write_kbytes(rd_kafka_buf_t *rkbuf, const rd_kafkap_bytes_t *kbytes) { - size_t len; + size_t len, r; - if (!kbytes || RD_KAFKAP_BYTES_IS_NULL(kbytes)) - return rd_kafka_buf_write_i32(rkbuf, -1); + if (!(rkbuf->rkbuf_flags & RD_KAFKA_OP_F_FLEXVER)) { + if (!kbytes || RD_KAFKAP_BYTES_IS_NULL(kbytes)) + return rd_kafka_buf_write_i32(rkbuf, -1); - if (RD_KAFKAP_BYTES_IS_SERIALIZED(kbytes)) - return rd_kafka_buf_write(rkbuf, RD_KAFKAP_BYTES_SER(kbytes), - RD_KAFKAP_BYTES_SIZE(kbytes)); + if (RD_KAFKAP_BYTES_IS_SERIALIZED(kbytes)) + return rd_kafka_buf_write(rkbuf, + RD_KAFKAP_BYTES_SER(kbytes), + RD_KAFKAP_BYTES_SIZE(kbytes)); - len = RD_KAFKAP_BYTES_LEN(kbytes); - rd_kafka_buf_write_i32(rkbuf, (int32_t)len); - rd_kafka_buf_write(rkbuf, kbytes->data, len); + len = RD_KAFKAP_BYTES_LEN(kbytes); + rd_kafka_buf_write_i32(rkbuf, (int32_t)len); + rd_kafka_buf_write(rkbuf, kbytes->data, len); - return 4 + len; -} + return 4 + len; + } -/** - * Push (i.e., no copy) Kafka bytes to buffer iovec - */ -static RD_INLINE void -rd_kafka_buf_push_kbytes(rd_kafka_buf_t *rkbuf, - const rd_kafkap_bytes_t *kbytes) { - rd_kafka_buf_push(rkbuf, RD_KAFKAP_BYTES_SER(kbytes), - RD_KAFKAP_BYTES_SIZE(kbytes), NULL); + /* COMPACT_BYTES lengths are: + * 0 = NULL, + * 1 = empty + * N.. = length + 1 + */ + if (!kbytes) + len = 0; + else + len = kbytes->len + 1; + + r = rd_kafka_buf_write_uvarint(rkbuf, (uint64_t)len); + if (len > 1) { + rd_kafka_buf_write(rkbuf, kbytes->data, len - 1); + r += len - 1; + } + return r; } /** @@ -1381,4 +1506,20 @@ void rd_kafka_buf_set_maker(rd_kafka_buf_t *rkbuf, void *make_opaque, void (*free_make_opaque_cb)(void *make_opaque)); + +#define rd_kafka_buf_read_uuid(rkbuf, uuid) \ + do { \ + rd_kafka_buf_read_i64(rkbuf, \ + &((uuid)->most_significant_bits)); \ + rd_kafka_buf_read_i64(rkbuf, \ + &((uuid)->least_significant_bits)); \ + (uuid)->base64str[0] = '\0'; \ + } while (0) + +static RD_UNUSED void rd_kafka_buf_write_uuid(rd_kafka_buf_t *rkbuf, + rd_kafka_Uuid_t *uuid) { + rd_kafka_buf_write_i64(rkbuf, uuid->most_significant_bits); + rd_kafka_buf_write_i64(rkbuf, uuid->least_significant_bits); +} + #endif /* _RDKAFKA_BUF_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_cert.c b/src/third_party/librdkafka/dist/src/rdkafka_cert.c index 2a19e454931..e4393c32885 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_cert.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_cert.c @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -78,6 +78,8 @@ static void rd_kafka_cert_destroy(rd_kafka_cert_t *cert) { if (cert->x509) X509_free(cert->x509); + if (cert->chain) + sk_X509_pop_free(cert->chain, X509_free); if (cert->pkey) EVP_PKEY_free(cert->pkey); if (cert->store) @@ -314,10 +316,11 @@ static rd_kafka_cert_t *rd_kafka_cert_new(const rd_kafka_conf_t *conf, switch (encoding) { case RD_KAFKA_CERT_ENC_PKCS12: { EVP_PKEY *ign_pkey; + STACK_OF(X509) *ca = NULL; action = "parse PKCS#12"; if (!PKCS12_parse(p12, conf->ssl.key_password, - &ign_pkey, &cert->x509, NULL)) + &ign_pkey, &cert->x509, &ca)) goto fail; EVP_PKEY_free(ign_pkey); @@ -325,6 +328,13 @@ static rd_kafka_cert_t *rd_kafka_cert_new(const rd_kafka_conf_t *conf, action = "retrieve public key"; if (!cert->x509) goto fail; + + if (ca) { + if (sk_X509_num(ca) > 0) + cert->chain = ca; + else + sk_X509_pop_free(ca, X509_free); + } } break; case RD_KAFKA_CERT_ENC_DER: @@ -341,6 +351,20 @@ static rd_kafka_cert_t *rd_kafka_cert_new(const rd_kafka_conf_t *conf, (void *)conf); if (!cert->x509) goto fail; + + cert->chain = sk_X509_new_null(); + if (rd_kafka_ssl_read_cert_chain_from_BIO( + bio, cert->chain, rd_kafka_conf_ssl_passwd_cb, + (void *)conf) != 0) { + sk_X509_pop_free(cert->chain, X509_free); + cert->chain = NULL; + goto fail; + } + + if (sk_X509_num(cert->chain) == 0) { + sk_X509_pop_free(cert->chain, X509_free); + cert->chain = NULL; + } break; default: diff --git a/src/third_party/librdkafka/dist/src/rdkafka_cert.h b/src/third_party/librdkafka/dist/src/rdkafka_cert.h index b53f46c010d..753223c5392 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_cert.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_cert.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -43,9 +43,10 @@ typedef struct rd_kafka_cert_s { rd_kafka_cert_enc_t encoding; rd_refcnt_t refcnt; #if WITH_SSL - X509 *x509; /**< Certificate (public key) */ - EVP_PKEY *pkey; /**< Private key */ - X509_STORE *store; /**< CA certificate chain store */ + X509 *x509; /**< Certificate (public key) */ + STACK_OF(X509) * chain; /**< Certificate chain (public key) */ + EVP_PKEY *pkey; /**< Private key */ + X509_STORE *store; /**< CA trusted certificates */ #endif } rd_kafka_cert_t; diff --git a/src/third_party/librdkafka/dist/src/rdkafka_cgrp.c b/src/third_party/librdkafka/dist/src/rdkafka_cgrp.c index dc7ed6c0e93..d87ab2c0585 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_cgrp.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_cgrp.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023 Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -97,6 +98,7 @@ static void rd_kafka_cgrp_handle_assignment(rd_kafka_cgrp_t *rkcg, rd_kafka_topic_partition_list_t *assignment); +static void rd_kafka_cgrp_consumer_assignment_done(rd_kafka_cgrp_t *rkcg); /** * @returns true if the current assignment is lost. @@ -170,6 +172,14 @@ rd_kafka_cgrp_assignment_clear_lost(rd_kafka_cgrp_t *rkcg, char *fmt, ...) 
{ */ rd_kafka_rebalance_protocol_t rd_kafka_cgrp_rebalance_protocol(rd_kafka_cgrp_t *rkcg) { + if (rkcg->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CONSUMER) { + if (!(rkcg->rkcg_consumer_flags & + RD_KAFKA_CGRP_CONSUMER_F_SUBSCRIBED_ONCE)) + return RD_KAFKA_REBALANCE_PROTOCOL_NONE; + + return RD_KAFKA_REBALANCE_PROTOCOL_COOPERATIVE; + } + if (!rkcg->rkcg_assignor) return RD_KAFKA_REBALANCE_PROTOCOL_NONE; return rkcg->rkcg_assignor->rkas_protocol; @@ -215,7 +225,12 @@ static void rd_kafka_cgrp_clear_wait_resp(rd_kafka_cgrp_t *rkcg, rkcg->rkcg_wait_resp = -1; } - +/** + * @brief No-op, just serves to wake the main loop when needed. + * TODO: complete the refactor and serve directly from here. + */ +static void rd_kafka_cgrp_serve_timer_cb(rd_kafka_timers_t *rkts, void *arg) { +} /** * @struct Auxiliary glue type used for COOPERATIVE rebalance set operations. @@ -248,11 +263,28 @@ typedef RD_MAP_TYPE(const rd_kafka_topic_partition_t *, /** * @returns true if consumer has joined the group and thus requires a leave. + * + * `rkcg_member_id` is sufficient to know this with the "classic" group protocol. */ -#define RD_KAFKA_CGRP_HAS_JOINED(rkcg) \ - (rkcg->rkcg_member_id != NULL && \ +#define RD_KAFKA_CGRP_HAS_JOINED_CLASSIC(rkcg) \ + (rkcg->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CLASSIC && \ + rkcg->rkcg_member_id != NULL && \ RD_KAFKAP_STR_LEN((rkcg)->rkcg_member_id) > 0) +/** + * @returns true if consumer has joined the group and thus requires a leave. + * + * With the "consumer" group protocol we cannot rely on the `rkcg_member_id` + * as it's client-generated. + */ +#define RD_KAFKA_CGRP_HAS_JOINED_CONSUMER(rkcg) \ + (rkcg->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CONSUMER && \ + rkcg->rkcg_generation_id > 0) + +#define RD_KAFKA_CGRP_HAS_JOINED(rkcg) \ + (RD_KAFKA_CGRP_HAS_JOINED_CLASSIC(rkcg) || \ + RD_KAFKA_CGRP_HAS_JOINED_CONSUMER(rkcg)) + /** * @returns true if cgrp is waiting for a rebalance_cb to be handled by @@ -338,11 +370,53 @@ static int rd_kafka_cgrp_set_state(rd_kafka_cgrp_t *rkcg, int state) { return 1; } +/** + * @brief Set the cgrp last error and current timestamp + * as last error timestamp. + */ +static void rd_kafka_cgrp_set_last_err(rd_kafka_cgrp_t *rkcg, + rd_kafka_resp_err_t rkcg_last_err) { + rkcg->rkcg_last_err = rkcg_last_err; + rkcg->rkcg_ts_last_err = rd_clock(); +} + +/** + * @brief Clears cgrp last error and its timestamp. + */ +static void rd_kafka_cgrp_clear_last_err(rd_kafka_cgrp_t *rkcg) { + rkcg->rkcg_last_err = RD_KAFKA_RESP_ERR_NO_ERROR; + rkcg->rkcg_ts_last_err = 0; +} + +/** + * @brief Clears cgrp last error if it's a heartbeat-related error, like + * a topic authorization failed one. 
+ */ +static void +rd_kafka_cgrp_maybe_clear_heartbeat_failed_err(rd_kafka_cgrp_t *rkcg) { + if (rkcg->rkcg_last_err == + RD_KAFKA_RESP_ERR_TOPIC_AUTHORIZATION_FAILED) { + rd_kafka_cgrp_clear_last_err(rkcg); + } +} + void rd_kafka_cgrp_set_join_state(rd_kafka_cgrp_t *rkcg, int join_state) { if ((int)rkcg->rkcg_join_state == join_state) return; + if (rkcg->rkcg_join_state == RD_KAFKA_CGRP_JOIN_STATE_INIT || + rkcg->rkcg_join_state == RD_KAFKA_CGRP_JOIN_STATE_STEADY) { + /* Start timer when leaving the INIT or STEADY state */ + rkcg->rkcg_ts_rebalance_start = rd_clock(); + } else if (join_state == RD_KAFKA_CGRP_JOIN_STATE_STEADY) { + /* End timer when reaching the STEADY state */ + rd_dassert(rkcg->rkcg_ts_rebalance_start); + rd_avg_add(&rkcg->rkcg_rk->rk_telemetry.rd_avg_current + .rk_avg_rebalance_latency, + rd_clock() - rkcg->rkcg_ts_rebalance_start); + } + rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "CGRPJOINSTATE", "Group \"%.*s\" changed join state %s -> %s " "(state %s)", @@ -357,10 +431,20 @@ void rd_kafka_cgrp_set_join_state(rd_kafka_cgrp_t *rkcg, int join_state) { void rd_kafka_cgrp_destroy_final(rd_kafka_cgrp_t *rkcg) { rd_kafka_assert(rkcg->rkcg_rk, !rkcg->rkcg_subscription); rd_kafka_assert(rkcg->rkcg_rk, !rkcg->rkcg_group_leader.members); + rd_kafka_assert(rkcg->rkcg_rk, !rkcg->rkcg_subscription_topics); + rd_kafka_assert(rkcg->rkcg_rk, !rkcg->rkcg_subscription_regex); rd_kafka_cgrp_set_member_id(rkcg, NULL); + rd_kafka_topic_partition_list_destroy(rkcg->rkcg_current_assignment); + RD_IF_FREE(rkcg->rkcg_target_assignment, + rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(rkcg->rkcg_next_target_assignment, + rd_kafka_topic_partition_list_destroy); if (rkcg->rkcg_group_instance_id) rd_kafkap_str_destroy(rkcg->rkcg_group_instance_id); - + if (rkcg->rkcg_group_remote_assignor) + rd_kafkap_str_destroy(rkcg->rkcg_group_remote_assignor); + if (rkcg->rkcg_client_rack) + rd_kafkap_str_destroy(rkcg->rkcg_client_rack); rd_kafka_q_destroy_owner(rkcg->rkcg_q); rd_kafka_q_destroy_owner(rkcg->rkcg_ops); rd_kafka_q_destroy_owner(rkcg->rkcg_wait_coord_q); @@ -369,7 +453,8 @@ void rd_kafka_cgrp_destroy_final(rd_kafka_cgrp_t *rkcg) { rd_list_destroy(&rkcg->rkcg_toppars); rd_list_destroy(rkcg->rkcg_subscribed_topics); rd_kafka_topic_partition_list_destroy(rkcg->rkcg_errored_topics); - if (rkcg->rkcg_assignor && rkcg->rkcg_assignor->rkas_destroy_state_cb) + if (rkcg->rkcg_assignor && rkcg->rkcg_assignor->rkas_destroy_state_cb && + rkcg->rkcg_assignor_state) rkcg->rkcg_assignor->rkas_destroy_state_cb( rkcg->rkcg_assignor_state); rd_free(rkcg); @@ -396,18 +481,19 @@ rd_kafka_cgrp_update_session_timeout(rd_kafka_cgrp_t *rkcg, rd_bool_t reset) { rd_kafka_cgrp_t *rd_kafka_cgrp_new(rd_kafka_t *rk, + rd_kafka_group_protocol_t group_protocol, const rd_kafkap_str_t *group_id, const rd_kafkap_str_t *client_id) { rd_kafka_cgrp_t *rkcg; - rkcg = rd_calloc(1, sizeof(*rkcg)); - rkcg->rkcg_rk = rk; - rkcg->rkcg_group_id = group_id; - rkcg->rkcg_client_id = client_id; - rkcg->rkcg_coord_id = -1; - rkcg->rkcg_generation_id = -1; - rkcg->rkcg_wait_resp = -1; + rkcg->rkcg_rk = rk; + rkcg->rkcg_group_protocol = group_protocol; + rkcg->rkcg_group_id = group_id; + rkcg->rkcg_client_id = client_id; + rkcg->rkcg_coord_id = -1; + rkcg->rkcg_generation_id = -1; + rkcg->rkcg_wait_resp = -1; rkcg->rkcg_ops = rd_kafka_q_new(rk); rkcg->rkcg_ops->rkq_serve = rd_kafka_cgrp_op_serve; @@ -415,21 +501,41 @@ rd_kafka_cgrp_t *rd_kafka_cgrp_new(rd_kafka_t *rk, rkcg->rkcg_wait_coord_q = rd_kafka_q_new(rk); rkcg->rkcg_wait_coord_q->rkq_serve 
= rkcg->rkcg_ops->rkq_serve; rkcg->rkcg_wait_coord_q->rkq_opaque = rkcg->rkcg_ops->rkq_opaque; - rkcg->rkcg_q = rd_kafka_q_new(rk); + rkcg->rkcg_q = rd_kafka_consume_q_new(rk); rkcg->rkcg_group_instance_id = rd_kafkap_str_new(rk->rk_conf.group_instance_id, -1); + rkcg->rkcg_group_remote_assignor = + rd_kafkap_str_new(rk->rk_conf.group_remote_assignor, -1); + if (!RD_KAFKAP_STR_LEN(rkcg->rkcg_rk->rk_conf.client_rack)) + rkcg->rkcg_client_rack = rd_kafkap_str_new(NULL, -1); + else + rkcg->rkcg_client_rack = + rd_kafkap_str_copy(rkcg->rkcg_rk->rk_conf.client_rack); + rkcg->rkcg_next_subscription = NULL; TAILQ_INIT(&rkcg->rkcg_topics); rd_list_init(&rkcg->rkcg_toppars, 32, NULL); - rd_kafka_cgrp_set_member_id(rkcg, ""); + + if (rkcg->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CONSUMER) { + rd_kafka_Uuid_t uuid = rd_kafka_Uuid_random(); + rd_kafka_cgrp_set_member_id(rkcg, + rd_kafka_Uuid_base64str(&uuid)); + } else { + rd_kafka_cgrp_set_member_id(rkcg, ""); + } + rkcg->rkcg_subscribed_topics = - rd_list_new(0, (void *)rd_kafka_topic_info_destroy); + rd_list_new(0, rd_kafka_topic_info_destroy_free); rd_interval_init(&rkcg->rkcg_coord_query_intvl); rd_interval_init(&rkcg->rkcg_heartbeat_intvl); rd_interval_init(&rkcg->rkcg_join_intvl); rd_interval_init(&rkcg->rkcg_timeout_scan_intvl); rd_atomic32_init(&rkcg->rkcg_assignment_lost, rd_false); rd_atomic32_init(&rkcg->rkcg_terminated, rd_false); + rd_atomic32_init(&rkcg->rkcg_subscription_version, 0); + rkcg->rkcg_current_assignment = rd_kafka_topic_partition_list_new(0); + rkcg->rkcg_target_assignment = NULL; + rkcg->rkcg_next_target_assignment = NULL; rkcg->rkcg_errored_topics = rd_kafka_topic_partition_list_new(0); @@ -449,6 +555,12 @@ rd_kafka_cgrp_t *rd_kafka_cgrp_new(rd_kafka_t *rk, rk->rk_conf.auto_commit_interval_ms * 1000ll, rd_kafka_cgrp_offset_commit_tmr_cb, rkcg); + if (rkcg->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CONSUMER) { + rd_kafka_log(rk, LOG_WARNING, "CGRP", + "KIP-848 Consumer Group Protocol is in 'Preview' " + "and MUST NOT be used in production"); + } + return rkcg; } @@ -542,6 +654,9 @@ static int rd_kafka_cgrp_coord_update(rd_kafka_cgrp_t *rkcg, int32_t coord_id) { /* Clear previous broker handle, if any */ if (rkcg->rkcg_curr_coord) rd_kafka_cgrp_coord_clear_broker(rkcg); + + rd_kafka_cgrp_consumer_expedite_next_heartbeat( + rkcg, "coordinator changed"); } @@ -676,6 +791,8 @@ err: RD_KAFKA_ERR_ACTION_RETRY, RD_KAFKA_RESP_ERR__TIMED_OUT_QUEUE, + RD_KAFKA_ERR_ACTION_RETRY, RD_KAFKA_RESP_ERR__DESTROY_BROKER, + RD_KAFKA_ERR_ACTION_END); @@ -692,7 +809,13 @@ err: "FindCoordinator response error: %s", errstr); /* Suppress repeated errors */ - rkcg->rkcg_last_err = ErrorCode; + rd_kafka_cgrp_set_last_err(rkcg, ErrorCode); + } + + if (ErrorCode == RD_KAFKA_RESP_ERR__DESTROY_BROKER) { + /* This error is one-time and should cause + * an immediate retry. */ + rd_interval_reset(&rkcg->rkcg_coord_query_intvl); } /* Retries are performed by the timer-intervalled @@ -753,8 +876,11 @@ void rd_kafka_cgrp_coord_query(rd_kafka_cgrp_t *rkcg, const char *reason) { rd_kafka_broker_destroy(rkb); - /* Back off the next intervalled query since we just sent one. */ - rd_interval_reset_to_now(&rkcg->rkcg_coord_query_intvl, 0); + /* Back off the next intervalled query with a jitter since we just sent + * one. 
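+ * The jitter spreads the re-queries out so that many consumers that lost
+ * the coordinator at the same time (e.g. on a broker restart) don't all
+ * hit the brokers again at the same instant.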
*/ + rd_interval_reset_to_now_with_jitter(&rkcg->rkcg_coord_query_intvl, 0, + 500, + RD_KAFKA_RETRY_JITTER_PERCENT); } /** @@ -795,6 +921,44 @@ rd_kafka_broker_t *rd_kafka_cgrp_get_coord(rd_kafka_cgrp_t *rkcg) { return rkcg->rkcg_coord; } +#define rd_kafka_cgrp_will_leave(rkcg) \ + (rkcg->rkcg_flags & (RD_KAFKA_CGRP_F_LEAVE_ON_UNASSIGN_DONE | \ + RD_KAFKA_CGRP_F_WAIT_LEAVE)) + +#define rd_kafka_cgrp_consumer_will_rejoin(rkcg) \ + (rkcg->rkcg_consumer_flags & \ + (RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN | \ + RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN_TO_COMPLETE)) + +#define rd_kafka_cgrp_consumer_subscription_preconditions_met(rkcg) \ + (!RD_KAFKA_CGRP_REBALANCING(rkcg) && \ + rkcg->rkcg_consumer_flags & \ + RD_KAFKA_CGRP_CONSUMER_F_SEND_NEW_SUBSCRIPTION) + +static int32_t +rd_kafka_cgrp_subscription_set(rd_kafka_cgrp_t *rkcg, + rd_kafka_topic_partition_list_t *rktparlist); + +/** + * @brief Apply next subscription in \p rkcg , if set. + */ +static void rd_kafka_cgrp_consumer_apply_next_subscribe(rd_kafka_cgrp_t *rkcg) { + if (rkcg->rkcg_next_subscription) { + if (unlikely(rkcg->rkcg_flags & RD_KAFKA_CGRP_F_TERMINATE)) { + rd_kafka_topic_partition_list_destroy( + rkcg->rkcg_next_subscription); + rkcg->rkcg_next_subscription = NULL; + return; + } + + rd_kafka_cgrp_subscription_set(rkcg, + rkcg->rkcg_next_subscription); + rkcg->rkcg_next_subscription = NULL; + + rd_kafka_cgrp_consumer_expedite_next_heartbeat( + rkcg, "subscription changed"); + } +} /** * @brief cgrp handling of LeaveGroup responses @@ -847,6 +1011,122 @@ err_parse: goto err; } +static void rd_kafka_cgrp_consumer_reset(rd_kafka_cgrp_t *rkcg) { + if (rkcg->rkcg_group_protocol != RD_KAFKA_GROUP_PROTOCOL_CONSUMER) + return; + + rkcg->rkcg_generation_id = 0; + rd_kafka_topic_partition_list_destroy(rkcg->rkcg_current_assignment); + RD_IF_FREE(rkcg->rkcg_target_assignment, + rd_kafka_topic_partition_list_destroy); + rkcg->rkcg_target_assignment = NULL; + RD_IF_FREE(rkcg->rkcg_next_target_assignment, + rd_kafka_topic_partition_list_destroy); + rkcg->rkcg_next_target_assignment = NULL; + rkcg->rkcg_current_assignment = rd_kafka_topic_partition_list_new(0); + + /* Leave only specified flags, reset the rest */ + rkcg->rkcg_consumer_flags = + (rkcg->rkcg_consumer_flags & + RD_KAFKA_CGRP_CONSUMER_F_SUBSCRIBED_ONCE) | + (rkcg->rkcg_consumer_flags & + RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN_TO_COMPLETE); +} + +/** + * @brief cgrp handling of ConsumerGroupHeartbeat response after leaving group + * @param opaque must be the cgrp handle. 
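+ *
+ * The leave is signalled through the member epoch sent in the heartbeat:
+ * -1 for a dynamic member, -2 for a static (`group.instance.id`) one, as
+ * set up in rd_kafka_cgrp_consumer_leave() below. A sketch (illustrative
+ * only) of the convention:
+ * @code
+ * int32_t member_epoch = RD_KAFKA_CGRP_IS_STATIC_MEMBER(rkcg) ? -2 : -1;
+ * @endcode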
+ * @locality rdkafka main thread (unless err==ERR__DESTROY) + */ +static void +rd_kafka_cgrp_handle_ConsumerGroupHeartbeat_leave(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + rd_kafka_resp_err_t err, + rd_kafka_buf_t *rkbuf, + rd_kafka_buf_t *request, + void *opaque) { + rd_kafka_cgrp_t *rkcg = opaque; + const int log_decode_errors = LOG_ERR; + int16_t ErrorCode = 0; + + if (err) { + ErrorCode = err; + goto err; + } + + rd_kafka_buf_read_throttle_time(rkbuf); + + rd_kafka_buf_read_i16(rkbuf, &ErrorCode); + +err: + if (ErrorCode) + rd_kafka_dbg( + rkb->rkb_rk, CGRP, "LEAVEGROUP", + "ConsumerGroupHeartbeat response error in state %s: %s", + rd_kafka_cgrp_state_names[rkcg->rkcg_state], + rd_kafka_err2str(ErrorCode)); + else + rd_kafka_dbg( + rkb->rkb_rk, CGRP, "LEAVEGROUP", + "ConsumerGroupHeartbeat response received in state %s", + rd_kafka_cgrp_state_names[rkcg->rkcg_state]); + + rd_kafka_cgrp_consumer_reset(rkcg); + + if (ErrorCode != RD_KAFKA_RESP_ERR__DESTROY) { + rd_assert(thrd_is_current(rk->rk_thread)); + rkcg->rkcg_flags &= ~RD_KAFKA_CGRP_F_WAIT_LEAVE; + rd_kafka_cgrp_try_terminate(rkcg); + } + + return; + +err_parse: + ErrorCode = rkbuf->rkbuf_err; + goto err; +} + +static void rd_kafka_cgrp_consumer_leave(rd_kafka_cgrp_t *rkcg) { + int32_t member_epoch = -1; + + if (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_WAIT_LEAVE) { + rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "LEAVE", + "Group \"%.*s\": leave (in state %s): " + "ConsumerGroupHeartbeat already in-transit", + RD_KAFKAP_STR_PR(rkcg->rkcg_group_id), + rd_kafka_cgrp_state_names[rkcg->rkcg_state]); + return; + } + + rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "LEAVE", + "Group \"%.*s\": leave (in state %s)", + RD_KAFKAP_STR_PR(rkcg->rkcg_group_id), + rd_kafka_cgrp_state_names[rkcg->rkcg_state]); + + rkcg->rkcg_flags |= RD_KAFKA_CGRP_F_WAIT_LEAVE; + if (RD_KAFKA_CGRP_IS_STATIC_MEMBER(rkcg)) { + member_epoch = -2; + } + + if (rkcg->rkcg_state == RD_KAFKA_CGRP_STATE_UP) { + rd_rkb_dbg(rkcg->rkcg_curr_coord, CONSUMER, "LEAVE", + "Leaving group"); + rd_kafka_ConsumerGroupHeartbeatRequest( + rkcg->rkcg_coord, rkcg->rkcg_group_id, rkcg->rkcg_member_id, + member_epoch, rkcg->rkcg_group_instance_id, + NULL /* no rack */, -1 /* no rebalance_timeout_ms */, + NULL /* no subscription topics */, + NULL /* no regex subscription */, + NULL /* no remote assignor */, + NULL /* no current assignment */, + RD_KAFKA_REPLYQ(rkcg->rkcg_ops, 0), + rd_kafka_cgrp_handle_ConsumerGroupHeartbeat_leave, rkcg); + } else { + rd_kafka_cgrp_handle_ConsumerGroupHeartbeat_leave( + rkcg->rkcg_rk, rkcg->rkcg_coord, + RD_KAFKA_RESP_ERR__WAIT_COORD, NULL, NULL, rkcg); + } +} static void rd_kafka_cgrp_leave(rd_kafka_cgrp_t *rkcg) { char *member_id; @@ -900,22 +1180,25 @@ static rd_bool_t rd_kafka_cgrp_leave_maybe(rd_kafka_cgrp_t *rkcg) { rkcg->rkcg_flags &= ~RD_KAFKA_CGRP_F_LEAVE_ON_UNASSIGN_DONE; - /* Don't send Leave when termating with NO_CONSUMER_CLOSE flag */ + /* Don't send Leave when terminating with NO_CONSUMER_CLOSE flag */ if (rd_kafka_destroy_flags_no_consumer_close(rkcg->rkcg_rk)) return rd_false; - /* KIP-345: Static group members must not send a LeaveGroupRequest - * on termination. */ - if (RD_KAFKA_CGRP_IS_STATIC_MEMBER(rkcg) && - rkcg->rkcg_flags & RD_KAFKA_CGRP_F_TERMINATE) - return rd_false; + if (rkcg->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CONSUMER) { + rd_kafka_cgrp_consumer_leave(rkcg); + } else { + /* KIP-345: Static group members must not send a + * LeaveGroupRequest on termination. 
*/ + if (RD_KAFKA_CGRP_IS_STATIC_MEMBER(rkcg) && + rkcg->rkcg_flags & RD_KAFKA_CGRP_F_TERMINATE) + return rd_false; - rd_kafka_cgrp_leave(rkcg); + rd_kafka_cgrp_leave(rkcg); + } return rd_true; } - /** * @brief Enqueues a rebalance op, delegating responsibility of calling * incremental_assign / incremental_unassign to the application. @@ -1163,6 +1446,50 @@ done: rd_kafka_cgrp_group_assignment_set(rkcg, NULL); } +/** + * @brief Rejoin the group (KIP-848). + */ +static void +rd_kafka_cgrp_consumer_rejoin(rd_kafka_cgrp_t *rkcg, const char *fmt, ...) { + char reason[512]; + va_list ap; + char astr[128]; + + va_start(ap, fmt); + rd_vsnprintf(reason, sizeof(reason), fmt, ap); + va_end(ap); + + if (rkcg->rkcg_group_assignment) + rd_snprintf(astr, sizeof(astr), " with %d owned partition(s)", + rkcg->rkcg_group_assignment->cnt); + else + rd_snprintf(astr, sizeof(astr), " without an assignment"); + + if (rkcg->rkcg_subscription || rkcg->rkcg_next_subscription) { + rd_kafka_dbg( + rkcg->rkcg_rk, CONSUMER | RD_KAFKA_DBG_CGRP, "REJOIN", + "Group \"%s\": %s group%s: %s", rkcg->rkcg_group_id->str, + rkcg->rkcg_join_state == RD_KAFKA_CGRP_JOIN_STATE_INIT + ? "Joining" + : "Rejoining", + astr, reason); + } else { + rd_kafka_dbg( + rkcg->rkcg_rk, CONSUMER | RD_KAFKA_DBG_CGRP, "NOREJOIN", + "Group \"%s\": Not %s group%s: %s: " + "no subscribed topics", + rkcg->rkcg_group_id->str, + rkcg->rkcg_join_state == RD_KAFKA_CGRP_JOIN_STATE_INIT + ? "joining" + : "rejoining", + astr, reason); + } + + rd_kafka_cgrp_leave_maybe(rkcg); + rd_kafka_cgrp_consumer_reset(rkcg); + rd_kafka_cgrp_set_join_state(rkcg, RD_KAFKA_CGRP_JOIN_STATE_INIT); + rd_kafka_cgrp_consumer_expedite_next_heartbeat(rkcg, "rejoining"); +} /** * @brief Rejoin the group. @@ -1177,6 +1504,10 @@ static void rd_kafka_cgrp_rejoin(rd_kafka_cgrp_t *rkcg, const char *fmt, ...) 
{ char reason[512]; va_list ap; char astr[128]; + if (rkcg->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CONSUMER) { + /* Format the reason before delegating: passing a va_list + * to a variadic function is undefined behavior. */ + va_start(ap, fmt); + rd_vsnprintf(reason, sizeof(reason), fmt, ap); + va_end(ap); + rd_kafka_cgrp_consumer_rejoin(rkcg, "%s", reason); + return; + } va_start(ap, fmt); rd_vsnprintf(reason, sizeof(reason), fmt, ap); @@ -1505,10 +1836,13 @@ static void rd_kafka_cgrp_handle_SyncGroup_memberstate( rkbuf->rkbuf_rkb = rd_kafka_broker_internal(rkcg->rkcg_rk); rd_kafka_buf_read_i16(rkbuf, &Version); + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; if (!(assignment = rd_kafka_buf_read_topic_partitions( - rkbuf, 0, rd_false, rd_false))) + rkbuf, rd_false /*don't use topic_id*/, rd_true, 0, fields))) goto err_parse; - rd_kafka_buf_read_bytes(rkbuf, &UserData); + rd_kafka_buf_read_kbytes(rkbuf, &UserData); done: rd_kafka_cgrp_update_session_timeout(rkcg, rd_true /*reset timeout*/); @@ -1613,7 +1947,7 @@ static void rd_kafka_cgrp_handle_SyncGroup(rd_kafka_t *rk, rd_kafka_buf_read_throttle_time(rkbuf); rd_kafka_buf_read_i16(rkbuf, &ErrorCode); - rd_kafka_buf_read_bytes(rkbuf, &MemberState); + rd_kafka_buf_read_kbytes(rkbuf, &MemberState); err: actions = rd_kafka_err_action(rkb, ErrorCode, request, @@ -1659,7 +1993,7 @@ err_parse: static void rd_kafka_cgrp_assignor_run(rd_kafka_cgrp_t *rkcg, rd_kafka_assignor_t *rkas, rd_kafka_resp_err_t err, - rd_kafka_metadata_t *metadata, + rd_kafka_metadata_internal_t *metadata, rd_kafka_group_member_t *members, int member_cnt) { char errstr[512]; @@ -1674,8 +2008,8 @@ static void rd_kafka_cgrp_assignor_run(rd_kafka_cgrp_t *rkcg, *errstr = '\0'; /* Run assignor */ - err = rd_kafka_assignor_run(rkcg, rkas, metadata, members, member_cnt, - errstr, sizeof(errstr)); + err = rd_kafka_assignor_run(rkcg, rkas, &metadata->metadata, members, + member_cnt, errstr, sizeof(errstr)); if (err) { if (!*errstr) @@ -1742,7 +2076,7 @@ rd_kafka_cgrp_assignor_handle_Metadata_op(rd_kafka_t *rk, } rd_kafka_cgrp_assignor_run(rkcg, rkcg->rkcg_assignor, rko->rko_err, - rko->rko_u.metadata.md, + rko->rko_u.metadata.mdi, rkcg->rkcg_group_leader.members, rkcg->rkcg_group_leader.member_cnt); @@ -1774,9 +2108,12 @@ static int rd_kafka_group_MemberMetadata_consumer_read( rkbuf = rd_kafka_buf_new_shadow( MemberMetadata->data, RD_KAFKAP_BYTES_LEN(MemberMetadata), NULL); - /* Protocol parser needs a broker handle to log errors on. */ - rkbuf->rkbuf_rkb = rkb; - rd_kafka_broker_keep(rkb); + /* Protocol parser needs a broker handle to log errors on. + * If none is provided, don't log errors (mainly for unit tests).
*/ + if (rkb) { + rkbuf->rkbuf_rkb = rkb; + rd_kafka_broker_keep(rkb); + } rd_kafka_buf_read_i16(rkbuf, &Version); rd_kafka_buf_read_i32(rkbuf, &subscription_cnt); @@ -1796,14 +2133,27 @@ static int rd_kafka_group_MemberMetadata_consumer_read( rkgm->rkgm_subscription, topic_name, RD_KAFKA_PARTITION_UA); } - rd_kafka_buf_read_bytes(rkbuf, &UserData); + rd_kafka_buf_read_kbytes(rkbuf, &UserData); rkgm->rkgm_userdata = rd_kafkap_bytes_copy(&UserData); + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; if (Version >= 1 && !(rkgm->rkgm_owned = rd_kafka_buf_read_topic_partitions( - rkbuf, 0, rd_false, rd_false))) + rkbuf, rd_false /*don't use topic_id*/, rd_true, 0, fields))) goto err; + if (Version >= 2) { + rd_kafka_buf_read_i32(rkbuf, &rkgm->rkgm_generation); + } + + if (Version >= 3) { + rd_kafkap_str_t RackId = RD_KAFKAP_STR_INITIALIZER; + rd_kafka_buf_read_str(rkbuf, &RackId); + rkgm->rkgm_rack_id = rd_kafkap_str_copy(&RackId); + } + rd_kafka_buf_destroy(rkbuf); return 0; @@ -1812,10 +2162,11 @@ err_parse: err = rkbuf->rkbuf_err; err: - rd_rkb_dbg(rkb, CGRP, "MEMBERMETA", - "Failed to parse MemberMetadata for \"%.*s\": %s", - RD_KAFKAP_STR_PR(rkgm->rkgm_member_id), - rd_kafka_err2str(err)); + if (rkb) + rd_rkb_dbg(rkb, CGRP, "MEMBERMETA", + "Failed to parse MemberMetadata for \"%.*s\": %s", + RD_KAFKAP_STR_PR(rkgm->rkgm_member_id), + rd_kafka_err2str(err)); if (rkgm->rkgm_subscription) { rd_kafka_topic_partition_list_destroy(rkgm->rkgm_subscription); rkgm->rkgm_subscription = NULL; @@ -1893,7 +2244,9 @@ static void rd_kafka_cgrp_handle_JoinGroup(rd_kafka_t *rk, "Unsupported assignment strategy \"%s\"", protocol_name); if (rkcg->rkcg_assignor) { - if (rkcg->rkcg_assignor->rkas_destroy_state_cb) + if (rkcg->rkcg_assignor + ->rkas_destroy_state_cb && + rkcg->rkcg_assignor_state) rkcg->rkcg_assignor ->rkas_destroy_state_cb( rkcg->rkcg_assignor_state); @@ -1931,7 +2284,8 @@ static void rd_kafka_cgrp_handle_JoinGroup(rd_kafka_t *rk, } if (rkcg->rkcg_assignor && rkcg->rkcg_assignor != rkas) { - if (rkcg->rkcg_assignor->rkas_destroy_state_cb) + if (rkcg->rkcg_assignor->rkas_destroy_state_cb && + rkcg->rkcg_assignor_state) rkcg->rkcg_assignor->rkas_destroy_state_cb( rkcg->rkcg_assignor_state); rkcg->rkcg_assignor_state = NULL; @@ -1944,6 +2298,7 @@ static void rd_kafka_cgrp_handle_JoinGroup(rd_kafka_t *rk, int sub_cnt = 0; rd_list_t topics; rd_kafka_op_t *rko; + rd_bool_t any_member_rack = rd_false; rd_kafka_dbg(rkb->rkb_rk, CGRP, "JOINGROUP", "I am elected leader for group \"%s\" " "with %" PRId32 " member(s)", @@ -1968,7 +2323,7 @@ static void rd_kafka_cgrp_handle_JoinGroup(rd_kafka_t *rk, rd_kafka_buf_read_str(rkbuf, &MemberId); if (request->rkbuf_reqhdr.ApiVersion >= 5) rd_kafka_buf_read_str(rkbuf, &GroupInstanceId); - rd_kafka_buf_read_bytes(rkbuf, &MemberMetadata); + rd_kafka_buf_read_kbytes(rkbuf, &MemberMetadata); rkgm = &members[sub_cnt]; rkgm->rkgm_member_id = rd_kafkap_str_copy(&MemberId); @@ -1989,6 +2344,9 @@ static void rd_kafka_cgrp_handle_JoinGroup(rd_kafka_t *rk, rd_kafka_topic_partition_list_get_topic_names( rkgm->rkgm_subscription, &topics, 0 /*dont include regex*/); + if (!any_member_rack && rkgm->rkgm_rack_id && + RD_KAFKAP_STR_LEN(rkgm->rkgm_rack_id)) + any_member_rack = rd_true; } } @@ -2016,7 +2374,7 @@ static void rd_kafka_cgrp_handle_JoinGroup(rd_kafka_t *rk, rd_kafka_op_set_replyq(rko, rkcg->rkcg_ops, NULL); rd_kafka_MetadataRequest( - rkb, &topics, "partition assignor", + rkb, &topics, 
NULL, "partition assignor", rd_false /*!allow_auto_create*/, /* cgrp_update=false: * Since the subscription list may not be identical @@ -2026,7 +2384,13 @@ static void rd_kafka_cgrp_handle_JoinGroup(rd_kafka_t *rk, * avoid triggering a rejoin or error propagation * on receiving the response since some topics * may be missing. */ - rd_false, rko); + rd_false, + /* cgrp_update=false: no subscription version is used */ + -1, + /* force_racks is true if any memeber has a client rack set, + since we will require partition to rack mapping in that + case for rack-aware assignors. */ + any_member_rack, rko); rd_list_destroy(&topics); } else { @@ -2127,7 +2491,10 @@ static rd_kafka_op_res_t rd_kafka_cgrp_handle_Metadata_op(rd_kafka_t *rk, if (rko->rko_err == RD_KAFKA_RESP_ERR__DESTROY) return RD_KAFKA_OP_RES_HANDLED; /* Terminating */ - rd_kafka_cgrp_metadata_update_check(rkcg, rd_false /*dont rejoin*/); + if (rd_atomic32_get(&rkcg->rkcg_subscription_version) == + rko->rko_u.metadata.subscription_version) + rd_kafka_cgrp_metadata_update_check(rkcg, + rd_false /*dont rejoin*/); return RD_KAFKA_OP_RES_HANDLED; } @@ -2144,6 +2511,7 @@ static rd_kafka_op_res_t rd_kafka_cgrp_handle_Metadata_op(rd_kafka_t *rk, */ static int rd_kafka_cgrp_metadata_refresh(rd_kafka_cgrp_t *rkcg, int *metadata_agep, + int32_t cgrp_subscription_version, const char *reason) { rd_kafka_t *rk = rkcg->rkcg_rk; rd_kafka_op_t *rko; @@ -2153,8 +2521,8 @@ static int rd_kafka_cgrp_metadata_refresh(rd_kafka_cgrp_t *rkcg, rd_list_init(&topics, 8, rd_free); /* Insert all non-wildcard topics in cache. */ - rd_kafka_metadata_cache_hint_rktparlist( - rkcg->rkcg_rk, rkcg->rkcg_subscription, NULL, 0 /*dont replace*/); + rd_kafka_metadata_cache_hint_rktparlist(rkcg->rkcg_rk, + rkcg->rkcg_subscription, NULL); if (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_WILDCARD_SUBSCRIPTION) { /* For wildcard subscriptions make sure the @@ -2212,10 +2580,19 @@ static int rd_kafka_cgrp_metadata_refresh(rd_kafka_cgrp_t *rkcg, rd_kafka_cgrp_handle_Metadata_op); rd_kafka_op_set_replyq(rko, rkcg->rkcg_ops, 0); - err = rd_kafka_metadata_request(rkcg->rkcg_rk, NULL, &topics, - rd_false /*!allow auto create */, - rd_true /*cgrp_update*/, reason, rko); + err = rd_kafka_metadata_request( + rkcg->rkcg_rk, NULL, &topics, rd_false /*!allow auto create */, + rd_true /*cgrp_update*/, cgrp_subscription_version, reason, rko); if (err) { + /* Hint cache that something is interested in + * these topics so that they will be included in + * a future all known_topics query. */ + + rd_kafka_wrlock(rk); + rd_kafka_metadata_cache_hint(rk, &topics, NULL, + RD_KAFKA_RESP_ERR__NOENT); + rd_kafka_wrunlock(rk); + rd_kafka_dbg(rk, CGRP | RD_KAFKA_DBG_METADATA, "CGRPMETADATA", "%s: need to refresh metadata (%dms old) " "but no usable brokers available: %s", @@ -2230,8 +2607,9 @@ static int rd_kafka_cgrp_metadata_refresh(rd_kafka_cgrp_t *rkcg, -static void rd_kafka_cgrp_join(rd_kafka_cgrp_t *rkcg) { - int metadata_age; +static void rd_kafka_cgrp_join(rd_kafka_cgrp_t *rkcg, + int32_t cgrp_subscription_version) { + int metadata_age, metadata_refresh_outcome; if (rkcg->rkcg_state != RD_KAFKA_CGRP_STATE_UP || rkcg->rkcg_join_state != RD_KAFKA_CGRP_JOIN_STATE_INIT || @@ -2265,8 +2643,9 @@ static void rd_kafka_cgrp_join(rd_kafka_cgrp_t *rkcg) { */ /* We need up-to-date full metadata to continue, * refresh metadata if necessary. 
*/ - if (rd_kafka_cgrp_metadata_refresh(rkcg, &metadata_age, - "consumer join") == 1) { + metadata_refresh_outcome = rd_kafka_cgrp_metadata_refresh( + rkcg, &metadata_age, cgrp_subscription_version, "consumer join"); + if (metadata_refresh_outcome == 1) { rd_kafka_dbg(rkcg->rkcg_rk, CGRP | RD_KAFKA_DBG_CONSUMER, "JOIN", "Group \"%.*s\": " @@ -2283,6 +2662,14 @@ static void rd_kafka_cgrp_join(rd_kafka_cgrp_t *rkcg) { rkcg, RD_KAFKA_CGRP_JOIN_STATE_WAIT_METADATA); return; /* ^ async call */ + } else if (metadata_refresh_outcome == -1) { + rd_kafka_dbg(rkcg->rkcg_rk, CGRP | RD_KAFKA_DBG_CONSUMER, + "JOIN", + "Group \"%.*s\": " + "postponing join until up-to-date " + "metadata can be requested", + RD_KAFKAP_STR_PR(rkcg->rkcg_group_id)); + return; /* ^ async call */ } if (rd_list_empty(rkcg->rkcg_subscribed_topics)) @@ -2369,7 +2756,7 @@ static rd_bool_t rd_kafka_cgrp_update_subscribed_topics(rd_kafka_cgrp_t *rkcg, "clearing subscribed topics list (%d)", RD_KAFKAP_STR_PR(rkcg->rkcg_group_id), rd_list_cnt(rkcg->rkcg_subscribed_topics)); - tinfos = rd_list_new(0, (void *)rd_kafka_topic_info_destroy); + tinfos = rd_list_new(0, rd_kafka_topic_info_destroy_free); } else { if (rd_list_cnt(tinfos) == 0) @@ -2410,6 +2797,571 @@ static rd_bool_t rd_kafka_cgrp_update_subscribed_topics(rd_kafka_cgrp_t *rkcg, return rd_true; } +/** + * Compares a new target assignment with + * existing consumer group assignment. + * + * Returns that they're the same assignment + * in two cases: + * + * 1) If target assignment is present and the + * new assignment is same as target assignment, + * then we are already in the process of adding that + * target assignment. + * 2) If target assignment is not present and + * the new assignment is same as current assignment, + * then we are already at the correct assignment. + * + * @param new_target_assignment New target assignment + * + * @return Is the new assignment different from what's being handled by + * group \p cgrp? + **/ +static rd_bool_t rd_kafka_cgrp_consumer_is_new_assignment_different( + rd_kafka_cgrp_t *rkcg, + rd_kafka_topic_partition_list_t *new_target_assignment) { + int is_assignment_different; + if (rkcg->rkcg_target_assignment) { + is_assignment_different = rd_kafka_topic_partition_list_cmp( + new_target_assignment, rkcg->rkcg_target_assignment, + rd_kafka_topic_partition_by_id_cmp); + } else { + is_assignment_different = rd_kafka_topic_partition_list_cmp( + new_target_assignment, rkcg->rkcg_current_assignment, + rd_kafka_topic_partition_by_id_cmp); + } + return is_assignment_different ? rd_true : rd_false; +} + +static rd_kafka_op_res_t rd_kafka_cgrp_consumer_handle_next_assignment( + rd_kafka_cgrp_t *rkcg, + rd_kafka_topic_partition_list_t *new_target_assignment, + rd_bool_t clear_next_assignment) { + rd_bool_t is_assignment_different = rd_false; + rd_bool_t has_next_target_assignment_to_clear = + rkcg->rkcg_next_target_assignment && clear_next_assignment; + if (rkcg->rkcg_consumer_flags & RD_KAFKA_CGRP_CONSUMER_F_WAIT_ACK) { + rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "HEARTBEAT", + "Reconciliation in progress, " + "postponing next one"); + return RD_KAFKA_OP_RES_HANDLED; + } + + is_assignment_different = + rd_kafka_cgrp_consumer_is_new_assignment_different( + rkcg, new_target_assignment); + + /* Starts reconciliation only when the group is in state + * INIT or state STEADY, keeps it as next target assignment + * otherwise.
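+ *
+ * A minimal sketch (illustrative only, not part of the patch) of the
+ * gating condition applied below:
+ * @code
+ * rd_bool_t may_reconcile =
+ *     rkcg->rkcg_join_state == RD_KAFKA_CGRP_JOIN_STATE_INIT ||
+ *     rkcg->rkcg_join_state == RD_KAFKA_CGRP_JOIN_STATE_STEADY;
+ * @endcode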
*/ + if (!is_assignment_different) { + if (has_next_target_assignment_to_clear) { + rd_kafka_topic_partition_list_destroy( + rkcg->rkcg_next_target_assignment); + rkcg->rkcg_next_target_assignment = NULL; + } + + rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "HEARTBEAT", + "Not reconciling new assignment: " + "Assignment is the same. " + "Next assignment %s", + (has_next_target_assignment_to_clear + ? "cleared" + : "not cleared")); + + } else if (rkcg->rkcg_join_state == RD_KAFKA_CGRP_JOIN_STATE_INIT || + rkcg->rkcg_join_state == RD_KAFKA_CGRP_JOIN_STATE_STEADY) { + rkcg->rkcg_consumer_flags |= RD_KAFKA_CGRP_CONSUMER_F_WAIT_ACK; + if (rkcg->rkcg_target_assignment) { + rd_kafka_topic_partition_list_destroy( + rkcg->rkcg_target_assignment); + } + rkcg->rkcg_target_assignment = + rd_kafka_topic_partition_list_copy(new_target_assignment); + + if (has_next_target_assignment_to_clear) { + rd_kafka_topic_partition_list_destroy( + rkcg->rkcg_next_target_assignment); + rkcg->rkcg_next_target_assignment = NULL; + } + + if (rd_kafka_is_dbg(rkcg->rkcg_rk, CGRP)) { + char rkcg_target_assignment_str[512] = "NULL"; + + rd_kafka_topic_partition_list_str( + rkcg->rkcg_target_assignment, + rkcg_target_assignment_str, + sizeof(rkcg_target_assignment_str), 0); + + rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "HEARTBEAT", + "Reconciliation starts with new target " + "assignment \"%s\". " + "Next assignment %s", + rkcg_target_assignment_str, + (has_next_target_assignment_to_clear + ? "cleared" + : "not cleared")); + } + rd_kafka_cgrp_handle_assignment(rkcg, + rkcg->rkcg_target_assignment); + } + + return RD_KAFKA_OP_RES_HANDLED; +} + +static rd_kafka_topic_partition_list_t * +rd_kafka_cgrp_consumer_assignment_with_metadata( + rd_kafka_cgrp_t *rkcg, + rd_kafka_topic_partition_list_t *assignment, + rd_list_t **missing_topic_ids) { + int i; + rd_kafka_t *rk = rkcg->rkcg_rk; + rd_kafka_topic_partition_list_t *assignment_with_metadata = + rd_kafka_topic_partition_list_new(assignment->cnt); + for (i = 0; i < assignment->cnt; i++) { + struct rd_kafka_metadata_cache_entry *rkmce; + rd_kafka_topic_partition_t *rktpar; + char *topic_name = NULL; + rd_kafka_Uuid_t request_topic_id = + rd_kafka_topic_partition_get_topic_id( + &assignment->elems[i]); + + rd_kafka_rdlock(rk); + rkmce = + rd_kafka_metadata_cache_find_by_id(rk, request_topic_id, 1); + + if (rkmce) + topic_name = rd_strdup(rkmce->rkmce_mtopic.topic); + rd_kafka_rdunlock(rk); + + if (unlikely(!topic_name)) { + rktpar = rd_kafka_topic_partition_list_find_topic_by_id( + rkcg->rkcg_current_assignment, request_topic_id); + if (rktpar) + topic_name = rd_strdup(rktpar->topic); + } + + if (likely(topic_name != NULL)) { + rd_kafka_topic_partition_list_add_with_topic_name_and_id( + assignment_with_metadata, request_topic_id, + topic_name, assignment->elems[i].partition); + rd_free(topic_name); + continue; + } + + if (missing_topic_ids) { + if (unlikely(!*missing_topic_ids)) + *missing_topic_ids = + rd_list_new(1, rd_list_Uuid_destroy); + rd_list_add(*missing_topic_ids, + rd_kafka_Uuid_copy(&request_topic_id)); + } + rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "HEARTBEAT", + "Metadata not found for the " + "assigned topic id: %s." 
+ " Continuing without it", + rd_kafka_Uuid_base64str(&request_topic_id)); + } + if (missing_topic_ids && *missing_topic_ids) + rd_list_deduplicate(missing_topic_ids, + (void *)rd_kafka_Uuid_ptr_cmp); + return assignment_with_metadata; +} + +/** + * @brief Op callback from handle_JoinGroup + */ +static rd_kafka_op_res_t +rd_kafka_cgrp_consumer_handle_Metadata_op(rd_kafka_t *rk, + rd_kafka_q_t *rkq, + rd_kafka_op_t *rko) { + rd_kafka_cgrp_t *rkcg = rk->rk_cgrp; + rd_kafka_op_res_t assignment_handle_ret; + rd_kafka_topic_partition_list_t *assignment_with_metadata; + rd_bool_t all_partition_metadata_available; + + if (rko->rko_err == RD_KAFKA_RESP_ERR__DESTROY) + return RD_KAFKA_OP_RES_HANDLED; /* Terminating */ + + if (!rkcg->rkcg_next_target_assignment) + return RD_KAFKA_OP_RES_HANDLED; + + assignment_with_metadata = + rd_kafka_cgrp_consumer_assignment_with_metadata( + rkcg, rkcg->rkcg_next_target_assignment, NULL); + + all_partition_metadata_available = + assignment_with_metadata->cnt == + rkcg->rkcg_next_target_assignment->cnt + ? rd_true + : rd_false; + + if (rd_kafka_is_dbg(rkcg->rkcg_rk, CGRP)) { + char assignment_with_metadata_str[512] = "NULL"; + + rd_kafka_topic_partition_list_str( + assignment_with_metadata, assignment_with_metadata_str, + sizeof(assignment_with_metadata_str), 0); + + rd_kafka_dbg( + rkcg->rkcg_rk, CGRP, "HEARTBEAT", + "Metadata available for %d/%d of next target assignment, " + " which is: \"%s\"", + assignment_with_metadata->cnt, + rkcg->rkcg_next_target_assignment->cnt, + assignment_with_metadata_str); + } + + assignment_handle_ret = rd_kafka_cgrp_consumer_handle_next_assignment( + rkcg, assignment_with_metadata, all_partition_metadata_available); + rd_kafka_topic_partition_list_destroy(assignment_with_metadata); + return assignment_handle_ret; +} + +void rd_kafka_cgrp_consumer_next_target_assignment_request_metadata( + rd_kafka_t *rk, + rd_kafka_broker_t *rkb) { + rd_kafka_topic_partition_list_t *assignment_with_metadata; + rd_kafka_op_t *rko; + rd_kafka_cgrp_t *rkcg = rk->rk_cgrp; + rd_list_t *missing_topic_ids = NULL; + + if (!rkcg->rkcg_next_target_assignment->cnt) { + rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "HEARTBEAT", + "No metadata to request, continuing"); + rd_kafka_topic_partition_list_t *new_target_assignment = + rd_kafka_topic_partition_list_new(0); + rd_kafka_cgrp_consumer_handle_next_assignment( + rkcg, new_target_assignment, rd_true); + rd_kafka_topic_partition_list_destroy(new_target_assignment); + return; + } + + + assignment_with_metadata = + rd_kafka_cgrp_consumer_assignment_with_metadata( + rkcg, rkcg->rkcg_next_target_assignment, &missing_topic_ids); + + if (!missing_topic_ids) { + /* Metadata is already available for all the topics. */ + rd_kafka_cgrp_consumer_handle_next_assignment( + rkcg, assignment_with_metadata, rd_true); + rd_kafka_topic_partition_list_destroy(assignment_with_metadata); + return; + } + rd_kafka_topic_partition_list_destroy(assignment_with_metadata); + + /* Request missing metadata. */ + rko = rd_kafka_op_new_cb(rkcg->rkcg_rk, RD_KAFKA_OP_METADATA, + rd_kafka_cgrp_consumer_handle_Metadata_op); + rd_kafka_op_set_replyq(rko, rkcg->rkcg_ops, NULL); + rd_kafka_MetadataRequest( + rkb, NULL, missing_topic_ids, "ConsumerGroupHeartbeat API Response", + rd_false /*!allow_auto_create*/, rd_false, + -1 /* no subscription version is used */, rd_false, rko); + rd_list_destroy(missing_topic_ids); +} + +/** + * @brief Handle Heartbeat response. 
+ */ +void rd_kafka_cgrp_handle_ConsumerGroupHeartbeat(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + rd_kafka_resp_err_t err, + rd_kafka_buf_t *rkbuf, + rd_kafka_buf_t *request, + void *opaque) { + rd_kafka_cgrp_t *rkcg = rk->rk_cgrp; + const int log_decode_errors = LOG_ERR; + int16_t error_code = 0; + int actions = 0; + rd_kafkap_str_t error_str = RD_KAFKAP_STR_INITIALIZER_EMPTY; + rd_kafkap_str_t member_id; + int32_t member_epoch; + int32_t heartbeat_interval_ms; + + if (err == RD_KAFKA_RESP_ERR__DESTROY) + return; + + rd_dassert(rkcg->rkcg_flags & RD_KAFKA_CGRP_F_HEARTBEAT_IN_TRANSIT); + + if (rd_kafka_cgrp_will_leave(rkcg)) + err = RD_KAFKA_RESP_ERR__OUTDATED; + if (err) + goto err; + + rd_kafka_buf_read_throttle_time(rkbuf); + + rd_kafka_buf_read_i16(rkbuf, &error_code); + rd_kafka_buf_read_str(rkbuf, &error_str); + + if (error_code) { + err = error_code; + goto err; + } + + rd_kafka_buf_read_str(rkbuf, &member_id); + if (!RD_KAFKAP_STR_IS_NULL(&member_id)) { + rd_kafka_cgrp_set_member_id(rkcg, member_id.str); + } + + rd_kafka_buf_read_i32(rkbuf, &member_epoch); + rd_kafka_buf_read_i32(rkbuf, &heartbeat_interval_ms); + + int8_t are_assignments_present; + rd_kafka_buf_read_i8(rkbuf, &are_assignments_present); + rkcg->rkcg_generation_id = member_epoch; + if (heartbeat_interval_ms > 0) { + rkcg->rkcg_heartbeat_intvl_ms = heartbeat_interval_ms; + } + + if (are_assignments_present == 1) { + rd_kafka_topic_partition_list_t *assigned_topic_partitions; + const rd_kafka_topic_partition_field_t assignments_fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; + assigned_topic_partitions = rd_kafka_buf_read_topic_partitions( + rkbuf, rd_true, rd_false /* Don't use Topic Name */, 0, + assignments_fields); + + if (rd_kafka_is_dbg(rk, CGRP)) { + char assigned_topic_partitions_str[512] = "NULL"; + + if (assigned_topic_partitions) { + rd_kafka_topic_partition_list_str( + assigned_topic_partitions, + assigned_topic_partitions_str, + sizeof(assigned_topic_partitions_str), 0); + } + + rd_kafka_dbg( + rk, CGRP, "HEARTBEAT", + "ConsumerGroupHeartbeat response received target " + "assignment \"%s\"", + assigned_topic_partitions_str); + } + + if (assigned_topic_partitions) { + RD_IF_FREE(rkcg->rkcg_next_target_assignment, + rd_kafka_topic_partition_list_destroy); + rkcg->rkcg_next_target_assignment = NULL; + if (rd_kafka_cgrp_consumer_is_new_assignment_different( + rkcg, assigned_topic_partitions)) { + rkcg->rkcg_next_target_assignment = + assigned_topic_partitions; + } else { + rd_kafka_topic_partition_list_destroy( + assigned_topic_partitions); + assigned_topic_partitions = NULL; + } + } + } + + if (rkcg->rkcg_join_state == RD_KAFKA_CGRP_JOIN_STATE_STEADY && + (rkcg->rkcg_consumer_flags & RD_KAFKA_CGRP_CONSUMER_F_WAIT_ACK) && + rkcg->rkcg_target_assignment) { + if (rkcg->rkcg_consumer_flags & + RD_KAFKA_CGRP_CONSUMER_F_SENDING_ACK) { + if (rkcg->rkcg_current_assignment) + rd_kafka_topic_partition_list_destroy( + rkcg->rkcg_current_assignment); + rkcg->rkcg_current_assignment = + rd_kafka_topic_partition_list_copy( + rkcg->rkcg_target_assignment); + rd_kafka_topic_partition_list_destroy( + rkcg->rkcg_target_assignment); + rkcg->rkcg_target_assignment = NULL; + rkcg->rkcg_consumer_flags &= + ~RD_KAFKA_CGRP_CONSUMER_F_WAIT_ACK; + + if (rd_kafka_is_dbg(rkcg->rkcg_rk, CGRP)) { + char rkcg_current_assignment_str[512] = "NULL"; + + rd_kafka_topic_partition_list_str( + rkcg->rkcg_current_assignment, + rkcg_current_assignment_str, + sizeof(rkcg_current_assignment_str), 0); + 
+ rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "HEARTBEAT", + "Target assignment acked, new " + "current assignment " + " \"%s\"", + rkcg_current_assignment_str); + } + } else if (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_SUBSCRIPTION) { + /* We've finished reconciliation but we weren't + * sending an ack, need to send a new HB with the ack. + */ + rd_kafka_cgrp_consumer_expedite_next_heartbeat( + rkcg, "not subscribed anymore"); + } + } + + if (rkcg->rkcg_consumer_flags & + RD_KAFKA_CGRP_CONSUMER_F_SERVE_PENDING && + rkcg->rkcg_join_state == RD_KAFKA_CGRP_JOIN_STATE_STEADY) { + /* TODO: Check if this should be done only for the steady state? + */ + rd_kafka_assignment_serve(rk); + rkcg->rkcg_consumer_flags &= + ~RD_KAFKA_CGRP_CONSUMER_F_SERVE_PENDING; + } + + if (rkcg->rkcg_next_target_assignment) { + if (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_SUBSCRIPTION) { + rd_kafka_cgrp_consumer_next_target_assignment_request_metadata( + rk, rkb); + } else { + /* Consumer left the group sending an HB request + * while this one was in-flight. */ + rd_kafka_topic_partition_list_destroy( + rkcg->rkcg_next_target_assignment); + rkcg->rkcg_next_target_assignment = NULL; + } + } + + if (rd_kafka_cgrp_consumer_subscription_preconditions_met(rkcg)) + rd_kafka_cgrp_consumer_expedite_next_heartbeat( + rkcg, "send new subscription"); + + rkcg->rkcg_flags &= ~RD_KAFKA_CGRP_F_HEARTBEAT_IN_TRANSIT; + rkcg->rkcg_consumer_flags &= + ~RD_KAFKA_CGRP_CONSUMER_F_SENDING_NEW_SUBSCRIPTION & + ~RD_KAFKA_CGRP_CONSUMER_F_SEND_FULL_REQUEST & + ~RD_KAFKA_CGRP_CONSUMER_F_SENDING_ACK; + rd_kafka_cgrp_maybe_clear_heartbeat_failed_err(rkcg); + rkcg->rkcg_last_heartbeat_err = RD_KAFKA_RESP_ERR_NO_ERROR; + rkcg->rkcg_expedite_heartbeat_retries = 0; + + return; + + +err_parse: + err = rkbuf->rkbuf_err; + +err: + rkcg->rkcg_last_heartbeat_err = err; + rkcg->rkcg_flags &= ~RD_KAFKA_CGRP_F_HEARTBEAT_IN_TRANSIT; + + switch (err) { + case RD_KAFKA_RESP_ERR__DESTROY: + /* quick cleanup */ + return; + + case RD_KAFKA_RESP_ERR_COORDINATOR_LOAD_IN_PROGRESS: + rd_kafka_dbg( + rkcg->rkcg_rk, CONSUMER, "HEARTBEAT", + "ConsumerGroupHeartbeat failed due to coordinator (%s) " + "loading in progress: %s: " + "retrying", + rkcg->rkcg_curr_coord + ? rd_kafka_broker_name(rkcg->rkcg_curr_coord) + : "none", + rd_kafka_err2str(err)); + actions = RD_KAFKA_ERR_ACTION_RETRY; + break; + + case RD_KAFKA_RESP_ERR_NOT_COORDINATOR_FOR_GROUP: + case RD_KAFKA_RESP_ERR_GROUP_COORDINATOR_NOT_AVAILABLE: + case RD_KAFKA_RESP_ERR__TRANSPORT: + rd_kafka_dbg( + rkcg->rkcg_rk, CONSUMER, "HEARTBEAT", + "ConsumerGroupHeartbeat failed due to coordinator (%s) " + "no longer available: %s: " + "re-querying for coordinator", + rkcg->rkcg_curr_coord + ? 
rd_kafka_broker_name(rkcg->rkcg_curr_coord) + : "none", + rd_kafka_err2str(err)); + /* Remain in joined state and keep querying for coordinator */ + actions = RD_KAFKA_ERR_ACTION_REFRESH; + break; + + case RD_KAFKA_RESP_ERR_UNKNOWN_MEMBER_ID: + case RD_KAFKA_RESP_ERR_FENCED_MEMBER_EPOCH: + rd_kafka_dbg(rkcg->rkcg_rk, CONSUMER, "HEARTBEAT", + "ConsumerGroupHeartbeat failed due to: %s: " + "will rejoin the group", + rd_kafka_err2str(err)); + rkcg->rkcg_consumer_flags |= + RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN; + return; + + case RD_KAFKA_RESP_ERR_INVALID_REQUEST: + case RD_KAFKA_RESP_ERR_GROUP_MAX_SIZE_REACHED: + case RD_KAFKA_RESP_ERR_UNSUPPORTED_ASSIGNOR: + case RD_KAFKA_RESP_ERR_UNSUPPORTED_VERSION: + case RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE: + case RD_KAFKA_RESP_ERR_UNRELEASED_INSTANCE_ID: + case RD_KAFKA_RESP_ERR_GROUP_AUTHORIZATION_FAILED: + actions = RD_KAFKA_ERR_ACTION_FATAL; + break; + default: + actions = rd_kafka_err_action( + rkb, err, request, + + RD_KAFKA_ERR_ACTION_SPECIAL, + RD_KAFKA_RESP_ERR_TOPIC_AUTHORIZATION_FAILED, + + RD_KAFKA_ERR_ACTION_END); + break; + } + + if (actions & RD_KAFKA_ERR_ACTION_FATAL) { + rd_kafka_set_fatal_error( + rkcg->rkcg_rk, err, + "ConsumerGroupHeartbeat fatal error: %s", + rd_kafka_err2str(err)); + rd_kafka_cgrp_revoke_all_rejoin_maybe( + rkcg, rd_true, /*assignments lost*/ + rd_true, /*initiating*/ + "Fatal error in ConsumerGroupHeartbeat API response"); + return; + } + + if (!rkcg->rkcg_heartbeat_intvl_ms) { + /* When an error happens on the first HB, it should always + * be retried, unless fatal, to avoid entering a tight loop + * and to use exponential backoff. */ + actions |= RD_KAFKA_ERR_ACTION_RETRY; + } + + if (actions & RD_KAFKA_ERR_ACTION_REFRESH) { + /* Re-query for coordinator */ + rkcg->rkcg_consumer_flags |= + RD_KAFKA_CGRP_CONSUMER_F_SEND_FULL_REQUEST; + rd_kafka_cgrp_coord_query(rkcg, rd_kafka_err2str(err)); + /* If coordinator changes, HB will be expedited. */ + } + + if (actions & RD_KAFKA_ERR_ACTION_SPECIAL) { + rd_ts_t min_error_interval = + RD_MAX(rkcg->rkcg_heartbeat_intvl_ms * 1000, + /* default group.consumer.heartbeat.interval.ms */ + 5000000); + if (rkcg->rkcg_last_err != err || + (rd_clock() > + rkcg->rkcg_ts_last_err + min_error_interval)) { + rd_kafka_cgrp_set_last_err(rkcg, err); + rd_kafka_consumer_err( + rkcg->rkcg_q, rd_kafka_broker_id(rkb), err, 0, NULL, + NULL, err, + "ConsumerGroupHeartbeat failed: %s%s%.*s", + rd_kafka_err2str(err), + RD_KAFKAP_STR_LEN(&error_str) ? ": " : "", + RD_KAFKAP_STR_PR(&error_str)); + } + } + + if (actions & RD_KAFKA_ERR_ACTION_RETRY && + rkcg->rkcg_flags & RD_KAFKA_CGRP_F_SUBSCRIPTION && + !rd_kafka_cgrp_will_leave(rkcg) && + rd_kafka_buf_retry(rkb, request)) { + /* Retry */ + rkcg->rkcg_flags |= RD_KAFKA_CGRP_F_HEARTBEAT_IN_TRANSIT; + } +} + /** * @brief Handle Heartbeat response. @@ -2590,13 +3542,15 @@ static void rd_kafka_cgrp_terminated(rd_kafka_cgrp_t *rkcg) { rd_kafka_q_purge(rkcg->rkcg_wait_coord_q); - /* Disable and empty ops queue since there will be no + /* Disable ops queue since there will be no * (broker) thread serving it anymore after the unassign_broker * below. - * This prevents hang on destroy where responses are enqueued on - * rkcg_ops without anything serving the queue. */ + * As the queue is forwarded to rk_ops, it cannot be purged, + * so consumer group operations need to be served with a no-op + * when `rkcg_terminated` is true.
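+ * A sketch (illustrative) of the no-op guard an op handler can apply:
+ * @code
+ * if (rd_atomic32_get(&rkcg->rkcg_terminated))
+ *         return RD_KAFKA_OP_RES_HANDLED; // served as no-op
+ * @endcode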
*/ + + rd_atomic32_set(&rkcg->rkcg_terminated, rd_true); rd_kafka_q_disable(rkcg->rkcg_ops); - rd_kafka_q_purge(rkcg->rkcg_ops); if (rkcg->rkcg_curr_coord) rd_kafka_cgrp_coord_clear_broker(rkcg); @@ -2606,8 +3560,6 @@ static void rd_kafka_cgrp_terminated(rd_kafka_cgrp_t *rkcg) { rkcg->rkcg_coord = NULL; } - rd_atomic32_set(&rkcg->rkcg_terminated, rd_true); - rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "CGRPTERM", "Consumer group sub-system terminated%s", rkcg->rkcg_reply_rko ? " (will enqueue reply)" : ""); @@ -2621,6 +3573,9 @@ static void rd_kafka_cgrp_terminated(rd_kafka_cgrp_t *rkcg) { /* Remove cgrp application queue forwarding, if any. */ rd_kafka_q_fwd_set(rkcg->rkcg_q, NULL); + + /* Destroy KIP-848 consumer group structures */ + rd_kafka_cgrp_consumer_reset(rkcg); } @@ -2637,7 +3592,11 @@ static RD_INLINE int rd_kafka_cgrp_try_terminate(rd_kafka_cgrp_t *rkcg) { if (likely(!(rkcg->rkcg_flags & RD_KAFKA_CGRP_F_TERMINATE))) return 0; - /* Check if wait-coord queue has timed out. */ + /* Check if wait-coord queue has timed out. + + FIXME: Remove usage of `group_session_timeout_ms` for the new + consumer group protocol implementation defined in KIP-848. + */ if (rd_kafka_q_len(rkcg->rkcg_wait_coord_q) > 0 && rkcg->rkcg_ts_terminate + (rkcg->rkcg_rk->rk_conf.group_session_timeout_ms * 1000) < @@ -2734,6 +3693,9 @@ static void rd_kafka_cgrp_partition_del(rd_kafka_cgrp_t *rkcg, rd_kafka_toppar_lock(rktp); rd_assert(rktp->rktp_flags & RD_KAFKA_TOPPAR_F_ON_CGRP); rktp->rktp_flags &= ~RD_KAFKA_TOPPAR_F_ON_CGRP; + + rd_kafka_toppar_purge_internal_fetch_queue_maybe(rktp); + rd_kafka_toppar_unlock(rktp); rd_list_remove(&rkcg->rkcg_toppars, rktp); @@ -2754,7 +3716,6 @@ static void rd_kafka_cgrp_partition_del(rd_kafka_cgrp_t *rkcg, static int rd_kafka_cgrp_defer_offset_commit(rd_kafka_cgrp_t *rkcg, rd_kafka_op_t *rko, const char *reason) { - /* wait_coord_q is disabled session.timeout.ms after * group close() has been initiated. */ if (rko->rko_u.offset_commit.ts_timeout != 0 || @@ -2773,6 +3734,11 @@ static int rd_kafka_cgrp_defer_offset_commit(rd_kafka_cgrp_t *rkcg, : "none"); rko->rko_flags |= RD_KAFKA_OP_F_REPROCESS; + + /* FIXME: Remove `group_session_timeout_ms` for the new protocol + * defined in KIP-848 as this property is deprecated on the client + * side in the new protocol. + */ rko->rko_u.offset_commit.ts_timeout = rd_clock() + (rkcg->rkcg_rk->rk_conf.group_session_timeout_ms * 1000); @@ -2781,6 +3747,45 @@ static int rd_kafka_cgrp_defer_offset_commit(rd_kafka_cgrp_t *rkcg, return 1; } +/** + * @brief Defer offset commit (rko) until coordinator is available (KIP-848). + * + * @returns 1 if the rko was deferred or 0 if the defer queue is disabled + * or rko already deferred. + */ +static int rd_kafka_cgrp_consumer_defer_offset_commit(rd_kafka_cgrp_t *rkcg, + rd_kafka_op_t *rko, + const char *reason) { + /* wait_coord_q is disabled session.timeout.ms after + * group close() has been initiated.
*/ + if ((rko->rko_u.offset_commit.ts_timeout != 0 && + rd_clock() >= rko->rko_u.offset_commit.ts_timeout) || + !rd_kafka_q_ready(rkcg->rkcg_wait_coord_q)) + return 0; + + rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "COMMIT", + "Group \"%s\": " + "unable to OffsetCommit in state %s: %s: " + "retrying later", + rkcg->rkcg_group_id->str, + rd_kafka_cgrp_state_names[rkcg->rkcg_state], reason); + + rko->rko_flags |= RD_KAFKA_OP_F_REPROCESS; + + if (!rko->rko_u.offset_commit.ts_timeout) { + rko->rko_u.offset_commit.ts_timeout = + rd_clock() + + (rkcg->rkcg_rk->rk_conf.group_session_timeout_ms * 1000); + } + + /* Reset partition level error before retrying */ + rd_kafka_topic_partition_list_set_err( + rko->rko_u.offset_commit.partitions, RD_KAFKA_RESP_ERR_NO_ERROR); + + rd_kafka_q_enq(rkcg->rkcg_wait_coord_q, rko); + + return 1; +} /** * @brief Update the committed offsets for the partitions in \p offsets, @@ -2832,7 +3837,8 @@ static int rd_kafka_cgrp_update_committed_offsets( continue; rd_kafka_toppar_lock(rktp); - rktp->rktp_committed_offset = rktpar->offset; + rktp->rktp_committed_pos = + rd_kafka_topic_partition_get_fetch_pos(rktpar); rd_kafka_toppar_unlock(rktp); rd_kafka_toppar_destroy(rktp); /* from get_toppar() */ @@ -2978,22 +3984,32 @@ static void rd_kafka_cgrp_op_handle_OffsetCommit(rd_kafka_t *rk, rd_kafka_err2str(err)); } - /* * Error handling */ switch (err) { case RD_KAFKA_RESP_ERR_UNKNOWN_MEMBER_ID: - /* Revoke assignment and rebalance on unknown member */ - rd_kafka_cgrp_set_member_id(rk->rk_cgrp, ""); - rd_kafka_cgrp_revoke_all_rejoin_maybe( - rkcg, rd_true /*assignment is lost*/, - rd_true /*this consumer is initiating*/, - "OffsetCommit error: Unknown member"); + if (rkcg->rkcg_group_protocol == + RD_KAFKA_GROUP_PROTOCOL_CONSUMER) { + rd_kafka_cgrp_consumer_expedite_next_heartbeat( + rk->rk_cgrp, "OffsetCommit error: Unknown member"); + } else { + /* Revoke assignment and rebalance on unknown member */ + rd_kafka_cgrp_set_member_id(rk->rk_cgrp, ""); + rd_kafka_cgrp_revoke_all_rejoin_maybe( + rkcg, rd_true /*assignment is lost*/, + rd_true /*this consumer is initiating*/, + "OffsetCommit error: Unknown member"); + } break; case RD_KAFKA_RESP_ERR_ILLEGAL_GENERATION: - /* Revoke assignment and rebalance on illegal generation */ + /* Revoke assignment and rebalance on illegal generation, + * only if not rebalancing, because a new generation id + * can be received soon after this error. */ + if (RD_KAFKA_CGRP_REBALANCING(rkcg)) + break; + rk->rk_cgrp->rkcg_generation_id = -1; rd_kafka_cgrp_revoke_all_rejoin_maybe( rkcg, rd_true /*assignment is lost*/, @@ -3004,6 +4020,21 @@ static void rd_kafka_cgrp_op_handle_OffsetCommit(rd_kafka_t *rk, case RD_KAFKA_RESP_ERR__IN_PROGRESS: return; /* Retrying */ + case RD_KAFKA_RESP_ERR_STALE_MEMBER_EPOCH: + /* FIXME: Add logs.*/ + rd_kafka_cgrp_consumer_expedite_next_heartbeat( + rk->rk_cgrp, "OffsetCommit error: Stale member epoch"); + if (!rd_strcmp(rko_orig->rko_u.offset_commit.reason, "manual")) + /* Don't retry manual commits giving this error. + * TODO: do this in a faster and cleaner way + * with a bool. 
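+ * A manual commit surfaces the error to the application instead
+ * of being retried transparently with a newer epoch.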
*/ + break; + + if (rd_kafka_cgrp_consumer_defer_offset_commit( + rkcg, rko_orig, rd_kafka_err2str(err))) + return; + break; + case RD_KAFKA_RESP_ERR_NOT_COORDINATOR: case RD_KAFKA_RESP_ERR_COORDINATOR_NOT_AVAILABLE: case RD_KAFKA_RESP_ERR__TRANSPORT: @@ -3046,7 +4077,8 @@ static void rd_kafka_cgrp_op_handle_OffsetCommit(rd_kafka_t *rk, !(err == RD_KAFKA_RESP_ERR__NO_OFFSET && rko_orig->rko_u.offset_commit.silent_empty)) { /* Propagate commit results (success or permanent error) - * unless we're shutting down or commit was empty. */ + * unless we're shutting down or commit was empty, or if + * there was a rebalance in progress. */ rd_kafka_cgrp_propagate_commit_result(rkcg, rko_orig, err, errcnt, offsets); } @@ -3076,8 +4108,9 @@ static size_t rd_kafka_topic_partition_has_absolute_offset( * * \p rko...silent_empty: if there are no offsets to commit bail out * silently without posting an op on the reply queue. - * \p set_offsets: set offsets in rko->rko_u.offset_commit.partitions from - * the rktp's stored offset. + * \p set_offsets: set offsets and epochs in + * rko->rko_u.offset_commit.partitions from the rktp's + * stored offset. * * Locality: cgrp thread */ @@ -3293,6 +4326,19 @@ rd_kafka_trigger_waiting_subscribe_maybe(rd_kafka_cgrp_t *rkcg) { return rd_false; } +static void rd_kafka_cgrp_start_max_poll_interval_timer(rd_kafka_cgrp_t *rkcg) { + /* If using subscribe(), start a timer to enforce + * `max.poll.interval.ms`. + * Instead of restarting the timer on each ...poll() + * call, which would be costly (once per message), + * set up an intervalled timer that checks a timestamp + * (that is updated on ..poll()). + * The timer interval is 2 hz. */ + rd_kafka_timer_start( + &rkcg->rkcg_rk->rk_timers, &rkcg->rkcg_max_poll_interval_tmr, + 500 * 1000ll /* 500ms */, + rd_kafka_cgrp_max_poll_interval_check_tmr_cb, rkcg); +} /** * @brief Incrementally add to an existing partition assignment @@ -3315,20 +4361,8 @@ rd_kafka_cgrp_incremental_assign(rd_kafka_cgrp_t *rkcg, "incremental assign called"); rd_kafka_cgrp_set_join_state(rkcg, RD_KAFKA_CGRP_JOIN_STATE_STEADY); - if (rkcg->rkcg_subscription) { - /* If using subscribe(), start a timer to enforce - * `max.poll.interval.ms`. - * Instead of restarting the timer on each ...poll() - * call, which would be costly (once per message), - * set up an intervalled timer that checks a timestamp - * (that is updated on ..poll()). - * The timer interval is 2 hz. */ - rd_kafka_timer_start( - &rkcg->rkcg_rk->rk_timers, - &rkcg->rkcg_max_poll_interval_tmr, - 500 * 1000ll /* 500ms */, - rd_kafka_cgrp_max_poll_interval_check_tmr_cb, rkcg); + rd_kafka_cgrp_start_max_poll_interval_timer(rkcg); } } @@ -3477,6 +4511,11 @@ static void rd_kafka_cgrp_unassign_done(rd_kafka_cgrp_t *rkcg) { * change in the rkcg. */ void rd_kafka_cgrp_assignment_done(rd_kafka_cgrp_t *rkcg) { + if (rkcg->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CONSUMER) { + rd_kafka_cgrp_consumer_assignment_done(rkcg); + return; + } + rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "ASSIGNDONE", "Group \"%s\": " "assignment operations done in join-state %s " @@ -3548,7 +4587,6 @@ static rd_kafka_error_t *rd_kafka_cgrp_unassign(rd_kafka_cgrp_t *rkcg) { return NULL; } - /** * @brief Set new atomic partition assignment * May update \p assignment but will not hold on to it. 
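Both assign paths now share the timer setup through the new rd_kafka_cgrp_start_max_poll_interval_timer() helper extracted above. A minimal sketch (illustrative, with a simplified signature that is not part of the patch) of the check the 500 ms callback relies on, comparing a poll timestamp instead of re-arming a timer on every poll():

/* Illustrative sketch, not from the patch. */
static void max_poll_interval_check(rd_ts_t ts_last_poll,
                                    int max_poll_interval_ms) {
        if (rd_clock() > ts_last_poll + max_poll_interval_ms * 1000ll) {
                /* Exceeded: leave the group and trigger a rebalance. */
        }
}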
@@ -3581,20 +4619,8 @@ rd_kafka_cgrp_assign(rd_kafka_cgrp_t *rkcg, rd_kafka_assignment_resume(rkcg->rkcg_rk, "assign called"); rd_kafka_cgrp_set_join_state(rkcg, RD_KAFKA_CGRP_JOIN_STATE_STEADY); - if (rkcg->rkcg_subscription) { - /* If using subscribe(), start a timer to enforce - * `max.poll.interval.ms`. - * Instead of restarting the timer on each ...poll() - * call, which would be costly (once per message), - * set up an intervalled timer that checks a timestamp - * (that is updated on ..poll()). - * The timer interval is 2 hz. */ - rd_kafka_timer_start( - &rkcg->rkcg_rk->rk_timers, - &rkcg->rkcg_max_poll_interval_tmr, - 500 * 1000ll /* 500ms */, - rd_kafka_cgrp_max_poll_interval_check_tmr_cb, rkcg); + rd_kafka_cgrp_start_max_poll_interval_timer(rkcg); } } @@ -3641,7 +4667,7 @@ rd_kafka_toppar_member_info_map_to_list(map_toppar_member_info_t *map) { rd_kafka_topic_partition_list_new((int)RD_MAP_CNT(map)); RD_MAP_FOREACH_KEY(k, map) { - rd_kafka_topic_partition_list_add(list, k->topic, k->partition); + rd_kafka_topic_partition_list_add_copy(list, k); } return list; @@ -4152,25 +5178,36 @@ rd_kafka_cgrp_max_poll_interval_check_tmr_cb(rd_kafka_timers_t *rkts, rd_kafka_timer_stop(rkts, &rkcg->rkcg_max_poll_interval_tmr, 1 /*lock*/); - /* Leave the group before calling rebalance since the standard leave - * will be triggered first after the rebalance callback has been served. - * But since the application is blocked still doing processing - * that leave will be further delayed. - * - * KIP-345: static group members should continue to respect - * `max.poll.interval.ms` but should not send a LeaveGroupRequest. - */ - if (!RD_KAFKA_CGRP_IS_STATIC_MEMBER(rkcg)) - rd_kafka_cgrp_leave(rkcg); + if (rkcg->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CONSUMER) { + rd_kafka_cgrp_consumer_leave(rkcg); + rkcg->rkcg_consumer_flags |= + RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN; + rd_kafka_cgrp_consumer_expedite_next_heartbeat( + rkcg, + "max poll interval " + "exceeded"); + } else { + /* Leave the group before calling rebalance since the standard + * leave will be triggered first after the rebalance callback + * has been served. But since the application is blocked still + * doing processing that leave will be further delayed. + * + * KIP-345: static group members should continue to respect + * `max.poll.interval.ms` but should not send a + * LeaveGroupRequest. + */ + if (!RD_KAFKA_CGRP_IS_STATIC_MEMBER(rkcg)) + rd_kafka_cgrp_leave(rkcg); + /* Timing out or leaving the group invalidates the member id, + * reset it now to avoid an ERR_UNKNOWN_MEMBER_ID on the next + * join. */ + rd_kafka_cgrp_set_member_id(rkcg, ""); - /* Timing out or leaving the group invalidates the member id, reset it - * now to avoid an ERR_UNKNOWN_MEMBER_ID on the next join. */ - rd_kafka_cgrp_set_member_id(rkcg, ""); - - /* Trigger rebalance */ - rd_kafka_cgrp_revoke_all_rejoin_maybe(rkcg, rd_true /*lost*/, - rd_true /*initiating*/, - "max.poll.interval.ms exceeded"); + /* Trigger rebalance */ + rd_kafka_cgrp_revoke_all_rejoin_maybe( + rkcg, rd_true /*lost*/, rd_true /*initiating*/, + "max.poll.interval.ms exceeded"); + } } @@ -4315,6 +5352,59 @@ rd_kafka_cgrp_calculate_subscribe_revoking_partitions( return revoking; } +/** + * @brief Set the new subscription and increase subscription version. + * + * @return The new subscription version. 
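+ *
+ * Metadata replies carry the subscription version they were requested
+ * with, so stale replies can be discarded. A sketch (illustrative) of
+ * the check, mirroring rd_kafka_cgrp_handle_Metadata_op():
+ * @code
+ * if (rd_atomic32_get(&rkcg->rkcg_subscription_version) !=
+ *     rko->rko_u.metadata.subscription_version)
+ *         return RD_KAFKA_OP_RES_HANDLED; // outdated reply, ignore
+ * @endcode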
+ */ +static int32_t +rd_kafka_cgrp_subscription_set(rd_kafka_cgrp_t *rkcg, + rd_kafka_topic_partition_list_t *rktparlist) { + + rkcg->rkcg_flags &= ~(RD_KAFKA_CGRP_F_SUBSCRIPTION | + RD_KAFKA_CGRP_F_WILDCARD_SUBSCRIPTION); + RD_IF_FREE(rkcg->rkcg_subscription, + rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(rkcg->rkcg_subscription_topics, + rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(rkcg->rkcg_subscription_regex, rd_kafkap_str_destroy); + + rkcg->rkcg_subscription = rktparlist; + + if (rkcg->rkcg_subscription) { + rkcg->rkcg_flags |= RD_KAFKA_CGRP_F_SUBSCRIPTION; + if (rd_kafka_topic_partition_list_regex_cnt( + rkcg->rkcg_subscription) > 0) + rkcg->rkcg_flags |= + RD_KAFKA_CGRP_F_WILDCARD_SUBSCRIPTION; + + if (rkcg->rkcg_group_protocol == + RD_KAFKA_GROUP_PROTOCOL_CONSUMER) { + rkcg->rkcg_subscription_regex = + rd_kafka_topic_partition_list_combine_regexes( + rkcg->rkcg_subscription); + rkcg->rkcg_subscription_topics = + rd_kafka_topic_partition_list_remove_regexes( + rkcg->rkcg_subscription); + rkcg->rkcg_consumer_flags |= + RD_KAFKA_CGRP_CONSUMER_F_SUBSCRIBED_ONCE | + RD_KAFKA_CGRP_CONSUMER_F_SEND_NEW_SUBSCRIPTION; + rd_kafka_cgrp_maybe_clear_heartbeat_failed_err(rkcg); + } + } else { + rkcg->rkcg_subscription_regex = NULL; + rkcg->rkcg_subscription_topics = NULL; + if (rkcg->rkcg_next_subscription) { + /* When unsubscribing clear next subscription too */ + rd_kafka_topic_partition_list_destroy( + rkcg->rkcg_next_subscription); + rkcg->rkcg_next_subscription = NULL; + } + } + + return rd_atomic32_add(&rkcg->rkcg_subscription_version, 1); +} + /** * @brief Handle a new subscription that is modifying an existing subscription @@ -4331,11 +5421,7 @@ rd_kafka_cgrp_modify_subscription(rd_kafka_cgrp_t *rkcg, rd_kafka_topic_partition_list_t *errored; int metadata_age; int old_cnt = rkcg->rkcg_subscription->cnt; - - rkcg->rkcg_flags &= ~RD_KAFKA_CGRP_F_WILDCARD_SUBSCRIPTION; - - if (rd_kafka_topic_partition_list_regex_cnt(rktparlist) > 0) - rkcg->rkcg_flags |= RD_KAFKA_CGRP_F_WILDCARD_SUBSCRIPTION; + int32_t cgrp_subscription_version; /* Topics in rkcg_subscribed_topics that don't match any pattern in the new subscription. */ @@ -4346,10 +5432,11 @@ rd_kafka_cgrp_modify_subscription(rd_kafka_cgrp_t *rkcg, revoking = rd_kafka_cgrp_calculate_subscribe_revoking_partitions( rkcg, unsubscribing_topics); - rd_kafka_topic_partition_list_destroy(rkcg->rkcg_subscription); - rkcg->rkcg_subscription = rktparlist; + cgrp_subscription_version = + rd_kafka_cgrp_subscription_set(rkcg, rktparlist); if (rd_kafka_cgrp_metadata_refresh(rkcg, &metadata_age, + cgrp_subscription_version, "modify subscription") == 1) { rd_kafka_dbg(rkcg->rkcg_rk, CGRP | RD_KAFKA_DBG_CONSUMER, "MODSUB", @@ -4392,7 +5479,7 @@ rd_kafka_cgrp_modify_subscription(rd_kafka_cgrp_t *rkcg, /* Create a list of the topics in metadata that matches the new * subscription */ tinfos = rd_list_new(rkcg->rkcg_subscription->cnt, - (void *)rd_kafka_topic_info_destroy); + rd_kafka_topic_info_destroy_free); /* Unmatched topics will be added to the errored list. */ errored = rd_kafka_topic_partition_list_new(0); @@ -4432,12 +5519,12 @@ rd_kafka_cgrp_modify_subscription(rd_kafka_cgrp_t *rkcg, return RD_KAFKA_RESP_ERR_NO_ERROR; } - /** - * Remove existing topic subscription. + * Remove existing topic subscription (KIP 848). 
 */
-static rd_kafka_resp_err_t rd_kafka_cgrp_unsubscribe(rd_kafka_cgrp_t *rkcg,
-                                                     rd_bool_t leave_group) {
+static rd_kafka_resp_err_t
+rd_kafka_cgrp_consumer_unsubscribe(rd_kafka_cgrp_t *rkcg,
+                                   rd_bool_t leave_group) {
 
        rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "UNSUBSCRIBE",
                     "Group \"%.*s\": unsubscribe from current %ssubscription "
@@ -4454,11 +5541,51 @@ static rd_kafka_resp_err_t rd_kafka_cgrp_unsubscribe(rd_kafka_cgrp_t *rkcg,
        rd_kafka_timer_stop(&rkcg->rkcg_rk->rk_timers,
                            &rkcg->rkcg_max_poll_interval_tmr, 1 /*lock*/);
 
-       if (rkcg->rkcg_subscription) {
-               rd_kafka_topic_partition_list_destroy(rkcg->rkcg_subscription);
-               rkcg->rkcg_subscription = NULL;
+       rd_kafka_cgrp_subscription_set(rkcg, NULL);
+
+       /* When the group is rejoining, the leave group call either:
+        * - has already been done when max.poll.interval.ms was exceeded
+        * - is not necessary because the member has been fenced
+        *
+        * When the group is already leaving we just wait until the previous
+        * leave request finishes.
+        */
+       if (leave_group && !rd_kafka_cgrp_consumer_will_rejoin(rkcg) &&
+           RD_KAFKA_CGRP_HAS_JOINED(rkcg) && !rd_kafka_cgrp_will_leave(rkcg)) {
+               rkcg->rkcg_flags |= RD_KAFKA_CGRP_F_LEAVE_ON_UNASSIGN_DONE;
+               rd_kafka_cgrp_revoke_all_rejoin(rkcg, rd_false /*not lost*/,
+                                               rd_true /*initiating*/,
+                                               "unsubscribe");
        }
 
+       return RD_KAFKA_RESP_ERR_NO_ERROR;
+}
+
+/**
+ * Remove existing topic subscription.
+ */
+static rd_kafka_resp_err_t rd_kafka_cgrp_unsubscribe(rd_kafka_cgrp_t *rkcg,
+                                                     rd_bool_t leave_group) {
+       if (rkcg->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CONSUMER)
+               return rd_kafka_cgrp_consumer_unsubscribe(rkcg, leave_group);
+
+       rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "UNSUBSCRIBE",
+                    "Group \"%.*s\": unsubscribe from current %ssubscription "
+                    "of size %d (leave group=%s, has joined=%s, %s, "
+                    "join-state %s)",
+                    RD_KAFKAP_STR_PR(rkcg->rkcg_group_id),
+                    rkcg->rkcg_subscription ? "" : "unset ",
+                    rkcg->rkcg_subscription ? rkcg->rkcg_subscription->cnt : 0,
+                    RD_STR_ToF(leave_group),
+                    RD_STR_ToF(RD_KAFKA_CGRP_HAS_JOINED(rkcg)),
+                    rkcg->rkcg_member_id ? rkcg->rkcg_member_id->str : "n/a",
+                    rd_kafka_cgrp_join_state_names[rkcg->rkcg_join_state]);
+
+       rd_kafka_timer_stop(&rkcg->rkcg_rk->rk_timers,
+                           &rkcg->rkcg_max_poll_interval_tmr, 1 /*lock*/);
+
+       rd_kafka_cgrp_subscription_set(rkcg, NULL);
+
        rd_kafka_cgrp_update_subscribed_topics(rkcg, NULL);
 
        /*
@@ -4475,20 +5602,16 @@ static rd_kafka_resp_err_t rd_kafka_cgrp_unsubscribe(rd_kafka_cgrp_t *rkcg,
                                              rd_true /*initiating*/,
                                              "unsubscribe");
 
-       rkcg->rkcg_flags &= ~(RD_KAFKA_CGRP_F_SUBSCRIPTION |
-                             RD_KAFKA_CGRP_F_WILDCARD_SUBSCRIPTION);
-
        return RD_KAFKA_RESP_ERR_NO_ERROR;
 }
 
-
 /**
  * Set new atomic topic subscription.
*/ static rd_kafka_resp_err_t rd_kafka_cgrp_subscribe(rd_kafka_cgrp_t *rkcg, rd_kafka_topic_partition_list_t *rktparlist) { - + int32_t subscription_version; rd_kafka_dbg(rkcg->rkcg_rk, CGRP | RD_KAFKA_DBG_CONSUMER, "SUBSCRIBE", "Group \"%.*s\": subscribe to new %ssubscription " "of %d topics (join-state %s)", @@ -4549,14 +5672,9 @@ rd_kafka_cgrp_subscribe(rd_kafka_cgrp_t *rkcg, if (!rktparlist) return RD_KAFKA_RESP_ERR_NO_ERROR; - rkcg->rkcg_flags |= RD_KAFKA_CGRP_F_SUBSCRIPTION; + subscription_version = rd_kafka_cgrp_subscription_set(rkcg, rktparlist); - if (rd_kafka_topic_partition_list_regex_cnt(rktparlist) > 0) - rkcg->rkcg_flags |= RD_KAFKA_CGRP_F_WILDCARD_SUBSCRIPTION; - - rkcg->rkcg_subscription = rktparlist; - - rd_kafka_cgrp_join(rkcg); + rd_kafka_cgrp_join(rkcg, subscription_version); return RD_KAFKA_RESP_ERR_NO_ERROR; } @@ -4605,6 +5723,11 @@ void rd_kafka_cgrp_terminate0(rd_kafka_cgrp_t *rkcg, rd_kafka_op_t *rko) { /* Mark for stopping, the actual state transition * is performed when all toppars have left. */ rkcg->rkcg_flags |= RD_KAFKA_CGRP_F_TERMINATE; + if (rkcg->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CONSUMER) { + rkcg->rkcg_consumer_flags &= + ~RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN & + ~RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN_TO_COMPLETE; + } rkcg->rkcg_ts_terminate = rd_clock(); rkcg->rkcg_reply_rko = rko; @@ -4738,8 +5861,21 @@ static void rd_kafka_cgrp_handle_assign_op(rd_kafka_cgrp_t *rkcg, rko->rko_u.assign.partitions); rko->rko_u.assign.partitions = NULL; } + + if (rkcg->rkcg_rebalance_incr_assignment) { + rd_kafka_topic_partition_list_destroy( + rkcg->rkcg_rebalance_incr_assignment); + rkcg->rkcg_rebalance_incr_assignment = NULL; + } + rko->rko_u.assign.method = RD_KAFKA_ASSIGN_METHOD_ASSIGN; + if (rkcg->rkcg_join_state == + RD_KAFKA_CGRP_JOIN_STATE_WAIT_ASSIGN_CALL) { + rd_kafka_cgrp_set_join_state( + rkcg, RD_KAFKA_CGRP_JOIN_STATE_WAIT_UNASSIGN_CALL); + } + } else if (rd_kafka_cgrp_rebalance_protocol(rkcg) == RD_KAFKA_REBALANCE_PROTOCOL_COOPERATIVE && !(rko->rko_u.assign.method == @@ -4805,189 +5941,6 @@ static void rd_kafka_cgrp_handle_assign_op(rd_kafka_cgrp_t *rkcg, rd_kafka_op_error_reply(rko, error); } - -/** - * @brief Handle cgrp queue op. - * @locality rdkafka main thread - * @locks none - */ -static rd_kafka_op_res_t rd_kafka_cgrp_op_serve(rd_kafka_t *rk, - rd_kafka_q_t *rkq, - rd_kafka_op_t *rko, - rd_kafka_q_cb_type_t cb_type, - void *opaque) { - rd_kafka_cgrp_t *rkcg = opaque; - rd_kafka_toppar_t *rktp; - rd_kafka_resp_err_t err; - const int silent_op = rko->rko_type == RD_KAFKA_OP_RECV_BUF; - - rktp = rko->rko_rktp; - - if (rktp && !silent_op) - rd_kafka_dbg( - rkcg->rkcg_rk, CGRP, "CGRPOP", - "Group \"%.*s\" received op %s in state %s " - "(join-state %s) for %.*s [%" PRId32 "]", - RD_KAFKAP_STR_PR(rkcg->rkcg_group_id), - rd_kafka_op2str(rko->rko_type), - rd_kafka_cgrp_state_names[rkcg->rkcg_state], - rd_kafka_cgrp_join_state_names[rkcg->rkcg_join_state], - RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), - rktp->rktp_partition); - else if (!silent_op) - rd_kafka_dbg( - rkcg->rkcg_rk, CGRP, "CGRPOP", - "Group \"%.*s\" received op %s in state %s " - "(join-state %s)", - RD_KAFKAP_STR_PR(rkcg->rkcg_group_id), - rd_kafka_op2str(rko->rko_type), - rd_kafka_cgrp_state_names[rkcg->rkcg_state], - rd_kafka_cgrp_join_state_names[rkcg->rkcg_join_state]); - - switch ((int)rko->rko_type) { - case RD_KAFKA_OP_NAME: - /* Return the currently assigned member id. 
*/ - if (rkcg->rkcg_member_id) - rko->rko_u.name.str = - RD_KAFKAP_STR_DUP(rkcg->rkcg_member_id); - rd_kafka_op_reply(rko, 0); - rko = NULL; - break; - - case RD_KAFKA_OP_CG_METADATA: - /* Return the current consumer group metadata. */ - rko->rko_u.cg_metadata = - rkcg->rkcg_member_id - ? rd_kafka_consumer_group_metadata_new_with_genid( - rkcg->rkcg_rk->rk_conf.group_id_str, - rkcg->rkcg_generation_id, - rkcg->rkcg_member_id->str, - rkcg->rkcg_rk->rk_conf.group_instance_id) - : NULL; - rd_kafka_op_reply(rko, RD_KAFKA_RESP_ERR_NO_ERROR); - rko = NULL; - break; - - case RD_KAFKA_OP_OFFSET_FETCH: - if (rkcg->rkcg_state != RD_KAFKA_CGRP_STATE_UP || - (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_TERMINATE)) { - rd_kafka_op_handle_OffsetFetch( - rkcg->rkcg_rk, NULL, RD_KAFKA_RESP_ERR__WAIT_COORD, - NULL, NULL, rko); - rko = NULL; /* rko freed by handler */ - break; - } - - rd_kafka_OffsetFetchRequest( - rkcg->rkcg_coord, rk->rk_group_id->str, - rko->rko_u.offset_fetch.partitions, - rko->rko_u.offset_fetch.require_stable_offsets, - 0, /* Timeout */ - RD_KAFKA_REPLYQ(rkcg->rkcg_ops, 0), - rd_kafka_op_handle_OffsetFetch, rko); - rko = NULL; /* rko now owned by request */ - break; - - case RD_KAFKA_OP_PARTITION_JOIN: - rd_kafka_cgrp_partition_add(rkcg, rktp); - - /* If terminating tell the partition to leave */ - if (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_TERMINATE) - rd_kafka_toppar_op_fetch_stop(rktp, RD_KAFKA_NO_REPLYQ); - break; - - case RD_KAFKA_OP_PARTITION_LEAVE: - rd_kafka_cgrp_partition_del(rkcg, rktp); - break; - - case RD_KAFKA_OP_OFFSET_COMMIT: - /* Trigger offsets commit. */ - rd_kafka_cgrp_offsets_commit(rkcg, rko, - /* only set offsets - * if no partitions were - * specified. */ - rko->rko_u.offset_commit.partitions - ? 0 - : 1 /* set_offsets*/, - rko->rko_u.offset_commit.reason); - rko = NULL; /* rko now owned by request */ - break; - - case RD_KAFKA_OP_COORD_QUERY: - rd_kafka_cgrp_coord_query( - rkcg, - rko->rko_err ? rd_kafka_err2str(rko->rko_err) : "from op"); - break; - - case RD_KAFKA_OP_SUBSCRIBE: - rd_kafka_app_polled(rk); - - /* New atomic subscription (may be NULL) */ - err = - rd_kafka_cgrp_subscribe(rkcg, rko->rko_u.subscribe.topics); - - if (!err) /* now owned by rkcg */ - rko->rko_u.subscribe.topics = NULL; - - rd_kafka_op_reply(rko, err); - rko = NULL; - break; - - case RD_KAFKA_OP_ASSIGN: - rd_kafka_cgrp_handle_assign_op(rkcg, rko); - rko = NULL; - break; - - case RD_KAFKA_OP_GET_SUBSCRIPTION: - if (rkcg->rkcg_next_subscription) - rko->rko_u.subscribe.topics = - rd_kafka_topic_partition_list_copy( - rkcg->rkcg_next_subscription); - else if (rkcg->rkcg_next_unsubscribe) - rko->rko_u.subscribe.topics = NULL; - else if (rkcg->rkcg_subscription) - rko->rko_u.subscribe.topics = - rd_kafka_topic_partition_list_copy( - rkcg->rkcg_subscription); - rd_kafka_op_reply(rko, 0); - rko = NULL; - break; - - case RD_KAFKA_OP_GET_ASSIGNMENT: - /* This is the consumer assignment, not the group assignment. 
*/ - rko->rko_u.assign.partitions = - rd_kafka_topic_partition_list_copy( - rkcg->rkcg_rk->rk_consumer.assignment.all); - - rd_kafka_op_reply(rko, 0); - rko = NULL; - break; - - case RD_KAFKA_OP_GET_REBALANCE_PROTOCOL: - rko->rko_u.rebalance_protocol.str = - rd_kafka_rebalance_protocol2str( - rd_kafka_cgrp_rebalance_protocol(rkcg)); - rd_kafka_op_reply(rko, RD_KAFKA_RESP_ERR_NO_ERROR); - rko = NULL; - break; - - case RD_KAFKA_OP_TERMINATE: - rd_kafka_cgrp_terminate0(rkcg, rko); - rko = NULL; /* terminate0() takes ownership */ - break; - - default: - rd_kafka_assert(rkcg->rkcg_rk, !*"unknown type"); - break; - } - - if (rko) - rd_kafka_op_destroy(rko); - - return RD_KAFKA_OP_RES_HANDLED; -} - - /** * @returns true if the session timeout has expired (due to no successful * Heartbeats in session.timeout.ms) and triggers a rebalance. @@ -5084,7 +6037,8 @@ static void rd_kafka_cgrp_join_state_serve(rd_kafka_cgrp_t *rkcg) { if (rd_interval_immediate(&rkcg->rkcg_join_intvl, 1000 * 1000, now) > 0) - rd_kafka_cgrp_join(rkcg); + rd_kafka_cgrp_join( + rkcg, -1 /* current subscription version */); break; case RD_KAFKA_CGRP_JOIN_STATE_WAIT_JOIN: @@ -5108,6 +6062,431 @@ static void rd_kafka_cgrp_join_state_serve(rd_kafka_cgrp_t *rkcg) { break; } } + + +void rd_kafka_cgrp_consumer_group_heartbeat(rd_kafka_cgrp_t *rkcg, + rd_bool_t full_request, + rd_bool_t send_ack) { + + rd_kafkap_str_t *rkcg_group_instance_id = NULL; + rd_kafkap_str_t *rkcg_client_rack = NULL; + int max_poll_interval_ms = -1; + rd_kafka_topic_partition_list_t *rkcg_subscription_topics = NULL; + rd_kafkap_str_t *rkcg_subscription_regex = NULL; + rd_kafkap_str_t *rkcg_group_remote_assignor = NULL; + rd_kafka_topic_partition_list_t *rkcg_group_assignment = NULL; + int32_t member_epoch = rkcg->rkcg_generation_id; + if (member_epoch < 0) + member_epoch = 0; + + + rkcg->rkcg_flags &= ~RD_KAFKA_CGRP_F_MAX_POLL_EXCEEDED; + rkcg->rkcg_flags |= RD_KAFKA_CGRP_F_HEARTBEAT_IN_TRANSIT; + + if (full_request) { + rkcg_group_instance_id = rkcg->rkcg_group_instance_id; + rkcg_client_rack = rkcg->rkcg_client_rack; + max_poll_interval_ms = + rkcg->rkcg_rk->rk_conf.max_poll_interval_ms; + rkcg_subscription_topics = rkcg->rkcg_subscription_topics; + rkcg_subscription_regex = rkcg->rkcg_subscription_regex; + rkcg_group_remote_assignor = rkcg->rkcg_group_remote_assignor; + } + + if (send_ack) { + rkcg_group_assignment = rkcg->rkcg_target_assignment; + rkcg->rkcg_consumer_flags |= + RD_KAFKA_CGRP_CONSUMER_F_SENDING_ACK; + + if (rd_kafka_is_dbg(rkcg->rkcg_rk, CGRP)) { + char rkcg_group_assignment_str[512] = "NULL"; + + if (rkcg_group_assignment) { + rd_kafka_topic_partition_list_str( + rkcg_group_assignment, + rkcg_group_assignment_str, + sizeof(rkcg_group_assignment_str), 0); + } + + rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "HEARTBEAT", + "Acknowledging target assignment \"%s\"", + rkcg_group_assignment_str); + } + } else if (full_request) { + rkcg_group_assignment = rkcg->rkcg_current_assignment; + } + + if (rd_kafka_cgrp_consumer_subscription_preconditions_met(rkcg) || + rkcg->rkcg_consumer_flags & + RD_KAFKA_CGRP_CONSUMER_F_SENDING_NEW_SUBSCRIPTION) { + rkcg->rkcg_consumer_flags = + (rkcg->rkcg_consumer_flags & + ~RD_KAFKA_CGRP_CONSUMER_F_SEND_NEW_SUBSCRIPTION) | + RD_KAFKA_CGRP_CONSUMER_F_SENDING_NEW_SUBSCRIPTION; + rkcg_subscription_topics = rkcg->rkcg_subscription_topics; + rkcg_subscription_regex = rkcg->rkcg_subscription_regex; + + if (rd_kafka_is_dbg(rkcg->rkcg_rk, CGRP)) { + char rkcg_new_subscription_str[512] = "NULL"; + + if (rkcg->rkcg_subscription) { + 
                                rd_kafka_topic_partition_list_str(
+                                   rkcg->rkcg_subscription,
+                                   rkcg_new_subscription_str,
+                                   sizeof(rkcg_new_subscription_str), 0);
+                       }
+
+                       rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "HEARTBEAT",
+                                    "Sending new subscription \"%s\"",
+                                    rkcg_new_subscription_str);
+               }
+       }
+
+       rkcg->rkcg_expedite_heartbeat_retries++;
+       rd_kafka_ConsumerGroupHeartbeatRequest(
+           rkcg->rkcg_coord, rkcg->rkcg_group_id, rkcg->rkcg_member_id,
+           member_epoch, rkcg_group_instance_id, rkcg_client_rack,
+           max_poll_interval_ms, rkcg_subscription_topics,
+           rkcg_subscription_regex, rkcg_group_remote_assignor,
+           rkcg_group_assignment, RD_KAFKA_REPLYQ(rkcg->rkcg_ops, 0),
+           rd_kafka_cgrp_handle_ConsumerGroupHeartbeat, NULL);
+}
+
+static rd_bool_t
+rd_kafka_cgrp_consumer_heartbeat_preconditions_met(rd_kafka_cgrp_t *rkcg) {
+       rd_dassert(
+           !(rkcg->rkcg_join_state == RD_KAFKA_CGRP_JOIN_STATE_INIT &&
+             rkcg->rkcg_flags & RD_KAFKA_CGRP_F_LEAVE_ON_UNASSIGN_DONE));
+
+       if (!(rkcg->rkcg_flags & RD_KAFKA_CGRP_F_SUBSCRIPTION))
+               return rd_false;
+
+       if (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_HEARTBEAT_IN_TRANSIT)
+               return rd_false;
+
+       if (rkcg->rkcg_consumer_flags &
+           RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN_TO_COMPLETE)
+               return rd_false;
+
+       if (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_MAX_POLL_EXCEEDED &&
+           rd_kafka_max_poll_exceeded(rkcg->rkcg_rk))
+               return rd_false;
+
+       if (rd_kafka_cgrp_will_leave(rkcg))
+               return rd_false;
+
+       return rd_true;
+}
+
+void rd_kafka_cgrp_consumer_serve(rd_kafka_cgrp_t *rkcg) {
+       rd_bool_t full_request = rkcg->rkcg_consumer_flags &
+                                RD_KAFKA_CGRP_CONSUMER_F_SEND_FULL_REQUEST;
+       rd_bool_t send_ack = rd_false;
+
+       if (unlikely(rd_kafka_fatal_error_code(rkcg->rkcg_rk)))
+               return;
+
+       if (unlikely(rkcg->rkcg_consumer_flags &
+                    RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN)) {
+               if (RD_KAFKA_CGRP_REBALANCING(rkcg))
+                       return;
+               rkcg->rkcg_consumer_flags &=
+                   ~RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN;
+               rkcg->rkcg_consumer_flags |=
+                   RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN_TO_COMPLETE;
+
+               rd_kafka_dbg(
+                   rkcg->rkcg_rk, CGRP, "HEARTBEAT",
+                   "Revoking assignment as lost and rejoining in join state "
+                   "%s",
+                   rd_kafka_cgrp_join_state_names[rkcg->rkcg_join_state]);
+
+               rd_kafka_cgrp_revoke_all_rejoin(rkcg, rd_true, rd_true,
+                                               "member fenced - rejoining");
+       }
+
+       switch (rkcg->rkcg_join_state) {
+       case RD_KAFKA_CGRP_JOIN_STATE_INIT:
+               rkcg->rkcg_consumer_flags &=
+                   ~RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN_TO_COMPLETE;
+               rd_kafka_cgrp_consumer_apply_next_subscribe(rkcg);
+               full_request = rd_true;
+               break;
+       case RD_KAFKA_CGRP_JOIN_STATE_STEADY:
+               if (rkcg->rkcg_consumer_flags &
+                   RD_KAFKA_CGRP_CONSUMER_F_WAIT_ACK) {
+                       send_ack = rd_true;
+               }
+               break;
+       case RD_KAFKA_CGRP_JOIN_STATE_WAIT_UNASSIGN_CALL:
+       case RD_KAFKA_CGRP_JOIN_STATE_WAIT_ASSIGN_CALL:
+       case RD_KAFKA_CGRP_JOIN_STATE_WAIT_INCR_UNASSIGN_TO_COMPLETE:
+       case RD_KAFKA_CGRP_JOIN_STATE_WAIT_UNASSIGN_TO_COMPLETE:
+               break;
+       default:
+               rd_assert(!*"unexpected state");
+       }
+
+       if (rd_kafka_cgrp_consumer_heartbeat_preconditions_met(rkcg)) {
+               rd_ts_t next_heartbeat =
+                   rd_interval(&rkcg->rkcg_heartbeat_intvl,
+                               rkcg->rkcg_heartbeat_intvl_ms * 1000, 0);
+               if (next_heartbeat > 0) {
+                       rd_kafka_cgrp_consumer_group_heartbeat(
+                           rkcg, full_request, send_ack);
+                       next_heartbeat = rkcg->rkcg_heartbeat_intvl_ms * 1000;
+               } else {
+                       next_heartbeat = -1 * next_heartbeat;
+               }
+               if (likely(rkcg->rkcg_heartbeat_intvl_ms > 0)) {
+                       if (rkcg->rkcg_serve_timer.rtmr_next >
+                           (rd_clock() + next_heartbeat)) {
+                               /* We stop the timer if it expires later
+                                * than expected and restart it below.
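+                                * E.g. the coordinator may have returned a
+                                * shorter heartbeat interval than the one
+                                * this timer was originally scheduled with.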
*/ + rd_kafka_timer_stop(&rkcg->rkcg_rk->rk_timers, + &rkcg->rkcg_serve_timer, 0); + } + + /* Scheduling a timer yields the main loop so + * 'restart' has to be set to false to avoid a tight + * loop. */ + rd_kafka_timer_start_oneshot( + &rkcg->rkcg_rk->rk_timers, &rkcg->rkcg_serve_timer, + rd_false /*don't restart*/, next_heartbeat, + rd_kafka_cgrp_serve_timer_cb, NULL); + } + } +} + +/** + * Set new atomic topic subscription (KIP-848). + * + * @locality rdkafka main thread + * @locks none + */ +static rd_kafka_resp_err_t +rd_kafka_cgrp_consumer_subscribe(rd_kafka_cgrp_t *rkcg, + rd_kafka_topic_partition_list_t *rktparlist) { + + rd_kafka_dbg(rkcg->rkcg_rk, CGRP | RD_KAFKA_DBG_CONSUMER, "SUBSCRIBE", + "Group \"%.*s\": subscribe to new %ssubscription " + "of %d topics (join-state %s)", + RD_KAFKAP_STR_PR(rkcg->rkcg_group_id), + rktparlist ? "" : "unset ", + rktparlist ? rktparlist->cnt : 0, + rd_kafka_cgrp_join_state_names[rkcg->rkcg_join_state]); + + /* If the consumer has raised a fatal error treat all subscribes as + unsubscribe */ + if (rd_kafka_fatal_error_code(rkcg->rkcg_rk)) { + if (rkcg->rkcg_subscription) + rd_kafka_cgrp_unsubscribe(rkcg, + rd_true /*leave group*/); + return RD_KAFKA_RESP_ERR__FATAL; + } + + if (rktparlist) { + if (rkcg->rkcg_next_subscription) + rd_kafka_topic_partition_list_destroy( + rkcg->rkcg_next_subscription); + rkcg->rkcg_next_subscription = rktparlist; + + /* If member is leaving, new subscription + * will be applied after the leave + * ConsumerGroupHeartbeat */ + if (!rd_kafka_cgrp_will_leave(rkcg)) + rd_kafka_cgrp_consumer_apply_next_subscribe(rkcg); + } else { + rd_kafka_cgrp_consumer_unsubscribe(rkcg, + rd_true /*leave group*/); + } + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} + +/** + * @brief Call when all incremental unassign operations are done to transition + * to the next state. + */ +static void rd_kafka_cgrp_consumer_incr_unassign_done(rd_kafka_cgrp_t *rkcg) { + + /* If this action was underway when a terminate was initiated, it will + * be left to complete. Now that's done, unassign all partitions */ + if (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_TERMINATE) { + rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "UNASSIGN", + "Group \"%s\" is terminating, initiating full " + "unassign", + rkcg->rkcg_group_id->str); + rd_kafka_cgrp_unassign(rkcg); + + /* Leave group, if desired. */ + rd_kafka_cgrp_leave_maybe(rkcg); + return; + } + + if (rkcg->rkcg_rebalance_incr_assignment) { + /* This incremental unassign was part of a normal rebalance + * (in which the revoke set was not empty). Immediately + * trigger the assign that follows this revoke. The protocol + * dictates this should occur even if the new assignment + * set is empty. + * + * Also, since this rebalance had some revoked partitions, + * a re-join should occur following the assign. + */ + + rd_kafka_rebalance_op_incr( + rkcg, RD_KAFKA_RESP_ERR__ASSIGN_PARTITIONS, + rkcg->rkcg_rebalance_incr_assignment, + rd_false /* don't rejoin following assign*/, + "cooperative assign after revoke"); + + rd_kafka_topic_partition_list_destroy( + rkcg->rkcg_rebalance_incr_assignment); + rkcg->rkcg_rebalance_incr_assignment = NULL; + + /* Note: rkcg_rebalance_rejoin is actioned / reset in + * rd_kafka_cgrp_incremental_assign call */ + + } else if (rkcg->rkcg_rebalance_rejoin) { + rkcg->rkcg_rebalance_rejoin = rd_false; + + /* There are some cases (lost partitions), where a rejoin + * should occur immediately following the unassign (this + * is not the case under normal conditions), in which case + * the rejoin flag will be set. 
+                */
+
+               rd_kafka_cgrp_rejoin(rkcg, "Incremental unassignment done");
+
+       } else {
+               /* After this incremental unassignment we're now back in
+                * a steady state. */
+               rd_kafka_cgrp_set_join_state(rkcg,
+                                            RD_KAFKA_CGRP_JOIN_STATE_STEADY);
+               if (rkcg->rkcg_subscription) {
+                       rd_kafka_cgrp_start_max_poll_interval_timer(rkcg);
+               }
+       }
+}
+
+/**
+ * @brief KIP 848: Called from assignment code when all in progress
+ *        assignment/unassignment operations are done, allowing the cgrp to
+ *        transition to other states if needed.
+ *
+ * @param rkcg Consumer group.
+ *
+ * @remark This may be called spontaneously without any need for a state
+ *         change in the rkcg.
+ *
+ * @locality rdkafka main thread
+ * @locks none
+ */
+static void rd_kafka_cgrp_consumer_assignment_done(rd_kafka_cgrp_t *rkcg) {
+       rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "ASSIGNDONE",
+                    "Group \"%s\": "
+                    "assignment operations done in join-state %s "
+                    "(rebalance rejoin=%s)",
+                    rkcg->rkcg_group_id->str,
+                    rd_kafka_cgrp_join_state_names[rkcg->rkcg_join_state],
+                    RD_STR_ToF(rkcg->rkcg_rebalance_rejoin));
+
+       switch (rkcg->rkcg_join_state) {
+       case RD_KAFKA_CGRP_JOIN_STATE_WAIT_UNASSIGN_TO_COMPLETE:
+               rd_kafka_cgrp_unassign_done(rkcg);
+               break;
+
+       case RD_KAFKA_CGRP_JOIN_STATE_WAIT_INCR_UNASSIGN_TO_COMPLETE:
+               rd_kafka_cgrp_consumer_incr_unassign_done(rkcg);
+               break;
+
+       case RD_KAFKA_CGRP_JOIN_STATE_INIT:
+       case RD_KAFKA_CGRP_JOIN_STATE_STEADY: {
+
+               rd_bool_t not_in_group = rd_false;
+               /*
+                * There may be a case where no partitions are assigned to
+                * this consumer. While terminating, such a consumer can be
+                * in STEADY or INIT state and won't pass through an
+                * intermediate state, so the last leave call is done from
+                * here.
+                */
+               not_in_group |= rd_kafka_cgrp_leave_maybe(rkcg);
+
+               /* Check if cgrp is trying to terminate, which is safe to do
+                * in these two states. Otherwise we'll need to wait for
+                * the current state to decommission. */
+               not_in_group |= rd_kafka_cgrp_try_terminate(rkcg);
+
+               if (not_in_group)
+                       break;
+
+               if (rkcg->rkcg_join_state == RD_KAFKA_CGRP_JOIN_STATE_INIT) {
+                       rd_kafka_cgrp_consumer_expedite_next_heartbeat(
+                           rkcg, "Assignment Done: in init state");
+               } else if (rkcg->rkcg_rebalance_rejoin) {
+                       /* No need to expedite the HB here as it's being
+                        * expedited in the rejoin call. */
+                       rkcg->rkcg_rebalance_rejoin = rd_false;
+                       rd_kafka_cgrp_rejoin(
+                           rkcg,
+                           "Assignment Done: rejoining group to redistribute "
+                           "previously owned partitions to other "
+                           "group members");
+               } else if (rkcg->rkcg_consumer_flags &
+                          RD_KAFKA_CGRP_CONSUMER_F_WAIT_ACK) {
+                       rd_kafka_cgrp_consumer_expedite_next_heartbeat(
+                           rkcg,
+                           "Assignment Done: in steady state, waiting for "
+                           "ack");
+               }
+               break;
+       }
+
+       default:
+               break;
+       }
+}
+
+void rd_kafka_cgrp_consumer_expedite_next_heartbeat(rd_kafka_cgrp_t *rkcg,
+                                                    const char *reason) {
+       if (rkcg->rkcg_group_protocol != RD_KAFKA_GROUP_PROTOCOL_CONSUMER)
+               return;
+
+       rd_kafka_t *rk = rkcg->rkcg_rk;
+       /* Calculate the exponential backoff. */
+       int64_t backoff = 0;
+       if (rkcg->rkcg_expedite_heartbeat_retries)
+               backoff = 1 << (rkcg->rkcg_expedite_heartbeat_retries - 1);
+
+       /* Convert to microseconds while applying the jitter percentage:
+        * (backoff_ms * percent * 1000) / 100 simplifies to
+        * backoff_ms * percent * 10, hence the multiplication by 10.
+        * E.g. three prior retries give a base of 1 << 2 = 4 ms; a jitter
+        * draw of 100 then yields 4 * 100 * 10 = 4000 us (4 ms). */
+       backoff = rd_jitter(100 - RD_KAFKA_RETRY_JITTER_PERCENT,
+                           100 + RD_KAFKA_RETRY_JITTER_PERCENT) *
+                 backoff * 10;
+
+       /* Backoff is limited by retry_backoff_max_ms.
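+        * E.g. with retry.backoff.max.ms=1000 any computed backoff above
+        * 1000000 us is capped at one second.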
+        */
+       if (backoff > rk->rk_conf.retry_backoff_max_ms * 1000)
+               backoff = rk->rk_conf.retry_backoff_max_ms * 1000;
+
+       /* Reset the interval as if it happened `rkcg_heartbeat_intvl_ms`
+        * milliseconds ago. */
+       rd_interval_reset_to_now(&rkcg->rkcg_heartbeat_intvl,
+                                rd_clock() -
+                                    rkcg->rkcg_heartbeat_intvl_ms * 1000);
+       /* Set the exponential backoff. */
+       rd_interval_backoff(&rkcg->rkcg_heartbeat_intvl, backoff);
+
+       rd_kafka_dbg(rkcg->rkcg_rk, CGRP, "HEARTBEAT",
+                    "Expediting next heartbeat"
+                    ", with backoff %" PRId64 ": %s",
+                    backoff, reason);
+
+       /* Scheduling the timer wakes the main loop too. */
+       rd_kafka_timer_start_oneshot(&rkcg->rkcg_rk->rk_timers,
+                                    &rkcg->rkcg_serve_timer, rd_true, backoff,
+                                    rd_kafka_cgrp_serve_timer_cb, NULL);
+}
+
 /**
  * Client group handling.
  * Called from main thread to serve the operational aspects of a cgrp.
@@ -5201,9 +6580,15 @@ retry:
                        rd_kafka_cgrp_set_state(rkcg, RD_KAFKA_CGRP_STATE_UP);
 
                        /* Serve join state to trigger (re)join */
-                       rd_kafka_cgrp_join_state_serve(rkcg);
+                       if (rkcg->rkcg_group_protocol ==
+                           RD_KAFKA_GROUP_PROTOCOL_CONSUMER) {
+                               rd_kafka_cgrp_consumer_serve(rkcg);
+                       } else {
+                               rd_kafka_cgrp_join_state_serve(rkcg);
+                       }
 
-                       /* Serve any pending partitions in the assignment */
+                       /* Serve any pending partitions in the
+                        * assignment */
                        rd_kafka_assignment_serve(rkcg->rkcg_rk);
                }
                break;
@@ -5221,7 +6606,13 @@ retry:
                rd_kafka_cgrp_coord_query(rkcg, "intervaled in state up");
 
-               rd_kafka_cgrp_join_state_serve(rkcg);
+               if (rkcg->rkcg_group_protocol ==
+                   RD_KAFKA_GROUP_PROTOCOL_CONSUMER) {
+                       rd_kafka_cgrp_consumer_serve(rkcg);
+               } else {
+                       rd_kafka_cgrp_join_state_serve(rkcg);
+               }
+
                break;
        }
 
@@ -5255,8 +6646,201 @@ void rd_kafka_cgrp_op(rd_kafka_cgrp_t *rkcg,
        rd_kafka_q_enq(rkcg->rkcg_ops, rko);
 }
 
+/**
+ * @brief Handle cgrp queue op.
+ * @locality rdkafka main thread
+ * @locks none
+ */
+static rd_kafka_op_res_t rd_kafka_cgrp_op_serve(rd_kafka_t *rk,
+                                                rd_kafka_q_t *rkq,
+                                                rd_kafka_op_t *rko,
+                                                rd_kafka_q_cb_type_t cb_type,
+                                                void *opaque) {
+       rd_kafka_cgrp_t *rkcg = opaque;
+       rd_kafka_toppar_t *rktp;
+       rd_kafka_resp_err_t err;
+       const int silent_op = rko->rko_type == RD_KAFKA_OP_RECV_BUF;
+       if (unlikely(rd_atomic32_get(&rkcg->rkcg_terminated) == rd_true)) {
+               if (rko)
+                       rd_kafka_op_destroy(rko);
+               return RD_KAFKA_OP_RES_HANDLED;
+       }
+       rktp = rko->rko_rktp;
+
+       if (rktp && !silent_op)
+               rd_kafka_dbg(
+                   rkcg->rkcg_rk, CGRP, "CGRPOP",
+                   "Group \"%.*s\" received op %s in state %s "
+                   "(join-state %s) for %.*s [%" PRId32 "]",
+                   RD_KAFKAP_STR_PR(rkcg->rkcg_group_id),
+                   rd_kafka_op2str(rko->rko_type),
+                   rd_kafka_cgrp_state_names[rkcg->rkcg_state],
+                   rd_kafka_cgrp_join_state_names[rkcg->rkcg_join_state],
+                   RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic),
+                   rktp->rktp_partition);
+       else if (!silent_op)
+               rd_kafka_dbg(
+                   rkcg->rkcg_rk, CGRP, "CGRPOP",
+                   "Group \"%.*s\" received op %s in state %s "
+                   "(join-state %s)",
+                   RD_KAFKAP_STR_PR(rkcg->rkcg_group_id),
+                   rd_kafka_op2str(rko->rko_type),
+                   rd_kafka_cgrp_state_names[rkcg->rkcg_state],
+                   rd_kafka_cgrp_join_state_names[rkcg->rkcg_join_state]);
+
+       switch ((int)rko->rko_type) {
+       case RD_KAFKA_OP_NAME:
+               /* Return the currently assigned member id. */
+               if (rkcg->rkcg_member_id)
+                       rko->rko_u.name.str =
+                           RD_KAFKAP_STR_DUP(rkcg->rkcg_member_id);
+               rd_kafka_op_reply(rko, 0);
+               rko = NULL;
+               break;
+
+       case RD_KAFKA_OP_CG_METADATA:
+               /* Return the current consumer group metadata. */
+               rko->rko_u.cg_metadata =
+                   rkcg->rkcg_member_id
+                       ?
rd_kafka_consumer_group_metadata_new_with_genid(
+                             rkcg->rkcg_rk->rk_conf.group_id_str,
+                             rkcg->rkcg_generation_id,
+                             rkcg->rkcg_member_id->str,
+                             rkcg->rkcg_rk->rk_conf.group_instance_id)
+                       : NULL;
+               rd_kafka_op_reply(rko, RD_KAFKA_RESP_ERR_NO_ERROR);
+               rko = NULL;
+               break;
+
+       case RD_KAFKA_OP_OFFSET_FETCH:
+               if (rkcg->rkcg_state != RD_KAFKA_CGRP_STATE_UP ||
+                   (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_TERMINATE)) {
+                       rd_kafka_op_handle_OffsetFetch(
+                           rkcg->rkcg_rk, NULL, RD_KAFKA_RESP_ERR__WAIT_COORD,
+                           NULL, NULL, rko);
+                       rko = NULL; /* rko freed by handler */
+                       break;
+               }
+
+               rd_kafka_OffsetFetchRequest(
+                   rkcg->rkcg_coord, rk->rk_group_id->str,
+                   rko->rko_u.offset_fetch.partitions, rd_false, -1, NULL,
+                   rko->rko_u.offset_fetch.require_stable_offsets,
+                   0, /* Timeout */
+                   RD_KAFKA_REPLYQ(rkcg->rkcg_ops, 0),
+                   rd_kafka_op_handle_OffsetFetch, rko);
+               rko = NULL; /* rko now owned by request */
+               break;
+
+       case RD_KAFKA_OP_PARTITION_JOIN:
+               rd_kafka_cgrp_partition_add(rkcg, rktp);
+
+               /* If terminating tell the partition to leave */
+               if (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_TERMINATE)
+                       rd_kafka_toppar_op_fetch_stop(rktp, RD_KAFKA_NO_REPLYQ);
+               break;
+
+       case RD_KAFKA_OP_PARTITION_LEAVE:
+               rd_kafka_cgrp_partition_del(rkcg, rktp);
+               break;
+
+       case RD_KAFKA_OP_OFFSET_COMMIT:
+               /* Trigger offsets commit. */
+               rd_kafka_cgrp_offsets_commit(rkcg, rko,
+                                            /* only set offsets
+                                             * if no partitions were
+                                             * specified. */
+                                            rko->rko_u.offset_commit.partitions
+                                                ? 0
+                                                : 1 /* set_offsets*/,
+                                            rko->rko_u.offset_commit.reason);
+               rko = NULL; /* rko now owned by request */
+               break;
+
+       case RD_KAFKA_OP_COORD_QUERY:
+               rd_kafka_cgrp_coord_query(
+                   rkcg,
+                   rko->rko_err ? rd_kafka_err2str(rko->rko_err) : "from op");
+               break;
+
+       case RD_KAFKA_OP_SUBSCRIBE:
+               /* We just want to avoid reaching the max poll interval,
+                * without anything else being done on poll. */
+               rd_atomic64_set(&rk->rk_ts_last_poll, rd_clock());
+
+               /* New atomic subscription (may be NULL) */
+               if (rkcg->rkcg_group_protocol ==
+                   RD_KAFKA_GROUP_PROTOCOL_CONSUMER) {
+                       err = rd_kafka_cgrp_consumer_subscribe(
+                           rkcg, rko->rko_u.subscribe.topics);
+               } else {
+                       err = rd_kafka_cgrp_subscribe(
+                           rkcg, rko->rko_u.subscribe.topics);
+               }
+
+               if (!err) /* now owned by rkcg */
+                       rko->rko_u.subscribe.topics = NULL;
+
+               rd_kafka_op_reply(rko, err);
+               rko = NULL;
+               break;
+
+       case RD_KAFKA_OP_ASSIGN:
+               rd_kafka_cgrp_handle_assign_op(rkcg, rko);
+               rko = NULL;
+               break;
+
+       case RD_KAFKA_OP_GET_SUBSCRIPTION:
+               if (rkcg->rkcg_next_subscription)
+                       rko->rko_u.subscribe.topics =
+                           rd_kafka_topic_partition_list_copy(
+                               rkcg->rkcg_next_subscription);
+               else if (rkcg->rkcg_next_unsubscribe)
+                       rko->rko_u.subscribe.topics = NULL;
+               else if (rkcg->rkcg_subscription)
+                       rko->rko_u.subscribe.topics =
+                           rd_kafka_topic_partition_list_copy(
+                               rkcg->rkcg_subscription);
+               rd_kafka_op_reply(rko, 0);
+               rko = NULL;
+               break;
+
+       case RD_KAFKA_OP_GET_ASSIGNMENT:
+               /* This is the consumer assignment, not the group assignment.
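+                * (i.e. rk_consumer.assignment.all rather than the
+                * group-level rkcg_group_assignment).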
*/ + rko->rko_u.assign.partitions = + rd_kafka_topic_partition_list_copy( + rkcg->rkcg_rk->rk_consumer.assignment.all); + + rd_kafka_op_reply(rko, 0); + rko = NULL; + break; + + case RD_KAFKA_OP_GET_REBALANCE_PROTOCOL: + rko->rko_u.rebalance_protocol.str = + rd_kafka_rebalance_protocol2str( + rd_kafka_cgrp_rebalance_protocol(rkcg)); + rd_kafka_op_reply(rko, RD_KAFKA_RESP_ERR_NO_ERROR); + rko = NULL; + break; + + case RD_KAFKA_OP_TERMINATE: + rd_kafka_cgrp_terminate0(rkcg, rko); + rko = NULL; /* terminate0() takes ownership */ + break; + + default: + rd_kafka_assert(rkcg->rkcg_rk, !*"unknown type"); + break; + } + + if (rko) + rd_kafka_op_destroy(rko); + + return RD_KAFKA_OP_RES_HANDLED; +} + void rd_kafka_cgrp_set_member_id(rd_kafka_cgrp_t *rkcg, const char *member_id) { if (rkcg->rkcg_member_id && member_id && !rd_kafkap_str_cmp_str(rkcg->rkcg_member_id, member_id)) @@ -5300,9 +6884,7 @@ rd_kafka_cgrp_owned_but_not_exist_partitions(rd_kafka_cgrp_t *rkcg) { result = rd_kafka_topic_partition_list_new( rkcg->rkcg_group_assignment->cnt); - rd_kafka_topic_partition_list_add0( - __FUNCTION__, __LINE__, result, curr->topic, - curr->partition, curr->_private); + rd_kafka_topic_partition_list_add_copy(result, curr); } return result; @@ -5326,6 +6908,9 @@ void rd_kafka_cgrp_metadata_update_check(rd_kafka_cgrp_t *rkcg, rd_kafka_assert(NULL, thrd_is_current(rkcg->rkcg_rk->rk_thread)); + if (rkcg->rkcg_group_protocol != RD_KAFKA_GROUP_PROTOCOL_CLASSIC) + return; + if (!rkcg->rkcg_subscription || rkcg->rkcg_subscription->cnt == 0) return; @@ -5338,7 +6923,7 @@ void rd_kafka_cgrp_metadata_update_check(rd_kafka_cgrp_t *rkcg, * Create a list of the topics in metadata that matches our subscription */ tinfos = rd_list_new(rkcg->rkcg_subscription->cnt, - (void *)rd_kafka_topic_info_destroy); + rd_kafka_topic_info_destroy_free); if (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_WILDCARD_SUBSCRIPTION) rd_kafka_metadata_topic_match(rkcg->rkcg_rk, tinfos, @@ -5396,7 +6981,8 @@ void rd_kafka_cgrp_metadata_update_check(rd_kafka_cgrp_t *rkcg, owned_but_not_exist, rkcg->rkcg_group_leader.members != NULL /* Rejoin group following revoke's - * unassign if we are leader */ + * unassign if we are leader and consumer + * group protocol is GENERIC */ , "topics not available"); rd_kafka_topic_partition_list_destroy( @@ -5476,6 +7062,27 @@ rd_kafka_consumer_group_metadata(rd_kafka_t *rk) { return cgmetadata; } +const char *rd_kafka_consumer_group_metadata_group_id( + const rd_kafka_consumer_group_metadata_t *group_metadata) { + return group_metadata->group_id; +} + +const char *rd_kafka_consumer_group_metadata_member_id( + const rd_kafka_consumer_group_metadata_t *group_metadata) { + return group_metadata->member_id; +} + +const char *rd_kafka_consumer_group_metadata_group_instance_id( + const rd_kafka_consumer_group_metadata_t *group_metadata) { + return group_metadata->group_instance_id; +} + +int32_t rd_kafka_consumer_group_metadata_generation_id( + const rd_kafka_consumer_group_metadata_t *group_metadata) { + return group_metadata->generation_id; +} + + void rd_kafka_consumer_group_metadata_destroy( rd_kafka_consumer_group_metadata_t *cgmetadata) { rd_free(cgmetadata->group_id); @@ -5892,6 +7499,75 @@ static int unittest_list_to_map(void) { RD_UT_PASS(); } +int unittest_member_metadata_serdes(void) { + rd_list_t *topics = rd_list_new(0, rd_kafka_topic_info_destroy_free); + rd_kafka_topic_partition_list_t *owned_partitions = + rd_kafka_topic_partition_list_new(0); + rd_kafkap_str_t *rack_id = rd_kafkap_str_new("myrack", -1); + 
const void *userdata = NULL; + const int32_t userdata_size = 0; + const int generation = 3; + const char topic_name[] = "mytopic"; + rd_kafka_group_member_t *rkgm; + int version; + + rd_list_add(topics, rd_kafka_topic_info_new(topic_name, 3)); + rd_kafka_topic_partition_list_add(owned_partitions, topic_name, 0); + rkgm = rd_calloc(1, sizeof(*rkgm)); + + /* Note that the version variable doesn't actually change the Version + * field in the serialized message. It only runs the tests with/without + * additional fields added in that particular version. */ + for (version = 0; version <= 3; version++) { + rd_kafkap_bytes_t *member_metadata; + + /* Serialize. */ + member_metadata = + rd_kafka_consumer_protocol_member_metadata_new( + topics, userdata, userdata_size, + version >= 1 ? owned_partitions : NULL, + version >= 2 ? generation : -1, + version >= 3 ? rack_id : NULL); + + /* Deserialize. */ + rd_kafka_group_MemberMetadata_consumer_read(NULL, rkgm, + member_metadata); + + /* Compare results. */ + RD_UT_ASSERT(rkgm->rkgm_subscription->cnt == + rd_list_cnt(topics), + "subscription size should be correct"); + RD_UT_ASSERT(!strcmp(topic_name, + rkgm->rkgm_subscription->elems[0].topic), + "subscriptions should be correct"); + RD_UT_ASSERT(rkgm->rkgm_userdata->len == userdata_size, + "userdata should have the size 0"); + if (version >= 1) + RD_UT_ASSERT(!rd_kafka_topic_partition_list_cmp( + rkgm->rkgm_owned, owned_partitions, + rd_kafka_topic_partition_cmp), + "owned partitions should be same"); + if (version >= 2) + RD_UT_ASSERT(generation == rkgm->rkgm_generation, + "generation should be same"); + if (version >= 3) + RD_UT_ASSERT( + !rd_kafkap_str_cmp(rack_id, rkgm->rkgm_rack_id), + "rack id should be same"); + + rd_kafka_group_member_clear(rkgm); + rd_kafkap_bytes_destroy(member_metadata); + } + + /* Clean up. */ + rd_list_destroy(topics); + rd_kafka_topic_partition_list_destroy(owned_partitions); + rd_kafkap_str_destroy(rack_id); + rd_free(rkgm); + + RD_UT_PASS(); +} + /** * @brief Consumer group unit tests @@ -5904,6 +7580,7 @@ int unittest_cgrp(void) { fails += unittest_set_subtract(); fails += unittest_map_to_list(); fails += unittest_list_to_map(); + fails += unittest_member_metadata_serdes(); return fails; } diff --git a/src/third_party/librdkafka/dist/src/rdkafka_cgrp.h b/src/third_party/librdkafka/dist/src/rdkafka_cgrp.h index 4fa51e54897..79a734f5fb2 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_cgrp.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_cgrp.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -56,6 +57,7 @@ typedef struct rd_kafka_cgrp_s { rd_kafkap_str_t *rkcg_member_id; /* Last assigned MemberId */ rd_kafkap_str_t *rkcg_group_instance_id; const rd_kafkap_str_t *rkcg_client_id; + rd_kafkap_str_t *rkcg_client_rack; enum { /* Init state */ @@ -163,6 +165,10 @@ typedef struct rd_kafka_cgrp_s { rd_interval_t rkcg_coord_query_intvl; /* Coordinator query intvl*/ rd_interval_t rkcg_heartbeat_intvl; /* Heartbeat intvl */ + rd_kafka_timer_t rkcg_serve_timer; /* Timer for next serve. 
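+                                          * Scheduled both for regular
+                                          * heartbeats and for expedited
+                                          * heartbeats via
+                                          * rd_kafka_cgrp_serve_timer_cb().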
                                          */
+       int rkcg_heartbeat_intvl_ms;           /* KIP 848: received
+                                               * heartbeat interval in
+                                               * milliseconds */
 
        rd_interval_t rkcg_join_intvl;         /* JoinGroup interval */
        rd_interval_t rkcg_timeout_scan_intvl; /* Timeout scanner */
@@ -179,7 +185,8 @@ typedef struct rd_kafka_cgrp_s {
 
        rd_list_t rkcg_toppars; /* Toppars subscribed to*/
 
-       int32_t rkcg_generation_id; /* Current generation id */
+       int32_t rkcg_generation_id; /* Current generation id (classic)
+                                    * or member epoch (consumer). */
 
        rd_kafka_assignor_t *rkcg_assignor; /**< The current partition
                                             *   assignor. used by both
@@ -190,6 +197,12 @@ typedef struct rd_kafka_cgrp_s {
        int32_t rkcg_coord_id; /**< Current coordinator id,
                                *   or -1 if not known. */
 
+       rd_kafka_group_protocol_t
+           rkcg_group_protocol; /**< Group protocol to use */
+
+       rd_kafkap_str_t *rkcg_group_remote_assignor; /**< Group remote
+                                                     *   assignor to use */
+
        rd_kafka_broker_t *rkcg_curr_coord; /**< Current coordinator
                                             *   broker handle, or NULL.
                                             *   rkcg_coord's nodename is
@@ -217,9 +230,33 @@ typedef struct rd_kafka_cgrp_s {
        rd_kafka_topic_partition_list_t *rkcg_errored_topics;
        /** If a SUBSCRIBE op is received during a COOPERATIVE rebalance,
         *  actioning this will be postponed until after the rebalance
-        *  completes. The waiting subscription is stored here.
-        *  Mutually exclusive with rkcg_next_subscription. */
+        *  completes. The waiting subscription is stored here. */
        rd_kafka_topic_partition_list_t *rkcg_next_subscription;
+
+       /**
+        * Subscription regex pattern. All the provided regex patterns are
+        * stored as a single string with each pattern separated by '|'.
+        *
+        * Only applicable for the consumer protocol introduced in KIP-848.
+        *
+        * rkcg_subscription = rkcg_subscription_topics +
+        *                     rkcg_subscription_regex
+        */
+       rd_kafkap_str_t *rkcg_subscription_regex;
+
+       /**
+        * Full topic names extracted out from the rkcg_subscription.
+        *
+        * Only applicable for the consumer protocol introduced in KIP-848.
+        *
+        * For the consumer protocol, this field doesn't include regex
+        * subscriptions. For those, please refer to
+        * `rkcg_subscription_regex`.
+        *
+        * rkcg_subscription = rkcg_subscription_topics +
+        *                     rkcg_subscription_regex
+        */
+       rd_kafka_topic_partition_list_t *rkcg_subscription_topics;
+
        /** If a (un)SUBSCRIBE op is received during a COOPERATIVE rebalance,
         *  actioning this will be postponed until after the rebalance
         *  completes. This flag is used to signal a waiting unsubscribe
@@ -255,10 +292,52 @@ typedef struct rd_kafka_cgrp_s {
         *  currently in-progress incremental unassign. */
        rd_kafka_topic_partition_list_t *rkcg_rebalance_incr_assignment;
 
+       /** Current acked assignment, starts with an empty list. */
+       rd_kafka_topic_partition_list_t *rkcg_current_assignment;
+
+       /** Assignment that is currently being reconciled.
+        *  Can be NULL in case there's no reconciliation ongoing. */
+       rd_kafka_topic_partition_list_t *rkcg_target_assignment;
+
+       /** Next assignment that will be reconciled once current
+        *  reconciliation finishes. Can be NULL. */
+       rd_kafka_topic_partition_list_t *rkcg_next_target_assignment;
+
+       /** Number of backoff retries when expediting next heartbeat. */
+       int rkcg_expedite_heartbeat_retries;
+
+       /** Flags for KIP-848 state machine. */
+       int rkcg_consumer_flags;
+/** Coordinator is waiting for an acknowledgement of currently reconciled
+ *  target assignment. Cleared when an HB succeeds
+ *  after reconciliation finishes.
*/ +#define RD_KAFKA_CGRP_CONSUMER_F_WAIT_ACK 0x1 +/** Member is sending an acknowledgement for a reconciled assignment */ +#define RD_KAFKA_CGRP_CONSUMER_F_SENDING_ACK 0x2 +/** A new subscription needs to be sent to the Coordinator. */ +#define RD_KAFKA_CGRP_CONSUMER_F_SEND_NEW_SUBSCRIPTION 0x4 +/** A new subscription is being sent to the Coordinator. */ +#define RD_KAFKA_CGRP_CONSUMER_F_SENDING_NEW_SUBSCRIPTION 0x8 +/** Consumer has subscribed at least once, + * if it didn't happen rebalance protocol is still + * considered NONE, otherwise it depends on the + * configured partition assignors. */ +#define RD_KAFKA_CGRP_CONSUMER_F_SUBSCRIBED_ONCE 0x10 +/** Send a complete request in next heartbeat */ +#define RD_KAFKA_CGRP_CONSUMER_F_SEND_FULL_REQUEST 0x20 +/** Member is fenced, need to rejoin */ +#define RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN 0x40 +/** Member is fenced, rejoining */ +#define RD_KAFKA_CGRP_CONSUMER_F_WAIT_REJOIN_TO_COMPLETE 0x80 +/** Serve pending assignments after heartbeat */ +#define RD_KAFKA_CGRP_CONSUMER_F_SERVE_PENDING 0x100 + /** Rejoin the group following a currently in-progress * incremental unassign. */ rd_bool_t rkcg_rebalance_rejoin; + rd_ts_t rkcg_ts_last_err; /* Timestamp of last error + * propagated to application */ rd_kafka_resp_err_t rkcg_last_err; /* Last error propagated to * application. * This is for silencing @@ -280,6 +359,8 @@ typedef struct rd_kafka_cgrp_s { rd_atomic32_t rkcg_terminated; /**< Consumer has been closed */ + rd_atomic32_t rkcg_subscription_version; /**< Subscription version */ + /* Protected by rd_kafka_*lock() */ struct { rd_ts_t ts_rebalance; /* Timestamp of @@ -293,6 +374,9 @@ typedef struct rd_kafka_cgrp_s { * assignment */ } rkcg_c; + /* Timestamp of last rebalance start */ + rd_ts_t rkcg_ts_rebalance_start; + } rd_kafka_cgrp_t; @@ -313,6 +397,7 @@ extern const char *rd_kafka_cgrp_join_state_names[]; void rd_kafka_cgrp_destroy_final(rd_kafka_cgrp_t *rkcg); rd_kafka_cgrp_t *rd_kafka_cgrp_new(rd_kafka_t *rk, + rd_kafka_group_protocol_t group_protocol, const rd_kafkap_str_t *group_id, const rd_kafkap_str_t *client_id); void rd_kafka_cgrp_serve(rd_kafka_cgrp_t *rkcg); @@ -346,6 +431,12 @@ void rd_kafka_cgrp_metadata_update_check(rd_kafka_cgrp_t *rkcg, rd_bool_t do_join); #define rd_kafka_cgrp_get(rk) ((rk)->rk_cgrp) +#define rd_kafka_cgrp_same_subscription_version(rk_cgrp, \ + cgrp_subscription_version) \ + ((rk_cgrp) && \ + (cgrp_subscription_version == -1 || \ + rd_atomic32_get(&(rk_cgrp)->rkcg_subscription_version) == \ + cgrp_subscription_version)) void rd_kafka_cgrp_assigned_offsets_commit( rd_kafka_cgrp_t *rkcg, @@ -380,4 +471,7 @@ rd_kafka_rebalance_protocol2str(rd_kafka_rebalance_protocol_t protocol) { } } +void rd_kafka_cgrp_consumer_expedite_next_heartbeat(rd_kafka_cgrp_t *rkcg, + const char *reason); + #endif /* _RDKAFKA_CGRP_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_conf.c b/src/third_party/librdkafka/dist/src/rdkafka_conf.c index 5933b8cd9b8..d04f5872f2d 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_conf.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_conf.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2022 Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023 Confluent Inc. * All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without
@@ -55,24 +56,29 @@
 #include
 #endif
 
+#ifdef WITH_OAUTHBEARER_OIDC
+#include
+#endif
+
 struct rd_kafka_property {
        rd_kafka_conf_scope_t scope;
        const char *name;
-       enum { _RK_C_STR,
-              _RK_C_INT,
-              _RK_C_DBL, /* Double */
-              _RK_C_S2I, /* String to Integer mapping.
-                          * Supports limited canonical str->int mappings
-                          * using s2i[] */
-              _RK_C_S2F, /* CSV String to Integer flag mapping (OR:ed) */
-              _RK_C_BOOL,
-              _RK_C_PTR, /* Only settable through special set functions */
-              _RK_C_PATLIST, /* Pattern list */
-              _RK_C_KSTR, /* Kafka string */
-              _RK_C_ALIAS, /* Alias: points to other property through .sdef */
-              _RK_C_INTERNAL, /* Internal, don't expose to application */
-              _RK_C_INVALID, /* Invalid property, used to catch known
-                              * but unsupported Java properties. */
+       enum {
+               _RK_C_STR,
+               _RK_C_INT,
+               _RK_C_DBL, /* Double */
+               _RK_C_S2I, /* String to Integer mapping.
+                           * Supports limited canonical str->int mappings
+                           * using s2i[] */
+               _RK_C_S2F, /* CSV String to Integer flag mapping (OR:ed) */
+               _RK_C_BOOL,
+               _RK_C_PTR,      /* Only settable through special set functions */
+               _RK_C_PATLIST,  /* Pattern list */
+               _RK_C_KSTR,     /* Kafka string */
+               _RK_C_ALIAS,    /* Alias: points to other property through .sdef */
+               _RK_C_INTERNAL, /* Internal, don't expose to application */
+               _RK_C_INVALID,  /* Invalid property, used to catch known
+                                * but unsupported Java properties. */
        } type;
        int offset;
        const char *desc;
@@ -89,7 +95,7 @@ struct rd_kafka_property {
                const char *str;
                const char *unsupported; /**< Reason for value not being
                                          *   supported in this build. */
-       } s2i[20]; /* _RK_C_S2I and _RK_C_S2F */
 
+       } s2i[21]; /* _RK_C_S2I and _RK_C_S2F */
 
        const char *unsupported; /**< Reason for property not being supported
                                  *   in this build.
@@ -197,6 +203,15 @@ struct rd_kafka_property {
        "available at build time"
 #endif
 
+#if WITH_OAUTHBEARER_OIDC
+#define _UNSUPPORTED_HTTPS .unsupported = NULL
+#else
+#define _UNSUPPORTED_HTTPS                                                     \
+       .unsupported =                                                         \
+           "HTTPS calls depend on libcurl and OpenSSL which were not "        \
+           "available at build time"
+#endif
+
 #ifdef _WIN32
 #define _UNSUPPORTED_WIN32_GSSAPI                                              \
        .unsupported =                                                         \
@@ -436,6 +451,34 @@ static const struct rd_kafka_property rd_kafka_properties[] = {
     1, 1000000, 1000000},
    {_RK_GLOBAL, "max.in.flight", _RK_C_ALIAS,
     .sdef = "max.in.flight.requests.per.connection"},
+   {_RK_GLOBAL, "metadata.recovery.strategy", _RK_C_S2I,
+    _RK(metadata_recovery_strategy),
+    "Controls how the client recovers when none of the brokers known to it "
+    "is available. If set to `none`, the client doesn't re-bootstrap. "
+    "If set to `rebootstrap`, the client repeats the bootstrap process "
+    "using `bootstrap.servers` and brokers added through "
+    "`rd_kafka_brokers_add()`. Rebootstrapping is useful when a client "
+    "communicates with brokers so infrequently that the set of brokers "
+    "may change entirely before the client refreshes metadata. "
" + "Metadata recovery is triggered when all last-known brokers appear " + "unavailable simultaneously or the client cannot refresh metadata within " + "`metadata.recovery.rebootstrap.trigger.ms` or it's requested in a " + "metadata response.", + .vdef = RD_KAFKA_METADATA_RECOVERY_STRATEGY_REBOOTSTRAP, + .s2i = {{RD_KAFKA_METADATA_RECOVERY_STRATEGY_NONE, "none"}, + {RD_KAFKA_METADATA_RECOVERY_STRATEGY_REBOOTSTRAP, "rebootstrap"}, + {0, NULL}}}, + {_RK_GLOBAL, "metadata.recovery.rebootstrap.trigger.ms", _RK_C_INT, + _RK(metadata_recovery_rebootstrap_trigger_ms), + "If a client configured to rebootstrap using " + "`metadata.recovery.strategy=rebootstrap` " + "is unable to obtain metadata from any " + "of the brokers for this interval, " + "client repeats the bootstrap process using " + "`bootstrap.servers` configuration " + "and brokers added through " + "`rd_kafka_brokers_add()`.", + 0, INT_MAX, 300000}, {_RK_GLOBAL | _RK_DEPRECATED | _RK_HIDDEN, "metadata.request.timeout.ms", _RK_C_INT, _RK(metadata_request_timeout_ms), "Not used.", 10, 900 * 1000, 10}, @@ -457,10 +500,12 @@ static const struct rd_kafka_property rd_kafka_properties[] = { {_RK_GLOBAL, "topic.metadata.refresh.fast.interval.ms", _RK_C_INT, _RK(metadata_refresh_fast_interval_ms), "When a topic loses its leader a new metadata request will be " - "enqueued with this initial interval, exponentially increasing " + "enqueued immediately and then with this initial interval, exponentially " + "increasing upto `retry.backoff.max.ms`, " "until the topic metadata has been refreshed. " + "If not set explicitly, it will be defaulted to `retry.backoff.ms`. " "This is used to recover quickly from transitioning leader brokers.", - 1, 60 * 1000, 250}, + 1, 60 * 1000, 100}, {_RK_GLOBAL | _RK_DEPRECATED, "topic.metadata.refresh.fast.cnt", _RK_C_INT, _RK(metadata_refresh_fast_cnt), "No longer used.", 0, 1000, 10}, {_RK_GLOBAL, "topic.metadata.refresh.sparse", _RK_C_BOOL, @@ -508,6 +553,7 @@ static const struct rd_kafka_property rd_kafka_properties[] = { {RD_KAFKA_DBG_MOCK, "mock"}, {RD_KAFKA_DBG_ASSIGNOR, "assignor"}, {RD_KAFKA_DBG_CONF, "conf"}, + {RD_KAFKA_DBG_TELEMETRY, "telemetry"}, {RD_KAFKA_DBG_ALL, "all"}}}, {_RK_GLOBAL, "socket.timeout.ms", _RK_C_INT, _RK(socket_timeout_ms), "Default timeout for network requests. " @@ -536,7 +582,7 @@ static const struct rd_kafka_property rd_kafka_properties[] = { #endif }, {_RK_GLOBAL, "socket.nagle.disable", _RK_C_BOOL, _RK(socket_nagle_disable), - "Disable the Nagle algorithm (TCP_NODELAY) on broker sockets.", 0, 1, 0 + "Disable the Nagle algorithm (TCP_NODELAY) on broker sockets.", 0, 1, 1 #ifndef TCP_NODELAY , .unsupported = "TCP_NODELAY not available at build time" @@ -698,8 +744,10 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "The application should mask this signal as an internal " "signal handler is installed.", 0, 128, 0}, - {_RK_GLOBAL | _RK_HIGH, "api.version.request", _RK_C_BOOL, + {_RK_GLOBAL | _RK_HIGH | _RK_DEPRECATED, "api.version.request", _RK_C_BOOL, _RK(api_version_request), + "**Post-deprecation actions: remove this configuration property, " + "brokers < 0.10.0 won't be supported anymore in librdkafka 3.x.** " "Request broker's supported API versions to adjust functionality to " "available protocol features. 
If set to false, or the " "ApiVersionRequest fails, the fallback version " @@ -711,16 +759,20 @@ static const struct rd_kafka_property rd_kafka_properties[] = { {_RK_GLOBAL, "api.version.request.timeout.ms", _RK_C_INT, _RK(api_version_request_timeout_ms), "Timeout for broker API version requests.", 1, 5 * 60 * 1000, 10 * 1000}, - {_RK_GLOBAL | _RK_MED, "api.version.fallback.ms", _RK_C_INT, - _RK(api_version_fallback_ms), + {_RK_GLOBAL | _RK_MED | _RK_DEPRECATED, "api.version.fallback.ms", + _RK_C_INT, _RK(api_version_fallback_ms), + "**Post-deprecation actions: remove this configuration property, " + "brokers < 0.10.0 won't be supported anymore in librdkafka 3.x.** " "Dictates how long the `broker.version.fallback` fallback is used " "in the case the ApiVersionRequest fails. " "**NOTE**: The ApiVersionRequest is only issued when a new connection " "to the broker is made (such as after an upgrade).", 0, 86400 * 7 * 1000, 0}, - {_RK_GLOBAL | _RK_MED, "broker.version.fallback", _RK_C_STR, - _RK(broker_version_fallback), + {_RK_GLOBAL | _RK_MED | _RK_DEPRECATED, "broker.version.fallback", + _RK_C_STR, _RK(broker_version_fallback), + "**Post-deprecation actions: remove this configuration property, " + "brokers < 0.10.0 won't be supported anymore in librdkafka 3.x.** " "Older broker versions (before 0.10.0) provide no way for a client to " "query " "for supported protocol features " @@ -754,10 +806,10 @@ static const struct rd_kafka_property rd_kafka_properties[] = { _RK(security_protocol), "Protocol used to communicate with brokers.", .vdef = RD_KAFKA_PROTO_PLAINTEXT, .s2i = {{RD_KAFKA_PROTO_PLAINTEXT, "plaintext"}, - {RD_KAFKA_PROTO_SSL, "ssl", _UNSUPPORTED_SSL}, - {RD_KAFKA_PROTO_SASL_PLAINTEXT, "sasl_plaintext"}, - {RD_KAFKA_PROTO_SASL_SSL, "sasl_ssl", _UNSUPPORTED_SSL}, - {0, NULL}}}, + {RD_KAFKA_PROTO_SSL, "ssl", _UNSUPPORTED_SSL}, + {RD_KAFKA_PROTO_SASL_PLAINTEXT, "sasl_plaintext"}, + {RD_KAFKA_PROTO_SASL_SSL, "sasl_ssl", _UNSUPPORTED_SSL}, + {0, NULL}}}, {_RK_GLOBAL, "ssl.cipher.suites", _RK_C_STR, _RK(ssl.cipher_suites), "A cipher suite is a named combination of authentication, " @@ -821,6 +873,29 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "If OpenSSL is dynamically linked the OpenSSL library's default " "path will be used (see `OPENSSLDIR` in `openssl version -a`).", _UNSUPPORTED_SSL}, + {_RK_GLOBAL, "https.ca.location", _RK_C_STR, _RK(https.ca_location), + "File or directory path to CA certificate(s) for verifying " + "HTTPS endpoints, like `sasl.oauthbearer.token.endpoint.url` used for " + "OAUTHBEARER/OIDC authentication. " + "Mutually exclusive with `https.ca.pem`. " + "Defaults: " + "On Windows the system's CA certificates are automatically looked " + "up in the Windows Root certificate store. " + "On Mac OSX this configuration defaults to `probe`. " + "It is recommended to install openssl using Homebrew, " + "to provide CA certificates. " + "On Linux install the distribution's ca-certificates package. " + "If OpenSSL is statically linked or `https.ca.location` is set to " + "`probe` a list of standard paths will be probed and the first one " + "found will be used as the default CA certificate location path. " + "If OpenSSL is dynamically linked the OpenSSL library's default " + "path will be used (see `OPENSSLDIR` in `openssl version -a`).", + _UNSUPPORTED_HTTPS}, + {_RK_GLOBAL, "https.ca.pem", _RK_C_STR, _RK(https.ca_pem), + "CA certificate string (PEM format) for verifying HTTPS endpoints. " + "Mutually exclusive with `https.ca.location`. 
" + "Optional: see `https.ca.location`.", + _UNSUPPORTED_HTTPS}, {_RK_GLOBAL | _RK_SENSITIVE, "ssl.ca.pem", _RK_C_STR, _RK(ssl.ca_pem), "CA certificate string (PEM format) for verifying the broker's key.", _UNSUPPORTED_SSL}, @@ -885,7 +960,7 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "OpenSSL >= 1.0.2 required.", .vdef = RD_KAFKA_SSL_ENDPOINT_ID_HTTPS, .s2i = {{RD_KAFKA_SSL_ENDPOINT_ID_NONE, "none"}, - {RD_KAFKA_SSL_ENDPOINT_ID_HTTPS, "https"}}, + {RD_KAFKA_SSL_ENDPOINT_ID_HTTPS, "https"}}, _UNSUPPORTED_OPENSSL_1_0_2}, {_RK_GLOBAL, "ssl.certificate.verify_cb", _RK_C_PTR, _RK(ssl.cert_verify_cb), @@ -897,11 +972,13 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "Java TrustStores are not supported, use `ssl.ca.location` " "and a certificate file instead. " "See " - "https://github.com/edenhill/librdkafka/wiki/Using-SSL-with-librdkafka " + "https://github.com/confluentinc/librdkafka/" + "wiki/Using-SSL-with-librdkafka " "for more information."}, {_RK_GLOBAL, "sasl.jaas.config", _RK_C_INVALID, _RK(dummy), "Java JAAS configuration is not supported, see " - "https://github.com/edenhill/librdkafka/wiki/Using-SASL-with-librdkafka " + "https://github.com/confluentinc/librdkafka/" + "wiki/Using-SASL-with-librdkafka " "for more information."}, {_RK_GLOBAL | _RK_HIGH, "sasl.mechanisms", _RK_C_STR, _RK(sasl.mechanisms), @@ -1003,7 +1080,7 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "and `sasl.oauthbearer.token.endpoint.url`.", .vdef = RD_KAFKA_SASL_OAUTHBEARER_METHOD_DEFAULT, .s2i = {{RD_KAFKA_SASL_OAUTHBEARER_METHOD_DEFAULT, "default"}, - {RD_KAFKA_SASL_OAUTHBEARER_METHOD_OIDC, "oidc"}}, + {RD_KAFKA_SASL_OAUTHBEARER_METHOD_OIDC, "oidc"}}, _UNSUPPORTED_OIDC}, {_RK_GLOBAL, "sasl.oauthbearer.client.id", _RK_C_STR, _RK(sasl.oauthbearer.client_id), @@ -1012,7 +1089,11 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "authorization server handles. " "Only used when `sasl.oauthbearer.method` is set to \"oidc\".", _UNSUPPORTED_OIDC}, - {_RK_GLOBAL, "sasl.oauthbearer.client.secret", _RK_C_STR, + {_RK_GLOBAL, "sasl.oauthbearer.client.credentials.client.id", _RK_C_ALIAS, + .sdef = "sasl.oauthbearer.client.id"}, + {_RK_GLOBAL, "sasl.oauthbearer.client.credentials.client.secret", + _RK_C_ALIAS, .sdef = "sasl.oauthbearer.client.secret"}, + {_RK_GLOBAL | _RK_SENSITIVE, "sasl.oauthbearer.client.secret", _RK_C_STR, _RK(sasl.oauthbearer.client_secret), "Client secret only known to the application and the " "authorization server. This should be a sufficiently random string " @@ -1037,6 +1118,94 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "OAuth/OIDC issuer token endpoint HTTP(S) URI used to retrieve token. 
" "Only used when `sasl.oauthbearer.method` is set to \"oidc\".", _UNSUPPORTED_OIDC}, + { + _RK_GLOBAL, + "sasl.oauthbearer.grant.type", + _RK_C_S2I, + _RK(sasl.oauthbearer.grant_type), + "OAuth grant type to use when communicating with the identity " + "provider.", + _UNSUPPORTED_OIDC, + .vdef = RD_KAFKA_SASL_OAUTHBEARER_GRANT_TYPE_CLIENT_CREDENTIALS, + .s2i = {{RD_KAFKA_SASL_OAUTHBEARER_GRANT_TYPE_CLIENT_CREDENTIALS, + "client_credentials"}, + {RD_KAFKA_SASL_OAUTHBEARER_GRANT_TYPE_JWT_BEARER, + "urn:ietf:params:oauth:grant-type:jwt-bearer"}}, + }, + {_RK_GLOBAL, "sasl.oauthbearer.assertion.algorithm", _RK_C_S2I, + _RK(sasl.oauthbearer.assertion.algorithm), + "Algorithm the client should use to sign the assertion sent " + "to the identity provider and in the OAuth alg header in the JWT " + "assertion.", + _UNSUPPORTED_OIDC, + .vdef = RD_KAFKA_SASL_OAUTHBEARER_ASSERTION_ALGORITHM_RS256, + .s2i = {{RD_KAFKA_SASL_OAUTHBEARER_ASSERTION_ALGORITHM_RS256, "RS256"}, + {RD_KAFKA_SASL_OAUTHBEARER_ASSERTION_ALGORITHM_ES256, "ES256"}}}, + {_RK_GLOBAL | _RK_SENSITIVE, "sasl.oauthbearer.assertion.private.key.file", + _RK_C_STR, _RK(sasl.oauthbearer.assertion.private_key.file), + "Path to client's private key (PEM) used for authentication " + "when using the JWT assertion.", + _UNSUPPORTED_OIDC}, + {_RK_GLOBAL | _RK_SENSITIVE, + "sasl.oauthbearer.assertion.private.key.passphrase", _RK_C_STR, + _RK(sasl.oauthbearer.assertion.private_key.passphrase), + "Private key passphrase for `sasl.oauthbearer.assertion.private.key.file`" + " or `sasl.oauthbearer.assertion.private.key.pem`.", + _UNSUPPORTED_OIDC}, + {_RK_GLOBAL | _RK_SENSITIVE, "sasl.oauthbearer.assertion.private.key.pem", + _RK_C_STR, _RK(sasl.oauthbearer.assertion.private_key.pem), + "Client's private key (PEM) used for authentication " + "when using the JWT assertion.", + _UNSUPPORTED_OIDC}, + {_RK_GLOBAL, "sasl.oauthbearer.assertion.file", _RK_C_STR, + _RK(sasl.oauthbearer.assertion.file), + "Path to the assertion file. " + "Only used when `sasl.oauthbearer.method` is set to \"oidc\" and JWT " + "assertion is needed.", + _UNSUPPORTED_OIDC}, + {_RK_GLOBAL, "sasl.oauthbearer.assertion.claim.aud", _RK_C_STR, + _RK(sasl.oauthbearer.assertion.claim.audience), + "JWT audience claim. " + "Only used when `sasl.oauthbearer.method` is set to \"oidc\" and JWT " + "assertion is needed.", + _UNSUPPORTED_OIDC}, + {_RK_GLOBAL, "sasl.oauthbearer.assertion.claim.exp.seconds", _RK_C_INT, + _RK(sasl.oauthbearer.assertion.claim.expiration_s), + "Assertion expiration time in seconds. " + "Only used when `sasl.oauthbearer.method` is set to \"oidc\" and JWT " + "assertion is needed.", + 1, INT_MAX, 300, _UNSUPPORTED_OIDC}, + {_RK_GLOBAL, "sasl.oauthbearer.assertion.claim.iss", _RK_C_STR, + _RK(sasl.oauthbearer.assertion.claim.issuer), + "JWT issuer claim. " + "Only used when `sasl.oauthbearer.method` is set to \"oidc\" and JWT " + "assertion is needed.", + _UNSUPPORTED_OIDC}, + {_RK_GLOBAL, "sasl.oauthbearer.assertion.claim.jti.include", _RK_C_BOOL, + _RK(sasl.oauthbearer.assertion.claim.jti_include), + "JWT ID claim. When set to `true`, a random UUID is generated. " + "Only used when `sasl.oauthbearer.method` is set to \"oidc\" and JWT " + "assertion is needed.", + 0, 1, 0, _UNSUPPORTED_OIDC}, + {_RK_GLOBAL, "sasl.oauthbearer.assertion.claim.nbf.seconds", _RK_C_INT, + _RK(sasl.oauthbearer.assertion.claim.not_before_s), + "Assertion not before time in seconds. 
" + "Only used when `sasl.oauthbearer.method` is set to \"oidc\" and JWT " + "assertion is needed.", + 0, INT_MAX, 60, _UNSUPPORTED_OIDC}, + {_RK_GLOBAL, "sasl.oauthbearer.assertion.claim.sub", _RK_C_STR, + _RK(sasl.oauthbearer.assertion.claim.subject), + "JWT subject claim. " + "Only used when `sasl.oauthbearer.method` is set to \"oidc\" and JWT " + "assertion is needed.", + _UNSUPPORTED_OIDC}, + {_RK_GLOBAL, "sasl.oauthbearer.assertion.jwt.template.file", _RK_C_STR, + _RK(sasl.oauthbearer.assertion.jwt_template_file), + "Path to the JWT template file. " + "Only used when `sasl.oauthbearer.method` is set to \"oidc\" and JWT " + "assertion is needed.", + _UNSUPPORTED_OIDC}, + /* Plugins */ {_RK_GLOBAL, "plugin.library.paths", _RK_C_STR, _RK(plugin_paths), @@ -1104,9 +1273,10 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "members of the group to assign partitions to group members. If " "there is more than one eligible strategy, preference is " "determined by the order of this list (strategies earlier in the " - "list have higher priority). " - "Cooperative and non-cooperative (eager) strategies must not be " - "mixed. " + "list have higher priority). Cooperative and non-cooperative (eager)" + "strategies must not be mixed. `partition.assignment.strategy` is not " + "supported for " + "`group.protocol=consumer`. Use `group.remote.assignor` instead. " "Available strategies: range, roundrobin, cooperative-sticky.", .sdef = "range,roundrobin"}, {_RK_GLOBAL | _RK_CGRP | _RK_HIGH, "session.timeout.ms", _RK_C_INT, @@ -1116,20 +1286,52 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "to indicate its liveness to the broker. If no hearts are " "received by the broker for a group member within the " "session timeout, the broker will remove the consumer from " - "the group and trigger a rebalance. " - "The allowed range is configured with the **broker** configuration " + "the group and trigger a rebalance. The " + "allowed range is configured with the **broker** configuration " "properties `group.min.session.timeout.ms` and " - "`group.max.session.timeout.ms`. " + "`group.max.session.timeout.ms`. `session.timeout.ms` is not supported " + "for `group.protocol=consumer`. It is set with the broker configuration " + "property " + "`group.consumer.session.timeout.ms` by default or can be configured " + "through the AdminClient IncrementalAlterConfigs API. " + "The allowed range is configured with the broker configuration " + "properties `group.consumer.min.session.timeout.ms` and " + "`group.consumer.max.session.timeout.ms`. " "Also see `max.poll.interval.ms`.", 1, 3600 * 1000, 45 * 1000}, {_RK_GLOBAL | _RK_CGRP, "heartbeat.interval.ms", _RK_C_INT, _RK(group_heartbeat_intvl_ms), - "Group session keepalive heartbeat interval.", 1, 3600 * 1000, 3 * 1000}, + "Group session keepalive heartbeat interval. " + "`heartbeat.interval.ms` is not supported for `group.protocol=consumer`. " + "It is set with the broker configuration property " + "`group.consumer.heartbeat.interval.ms` by default or can be configured " + "through the AdminClient IncrementalAlterConfigs API. The allowed range " + "is configured with the broker configuration properties " + "`group.consumer.min.heartbeat.interval.ms` and " + "`group.consumer.max.heartbeat.interval.ms`.", + 1, 3600 * 1000, 3 * 1000}, {_RK_GLOBAL | _RK_CGRP, "group.protocol.type", _RK_C_KSTR, _RK(group_protocol_type), - "Group protocol type. 
NOTE: Currently, the only supported group " - "protocol type is `consumer`.", + "Group protocol type for the `classic` group protocol. NOTE: Currently, " + "the only supported group protocol type is `consumer`. " + "`group.protocol.type` is not supported for `group.protocol=consumer`.", .sdef = "consumer"}, + {_RK_GLOBAL | _RK_CGRP | _RK_HIGH, "group.protocol", _RK_C_S2I, + _RK(group_protocol), + "Group protocol to use. Use `classic` for the original protocol and " + "`consumer` for the new " + "protocol introduced in KIP-848. Available protocols: classic or " + "consumer. Default is `classic`, " + "but will change to `consumer` in future releases.", + .vdef = RD_KAFKA_GROUP_PROTOCOL_CLASSIC, + .s2i = {{RD_KAFKA_GROUP_PROTOCOL_CLASSIC, "classic"}, + {RD_KAFKA_GROUP_PROTOCOL_CONSUMER, "consumer"}}}, + {_RK_GLOBAL | _RK_CGRP | _RK_MED, "group.remote.assignor", _RK_C_STR, + _RK(group_remote_assignor), + "Server-side assignor to use. Keep it null to make the server select a " + "suitable assignor for the group. " + "Available assignors: uniform or range. Default is null.", + .sdef = NULL}, {_RK_GLOBAL | _RK_CGRP, "coordinator.query.interval.ms", _RK_C_INT, _RK(coord_query_intvl_ms), "How often to query for the current client group coordinator. " @@ -1197,6 +1399,16 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "Maximum time the broker may wait to fill the Fetch response " "with fetch.min.bytes of messages.", 0, 300 * 1000, 500}, + {_RK_GLOBAL | _RK_CONSUMER | _RK_MED, "fetch.queue.backoff.ms", _RK_C_INT, + _RK(fetch_queue_backoff_ms), + "How long to postpone the next fetch request for a " + "topic+partition in case the current fetch queue thresholds " + "(queued.min.messages or queued.max.messages.kbytes) have " + "been exceeded. " + "This property may need to be decreased if the queue thresholds are " + "set low and the application is experiencing long (~1s) delays " + "between messages. Low values may increase CPU utilization.", + 0, 300 * 1000, 1000}, {_RK_GLOBAL | _RK_CONSUMER | _RK_MED, "fetch.message.max.bytes", _RK_C_INT, _RK(fetch_msg_max_bytes), "Initial maximum number of bytes per topic+partition to request when " @@ -1239,8 +1451,8 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "(requires Apache Kafka 0.8.2 or later on the broker).", .vdef = RD_KAFKA_OFFSET_METHOD_BROKER, .s2i = {{RD_KAFKA_OFFSET_METHOD_NONE, "none"}, - {RD_KAFKA_OFFSET_METHOD_FILE, "file"}, - {RD_KAFKA_OFFSET_METHOD_BROKER, "broker"}}}, + {RD_KAFKA_OFFSET_METHOD_FILE, "file"}, + {RD_KAFKA_OFFSET_METHOD_BROKER, "broker"}}}, {_RK_GLOBAL | _RK_CONSUMER | _RK_HIGH, "isolation.level", _RK_C_S2I, _RK(isolation_level), "Controls how to read messages written transactionally: " @@ -1249,7 +1461,7 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "transactional messages which have been aborted.", .vdef = RD_KAFKA_READ_COMMITTED, .s2i = {{RD_KAFKA_READ_UNCOMMITTED, "read_uncommitted"}, - {RD_KAFKA_READ_COMMITTED, "read_committed"}}}, + {RD_KAFKA_READ_COMMITTED, "read_committed"}}}, {_RK_GLOBAL | _RK_CONSUMER, "consume_cb", _RK_C_PTR, _RK(consume_cb), "Message consume callback (set with rd_kafka_conf_set_consume_cb())"}, {_RK_GLOBAL | _RK_CONSUMER, "rebalance_cb", _RK_C_PTR, _RK(rebalance_cb), @@ -1332,7 +1544,8 @@ static const struct rd_kafka_property rd_kafka_properties[] = { {_RK_GLOBAL | _RK_PRODUCER | _RK_HIGH, "queue.buffering.max.messages", _RK_C_INT, _RK(queue_buffering_max_msgs), "Maximum number of messages allowed on the producer queue. 
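With `group.protocol` and `group.remote.assignor` introduced above, opting a consumer into the KIP-848 protocol is a two-property change. A sketch with placeholder broker and group names; remember that `session.timeout.ms`, `heartbeat.interval.ms`, `group.protocol.type` and `partition.assignment.strategy` must then be left unmodified:

    #include <librdkafka/rdkafka.h>
    #include <stdio.h>

    /* Sketch: KIP-848 consumer group protocol; names are placeholders. */
    static rd_kafka_t *new_kip848_consumer(void) {
            char errstr[512];
            rd_kafka_conf_t *conf = rd_kafka_conf_new();
            rd_kafka_t *rk;

            rd_kafka_conf_set(conf, "bootstrap.servers", "broker:9092",
                              errstr, sizeof(errstr));
            rd_kafka_conf_set(conf, "group.id", "example-group",
                              errstr, sizeof(errstr));
            rd_kafka_conf_set(conf, "group.protocol", "consumer",
                              errstr, sizeof(errstr));
            /* Optional: pick the broker-side assignor explicitly. */
            rd_kafka_conf_set(conf, "group.remote.assignor", "uniform",
                              errstr, sizeof(errstr));

            rk = rd_kafka_new(RD_KAFKA_CONSUMER, conf, errstr, sizeof(errstr));
            if (!rk) {
                    /* On failure the conf is still owned by the caller. */
                    fprintf(stderr, "rd_kafka_new: %s\n", errstr);
                    rd_kafka_conf_destroy(conf);
            }
            return rk;
    }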
" - "This queue is shared by all topics and partitions. A value of 0 disables " + "This queue is shared by all topics and partitions. A value of 0 " + "disables " "this limit.", 0, INT_MAX, 100000}, {_RK_GLOBAL | _RK_PRODUCER | _RK_HIGH, "queue.buffering.max.kbytes", @@ -1360,10 +1573,21 @@ static const struct rd_kafka_property rd_kafka_properties[] = { 0, INT32_MAX, INT32_MAX}, {_RK_GLOBAL | _RK_PRODUCER, "retries", _RK_C_ALIAS, .sdef = "message.send.max.retries"}, - {_RK_GLOBAL | _RK_PRODUCER | _RK_MED, "retry.backoff.ms", _RK_C_INT, - _RK(retry_backoff_ms), - "The backoff time in milliseconds before retrying a protocol request.", 1, - 300 * 1000, 100}, + + {_RK_GLOBAL | _RK_MED, "retry.backoff.ms", _RK_C_INT, _RK(retry_backoff_ms), + "The backoff time in milliseconds before retrying a protocol request, " + "this is the first backoff time, " + "and will be backed off exponentially until number of retries is " + "exhausted, and it's capped by retry.backoff.max.ms.", + 1, 300 * 1000, 100}, + + {_RK_GLOBAL | _RK_MED, "retry.backoff.max.ms", _RK_C_INT, + _RK(retry_backoff_max_ms), + "The max backoff time in milliseconds before retrying a protocol " + "request, " + "this is the atmost backoff allowed for exponentially backed off " + "requests.", + 1, 300 * 1000, 1000}, {_RK_GLOBAL | _RK_PRODUCER, "queue.buffering.backpressure.threshold", _RK_C_INT, _RK(queue_backpressure_thres), @@ -1384,11 +1608,11 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "the topic configuration property `compression.codec`. ", .vdef = RD_KAFKA_COMPRESSION_NONE, .s2i = {{RD_KAFKA_COMPRESSION_NONE, "none"}, - {RD_KAFKA_COMPRESSION_GZIP, "gzip", _UNSUPPORTED_ZLIB}, - {RD_KAFKA_COMPRESSION_SNAPPY, "snappy", _UNSUPPORTED_SNAPPY}, - {RD_KAFKA_COMPRESSION_KLZ4, "lz4"}, - {RD_KAFKA_COMPRESSION_ZSTD, "zstd", _UNSUPPORTED_ZSTD}, - {0}}}, + {RD_KAFKA_COMPRESSION_GZIP, "gzip", _UNSUPPORTED_ZLIB}, + {RD_KAFKA_COMPRESSION_SNAPPY, "snappy", _UNSUPPORTED_SNAPPY}, + {RD_KAFKA_COMPRESSION_KLZ4, "lz4"}, + {RD_KAFKA_COMPRESSION_ZSTD, "zstd", _UNSUPPORTED_ZSTD}, + {0}}}, {_RK_GLOBAL | _RK_PRODUCER | _RK_MED, "compression.type", _RK_C_ALIAS, .sdef = "compression.codec"}, {_RK_GLOBAL | _RK_PRODUCER | _RK_MED, "batch.num.messages", _RK_C_INT, @@ -1427,6 +1651,29 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "A higher value allows for more effective batching of these " "messages.", 0, 900000, 10}, + {_RK_GLOBAL, "client.dns.lookup", _RK_C_S2I, _RK(client_dns_lookup), + "Controls how the client uses DNS lookups. By default, when the lookup " + "returns multiple IP addresses for a hostname, they will all be " + "attempted " + "for connection before the connection is considered failed. This applies " + "to both bootstrap and advertised servers. If the value is set to " + "`resolve_canonical_bootstrap_servers_only`, each entry will be resolved " + "and expanded into a list of canonical names. " + "**WARNING**: `resolve_canonical_bootstrap_servers_only` " + "must only be used with `GSSAPI` (Kerberos) as `sasl.mechanism`, " + "as it's the only purpose of this configuration value. " + "**NOTE**: Default here is different from the Java client's default " + "behavior, which connects only to the first IP address returned for a " + "hostname. 
", + .vdef = RD_KAFKA_USE_ALL_DNS_IPS, + .s2i = {{RD_KAFKA_USE_ALL_DNS_IPS, "use_all_dns_ips"}, + {RD_KAFKA_RESOLVE_CANONICAL_BOOTSTRAP_SERVERS_ONLY, + "resolve_canonical_bootstrap_servers_only"}}}, + {_RK_GLOBAL, "enable.metrics.push", _RK_C_BOOL, _RK(enable_metrics_push), + "Whether to enable pushing of client metrics to the cluster, if the " + "cluster has a client metrics subscription which matches this client", + 0, 1, 1}, + /* @@ -1493,7 +1740,8 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "`murmur2_random` - Java Producer compatible Murmur2 hash of key " "(NULL keys are randomly partitioned. This is functionally equivalent " "to the default partitioner in the Java Producer.), " - "`fnv1a` - FNV-1a hash of key (NULL keys are mapped to single partition), " + "`fnv1a` - FNV-1a hash of key (NULL keys are mapped to single " + "partition), " "`fnv1a_random` - FNV-1a hash of key (NULL keys are randomly " "partitioned).", .sdef = "consistent_random", @@ -1514,12 +1762,12 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "inherit = inherit global compression.codec configuration.", .vdef = RD_KAFKA_COMPRESSION_INHERIT, .s2i = {{RD_KAFKA_COMPRESSION_NONE, "none"}, - {RD_KAFKA_COMPRESSION_GZIP, "gzip", _UNSUPPORTED_ZLIB}, - {RD_KAFKA_COMPRESSION_SNAPPY, "snappy", _UNSUPPORTED_SNAPPY}, - {RD_KAFKA_COMPRESSION_KLZ4, "lz4"}, - {RD_KAFKA_COMPRESSION_ZSTD, "zstd", _UNSUPPORTED_ZSTD}, - {RD_KAFKA_COMPRESSION_INHERIT, "inherit"}, - {0}}}, + {RD_KAFKA_COMPRESSION_GZIP, "gzip", _UNSUPPORTED_ZLIB}, + {RD_KAFKA_COMPRESSION_SNAPPY, "snappy", _UNSUPPORTED_SNAPPY}, + {RD_KAFKA_COMPRESSION_KLZ4, "lz4"}, + {RD_KAFKA_COMPRESSION_ZSTD, "zstd", _UNSUPPORTED_ZSTD}, + {RD_KAFKA_COMPRESSION_INHERIT, "inherit"}, + {0}}}, {_RK_TOPIC | _RK_PRODUCER | _RK_HIGH, "compression.type", _RK_C_ALIAS, .sdef = "compression.codec"}, {_RK_TOPIC | _RK_PRODUCER | _RK_MED, "compression.level", _RK_C_INT, @@ -1603,7 +1851,7 @@ static const struct rd_kafka_property rd_kafka_properties[] = { "Apache Kafka 0.8.2 or later on the broker.).", .vdef = RD_KAFKA_OFFSET_METHOD_BROKER, .s2i = {{RD_KAFKA_OFFSET_METHOD_FILE, "file"}, - {RD_KAFKA_OFFSET_METHOD_BROKER, "broker"}}}, + {RD_KAFKA_OFFSET_METHOD_BROKER, "broker"}}}, {_RK_TOPIC | _RK_CONSUMER, "consume.callback.max.messages", _RK_C_INT, _RKT(consume_callback_max_msgs), @@ -2263,7 +2511,7 @@ static int rd_kafka_anyconf_set(int scope, const struct rd_kafka_property *_prop; \ rd_kafka_conf_res_t _res; \ _prop = rd_kafka_conf_prop_find(SCOPE, NAME); \ - rd_assert(_prop && * "invalid property name"); \ + rd_assert(_prop && *"invalid property name"); \ _res = rd_kafka_anyconf_set_prop( \ SCOPE, CONF, _prop, (const void *)VALUE, \ 1 /*allow-specifics*/, NULL, 0); \ @@ -3711,10 +3959,33 @@ const char *rd_kafka_conf_finalize(rd_kafka_type_t cltype, if (conf->ssl.ca && (conf->ssl.ca_location || conf->ssl.ca_pem)) return "`ssl.ca.location` or `ssl.ca.pem`, and memory-based " "set_ssl_cert(CERT_CA) are mutually exclusive."; + +#if WITH_OAUTHBEARER_OIDC + if (conf->https.ca_location && conf->https.ca_pem) + return "`https.ca.location` and `https.ca.pem` " + "are mutually exclusive"; + + if (conf->https.ca_location && + rd_strcmp(conf->https.ca_location, "probe") && + !rd_file_stat(conf->https.ca_location, NULL)) + return "`https.ca.location` must be " + "an existing file or directory"; + +#if !CURL_AT_LEAST_VERSION(7, 77, 0) + if (conf->https.ca_pem) + return "`https.ca.pem` requires libcurl 7.77.0 or later"; +#endif +#endif + + #ifdef __APPLE__ else if 
(!conf->ssl.ca && !conf->ssl.ca_location && !conf->ssl.ca_pem) /* Default ssl.ca.location to 'probe' on OSX */ rd_kafka_conf_set(conf, "ssl.ca.location", "probe", NULL, 0); + + /* Default https.ca.location to 'probe' on OSX */ + if (!conf->https.ca_location && !conf->https.ca_pem) + rd_kafka_conf_set(conf, "https.ca.location", "probe", NULL, 0); #endif #endif @@ -3734,7 +4005,17 @@ const char *rd_kafka_conf_finalize(rd_kafka_type_t cltype, "mutually exclusive"; if (conf->sasl.oauthbearer.method == - RD_KAFKA_SASL_OAUTHBEARER_METHOD_OIDC) { + RD_KAFKA_SASL_OAUTHBEARER_METHOD_OIDC && + !conf->sasl.oauthbearer.token_endpoint_url) { + return "`sasl.oauthbearer.token.endpoint.url` " + "is mandatory when " + "`sasl.oauthbearer.method=oidc` is set"; + } + + if (conf->sasl.oauthbearer.method == + RD_KAFKA_SASL_OAUTHBEARER_METHOD_OIDC && + conf->sasl.oauthbearer.grant_type == + RD_KAFKA_SASL_OAUTHBEARER_GRANT_TYPE_CLIENT_CREDENTIALS) { if (!conf->sasl.oauthbearer.client_id) return "`sasl.oauthbearer.client.id` is " "mandatory when " @@ -3745,14 +4026,150 @@ const char *rd_kafka_conf_finalize(rd_kafka_type_t cltype, "mandatory when " "`sasl.oauthbearer.method=oidc` is set"; } + } + if (conf->sasl.oauthbearer.method == + RD_KAFKA_SASL_OAUTHBEARER_METHOD_OIDC && + conf->sasl.oauthbearer.grant_type == + RD_KAFKA_SASL_OAUTHBEARER_GRANT_TYPE_JWT_BEARER) { + if (conf->sasl.oauthbearer.assertion.file) { + if (conf->sasl.oauthbearer.assertion.private_key + .file) + return "Mutually exclusive properties " + "set. " + "`sasl.oauthbearer.assertion." + "file` and " + "`sasl.oauthbearer.assertion." + "private." + "key.file` cannot both be set"; - if (!conf->sasl.oauthbearer.token_endpoint_url) { - return "`sasl.oauthbearer.token.endpoint.url` " - "is mandatory when " - "`sasl.oauthbearer.method=oidc` is set"; + if (conf->sasl.oauthbearer.assertion.private_key + .pem) + return "Mutually exclusive properties " + "set. " + "`sasl.oauthbearer.assertion." + "file` and " + "`sasl.oauthbearer.assertion." + "private." + "key.pem` cannot both be set"; + + if (conf->sasl.oauthbearer.assertion.private_key + .passphrase) + return "Mutually exclusive properties " + "set. " + "`sasl.oauthbearer.assertion." + "file` and " + "`sasl.oauthbearer.assertion." + "private." + "key.passphrase` cannot both be " + "set"; + + if (conf->sasl.oauthbearer.assertion + .jwt_template_file) + return "Mutually exclusive properties " + "set. " + "`sasl.oauthbearer.assertion." + "file` and " + "`sasl.oauthbearer.assertion." + "jwt.template.file` cannot both " + "be set"; + + if (conf->sasl.oauthbearer.assertion.claim + .subject) + return "Mutually exclusive properties " + "set. " + "`sasl.oauthbearer.assertion." + "file` and " + "`sasl.oauthbearer.assertion." + "claim.sub` cannot both be set"; + + if (conf->sasl.oauthbearer.assertion.claim + .audience) + return "Mutually exclusive properties " + "set. " + "`sasl.oauthbearer.assertion." + "file` and " + "`sasl.oauthbearer.assertion." + "claim.aud` cannot both be set"; + + if (conf->sasl.oauthbearer.assertion.claim + .issuer) + return "Mutually exclusive properties " + "set. " + "`sasl.oauthbearer.assertion." + "file` and " + "`sasl.oauthbearer.assertion." + "claim.iss` cannot both be set"; + + if (rd_kafka_conf_is_modified( + conf, + "sasl.oauthbearer." + "assertion.claim.jti.include")) + return "Mutually exclusive properties " + "set. " + "`sasl.oauthbearer.assertion." + "file` and " + "`sasl.oauthbearer.assertion." 
+ "claim.jti.include` cannot both " + "be set"; + + if (rd_kafka_conf_is_modified( + conf, + "sasl.oauthbearer." + "assertion.claim.exp.seconds")) + return "Mutually exclusive properties " + "set. " + "`sasl.oauthbearer.assertion." + "file` and " + "`sasl.oauthbearer.assertion." + "claim.exp.seconds` cannot both " + "be set"; + + + if (rd_kafka_conf_is_modified( + conf, + "sasl.oauthbearer." + "assertion.claim.nbf.seconds")) + return "Mutually exclusive properties " + "set. " + "`sasl.oauthbearer.assertion." + "file` and " + "`sasl.oauthbearer.assertion." + "claim.nbf.seconds` cannot both " + "be set"; + } else { + if (conf->sasl.oauthbearer.assertion.private_key + .file && + conf->sasl.oauthbearer.assertion.private_key + .pem) + return "Mutually exclusive properties " + "set. " + "`sasl.oauthbearer.assertion." + "private." + "key.file` and " + "`sasl.oauthbearer.assertion." + "private." + "key.pem` cannot both be set"; + + if (!conf->sasl.oauthbearer.assertion + .private_key.file && + !conf->sasl.oauthbearer.assertion + .private_key.pem) + return "`sasl.oauthbearer.assertion." + "private." + "key.file` or " + "`sasl.oauthbearer.assertion." + "private." + "key.pem` is mandatory when " + "`sasl.oauthbearer.grant.type` " + "is set to " + "`urn:ietf:params:oauth:grant-" + "type:jwt-" + "bearer`"; } } + + /* Enable background thread for the builtin OIDC handler, * unless a refresh callback has been set. */ if (conf->sasl.oauthbearer.method == @@ -3767,6 +4184,43 @@ const char *rd_kafka_conf_finalize(rd_kafka_type_t cltype, if (cltype == RD_KAFKA_CONSUMER) { + if (conf->group_protocol == RD_KAFKA_GROUP_PROTOCOL_CLASSIC) { + if (conf->max_poll_interval_ms < + conf->group_session_timeout_ms) + return "`max.poll.interval.ms`must be >= " + "`session.timeout.ms`"; + } else { + + if (rd_kafka_conf_is_modified(conf, + "session.timeout.ms")) { + return "`session.timeout.ms` is not supported " + "for `group.protocol=consumer`. It is " + "defined broker side"; + } + + if (rd_kafka_conf_is_modified( + conf, "partition.assignment.strategy")) { + return "`partition.assignment.strategy` is not " + "supported for " + "`group.protocol=consumer`. Use " + "`group.remote.assignor` instead"; + } + + if (rd_kafka_conf_is_modified(conf, + "group.protocol.type")) { + return "`group.protocol.type` is not supported " + "for `group.protocol=consumer`"; + } + + if (rd_kafka_conf_is_modified( + conf, "heartbeat.interval.ms")) { + return "`heartbeat.interval.ms` is not " + "supported " + "for `group.protocol=consumer`. It is " + "defined broker side"; + } + } + /* Automatically adjust `fetch.max.bytes` to be >= * `message.max.bytes` and <= `queued.max.message.kbytes` * unless set by user. 
*/ @@ -3797,10 +4251,6 @@ const char *rd_kafka_conf_finalize(rd_kafka_type_t cltype, conf->fetch_max_bytes + 512); } - if (conf->max_poll_interval_ms < conf->group_session_timeout_ms) - return "`max.poll.interval.ms`must be >= " - "`session.timeout.ms`"; - /* Simplifies rd_kafka_is_idempotent() which is producer-only */ conf->eos.idempotence = 0; @@ -3895,7 +4345,7 @@ const char *rd_kafka_conf_finalize(rd_kafka_type_t cltype, if (conf->reconnect_backoff_max_ms < conf->reconnect_backoff_ms) return "`reconnect.backoff.max.ms` must be >= " - "`reconnect.max.ms`"; + "`reconnect.backoff.ms`"; if (conf->sparse_connections) { /* Set sparse connection random selection interval to @@ -3903,6 +4353,10 @@ const char *rd_kafka_conf_finalize(rd_kafka_type_t cltype, conf->sparse_connect_intvl = RD_MAX(11, RD_MIN(conf->reconnect_backoff_ms / 2, 1000)); } + if (!rd_kafka_conf_is_modified( + conf, "topic.metadata.refresh.fast.interval.ms")) + conf->metadata_refresh_fast_interval_ms = + conf->retry_backoff_ms; if (!rd_kafka_conf_is_modified(conf, "connections.max.idle.ms") && conf->brokerlist && rd_strcasestr(conf->brokerlist, "azure")) { @@ -4091,6 +4545,31 @@ int rd_kafka_conf_warn(rd_kafka_t *rk) { "recommend not using set_default_topic_conf"); /* Additional warnings */ + if (rk->rk_conf.retry_backoff_ms > rk->rk_conf.retry_backoff_max_ms) { + rd_kafka_log( + rk, LOG_WARNING, "CONFWARN", + "Configuration `retry.backoff.ms` with value %d is greater " + "than configuration `retry.backoff.max.ms` with value %d. " + "A static backoff with value `retry.backoff.max.ms` will " + "be applied.", + rk->rk_conf.retry_backoff_ms, + rk->rk_conf.retry_backoff_max_ms); + } + + if (rd_kafka_conf_is_modified( + &rk->rk_conf, "topic.metadata.refresh.fast.interval.ms") && + rk->rk_conf.metadata_refresh_fast_interval_ms > + rk->rk_conf.retry_backoff_max_ms) { + rd_kafka_log( + rk, LOG_WARNING, "CONFWARN", + "Configuration `topic.metadata.refresh.fast.interval.ms` " + "with value %d is greater than configuration " + "`retry.backoff.max.ms` with value %d. " + "A static backoff with value `retry.backoff.max.ms` will " + "be applied.", + rk->rk_conf.metadata_refresh_fast_interval_ms, + rk->rk_conf.retry_backoff_max_ms); + } if (rk->rk_type == RD_KAFKA_CONSUMER) { if (rk->rk_conf.fetch_wait_max_ms + 1000 > rk->rk_conf.socket_timeout_ms) @@ -4329,7 +4808,7 @@ int unittest_conf(void) { /* Verify that software.client.* string-safing works */ conf = rd_kafka_conf_new(); res = rd_kafka_conf_set(conf, "client.software.name", - " .~aba. va! !.~~", NULL, 0); + " .~aba. va! !.~~", NULL, 0); RD_UT_ASSERT(res == RD_KAFKA_CONF_OK, "%d", res); res = rd_kafka_conf_set(conf, "client.software.version", "!1.2.3.4.5!!! a", NULL, 0); @@ -4348,7 +4827,7 @@ int unittest_conf(void) { readlen = sizeof(readval); res2 = rd_kafka_conf_get(conf, "client.software.version", readval, - &readlen); + &readlen); RD_UT_ASSERT(res2 == RD_KAFKA_CONF_OK, "%d", res2); RD_UT_ASSERT(!strcmp(readval, "1.2.3.4.5----a"), "client.software.* safification failed: \"%s\"", readval); diff --git a/src/third_party/librdkafka/dist/src/rdkafka_conf.h b/src/third_party/librdkafka/dist/src/rdkafka_conf.h index a0970d37be4..c38e81eb530 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_conf.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_conf.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2014-2018 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -33,7 +34,7 @@ #include "rdkafka_cert.h" #if WITH_SSL && OPENSSL_VERSION_NUMBER >= 0x10100000 && \ - !defined(OPENSSL_IS_BORINGSSL) + !defined(OPENSSL_NO_ENGINE) #define WITH_SSL_ENGINE 1 /* Deprecated in OpenSSL 3 */ #include @@ -150,17 +151,42 @@ typedef enum { typedef enum { RD_KAFKA_SASL_OAUTHBEARER_METHOD_DEFAULT, - RD_KAFKA_SASL_OAUTHBEARER_METHOD_OIDC + RD_KAFKA_SASL_OAUTHBEARER_METHOD_OIDC, } rd_kafka_oauthbearer_method_t; +typedef enum { + RD_KAFKA_SASL_OAUTHBEARER_GRANT_TYPE_CLIENT_CREDENTIALS, + RD_KAFKA_SASL_OAUTHBEARER_GRANT_TYPE_JWT_BEARER, +} rd_kafka_oauthbearer_grant_type_t; + +typedef enum { + RD_KAFKA_SASL_OAUTHBEARER_ASSERTION_ALGORITHM_RS256, + RD_KAFKA_SASL_OAUTHBEARER_ASSERTION_ALGORITHM_ES256, +} rd_kafka_oauthbearer_assertion_algorithm_t; + typedef enum { RD_KAFKA_SSL_ENDPOINT_ID_NONE, RD_KAFKA_SSL_ENDPOINT_ID_HTTPS, /**< RFC2818 */ } rd_kafka_ssl_endpoint_id_t; +typedef enum { + RD_KAFKA_USE_ALL_DNS_IPS, + RD_KAFKA_RESOLVE_CANONICAL_BOOTSTRAP_SERVERS_ONLY, +} rd_kafka_client_dns_lookup_t; + +typedef enum { + RD_KAFKA_GROUP_PROTOCOL_CLASSIC, + RD_KAFKA_GROUP_PROTOCOL_CONSUMER, +} rd_kafka_group_protocol_t; + +typedef enum { + RD_KAFKA_METADATA_RECOVERY_STRATEGY_NONE, + RD_KAFKA_METADATA_RECOVERY_STRATEGY_REBOOTSTRAP, +} rd_kafka_metadata_recovery_strategy_t; + /* Increase in steps of 64 as needed. * This must be larger than sizeof(rd_kafka_[topic_]conf_t) */ -#define RD_KAFKA_CONF_PROPS_IDX_MAX (64 * 33) +#define RD_KAFKA_CONF_PROPS_IDX_MAX (64 * 35) /** * @struct rd_kafka_anyconf_t @@ -191,6 +217,7 @@ struct rd_kafka_conf_s { int msg_copy_max_size; int recv_max_msg_size; int max_inflight; + int metadata_recovery_rebootstrap_trigger_ms; int metadata_request_timeout_ms; int metadata_refresh_interval_ms; int metadata_refresh_fast_cnt; @@ -224,6 +251,8 @@ struct rd_kafka_conf_s { int api_version_fallback_ms; char *broker_version_fallback; rd_kafka_secproto_t security_protocol; + rd_kafka_client_dns_lookup_t client_dns_lookup; + rd_kafka_metadata_recovery_strategy_t metadata_recovery_strategy; struct { #if WITH_SSL @@ -269,6 +298,11 @@ struct rd_kafka_conf_s { void *opaque); } ssl; + struct { + char *ca_location; + char *ca_pem; + } https; + struct { const struct rd_kafka_sasl_provider *provider; char *principal; @@ -298,10 +332,33 @@ struct rd_kafka_conf_s { int enable_callback_queue; struct { rd_kafka_oauthbearer_method_t method; + rd_kafka_oauthbearer_grant_type_t grant_type; char *token_endpoint_url; char *client_id; char *client_secret; char *scope; + struct { + rd_kafka_oauthbearer_assertion_algorithm_t + algorithm; + char *file; + char *jwt_template_file; + + struct { + char *subject; + char *audience; + char *issuer; + rd_bool_t jti_include; + int not_before_s; + int expiration_s; + } claim; + struct { + char *file; + char *passphrase; + char *pem; + } private_key; + + } assertion; + char *extensions_str; /* SASL/OAUTHBEARER token refresh event callback */ void (*token_refresh_cb)(rd_kafka_t *rk, @@ -342,6 +399,7 @@ struct rd_kafka_conf_s { /* Client group configuration */ int coord_query_intvl_ms; int max_poll_interval_ms; + int enable_metrics_push; int builtin_features; /* @@ -355,9 +413,12 @@ struct rd_kafka_conf_s { int fetch_msg_max_bytes; int fetch_max_bytes; int fetch_min_bytes; + int fetch_queue_backoff_ms; int fetch_error_backoff_ms; + rd_kafka_group_protocol_t group_protocol; char *group_id_str; char *group_instance_id; + char *group_remote_assignor; int 
allow_auto_create_topics; rd_kafka_pattern_list_t *topic_blacklist; @@ -416,6 +477,7 @@ struct rd_kafka_conf_s { int queue_backpressure_thres; int max_retries; int retry_backoff_ms; + int retry_backoff_max_ms; int batch_num_messages; int batch_size; rd_kafka_compression_t compression_codec; diff --git a/src/third_party/librdkafka/dist/src/rdkafka_confval.h b/src/third_party/librdkafka/dist/src/rdkafka_confval.h index 3f2bad549eb..ca826169571 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_confval.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_confval.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2014-2018 Magnus Edenhill + * Copyright (c) 2014-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_coord.c b/src/third_party/librdkafka/dist/src/rdkafka_coord.c index 9e41bab72ad..a880f23a465 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_coord.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_coord.c @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_coord.h b/src/third_party/librdkafka/dist/src/rdkafka_coord.h index 4e00a552bc2..a04ca222e25 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_coord.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_coord.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_error.c b/src/third_party/librdkafka/dist/src/rdkafka_error.c index 4a218daffee..680593630d9 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_error.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_error.c @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2020 Magnus Edenhill + * Copyright (c) 2020-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_error.h b/src/third_party/librdkafka/dist/src/rdkafka_error.h index 79984f5efb5..4b4d912f30e 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_error.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_error.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2020 Magnus Edenhill + * Copyright (c) 2020-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_event.c b/src/third_party/librdkafka/dist/src/rdkafka_event.c index ffd1a17805c..7e8cd200aec 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_event.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_event.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2016 Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
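The rdkafka_event.c hunk below registers names and result accessors for the Admin API events added since 2.0.2 (IncrementalAlterConfigs, DescribeTopics, DescribeCluster, user SCRAM credentials, ListOffsets, ElectLeaders). A sketch of draining such results from an event queue; queue creation and the admin request itself are assumed to happen elsewhere:

    #include <librdkafka/rdkafka.h>
    #include <stdio.h>

    /* Sketch: dispatch admin result events; assumes an admin request is
     * already in flight on the queue rkqu. */
    static void drain_admin_events(rd_kafka_queue_t *rkqu) {
            rd_kafka_event_t *rkev;

            while ((rkev = rd_kafka_queue_poll(rkqu, 1000))) {
                    switch (rd_kafka_event_type(rkev)) {
                    case RD_KAFKA_EVENT_INCREMENTALALTERCONFIGS_RESULT:
                            /* rd_kafka_event_IncrementalAlterConfigs_result(
                             *     rkev) yields the typed result. */
                            break;
                    case RD_KAFKA_EVENT_LISTOFFSETS_RESULT:
                            /* rd_kafka_event_ListOffsets_result(rkev) */
                            break;
                    default:
                            printf("ignoring %s\n", rd_kafka_event_name(rkev));
                            break;
                    }
                    rd_kafka_event_destroy(rkev);
            }
    }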
* * Redistribution and use in source and binary forms, with or without @@ -60,6 +61,8 @@ const char *rd_kafka_event_name(const rd_kafka_event_t *rkev) { return "CreatePartitionsResult"; case RD_KAFKA_EVENT_ALTERCONFIGS_RESULT: return "AlterConfigsResult"; + case RD_KAFKA_EVENT_INCREMENTALALTERCONFIGS_RESULT: + return "IncrementalAlterConfigsResult"; case RD_KAFKA_EVENT_DESCRIBECONFIGS_RESULT: return "DescribeConfigsResult"; case RD_KAFKA_EVENT_DELETERECORDS_RESULT: @@ -68,6 +71,10 @@ const char *rd_kafka_event_name(const rd_kafka_event_t *rkev) { return "ListConsumerGroupsResult"; case RD_KAFKA_EVENT_DESCRIBECONSUMERGROUPS_RESULT: return "DescribeConsumerGroupsResult"; + case RD_KAFKA_EVENT_DESCRIBETOPICS_RESULT: + return "DescribeTopicsResult"; + case RD_KAFKA_EVENT_DESCRIBECLUSTER_RESULT: + return "DescribeClusterResult"; case RD_KAFKA_EVENT_DELETEGROUPS_RESULT: return "DeleteGroupsResult"; case RD_KAFKA_EVENT_DELETECONSUMERGROUPOFFSETS_RESULT: @@ -84,6 +91,14 @@ const char *rd_kafka_event_name(const rd_kafka_event_t *rkev) { return "ListConsumerGroupOffsetsResult"; case RD_KAFKA_EVENT_OAUTHBEARER_TOKEN_REFRESH: return "SaslOAuthBearerTokenRefresh"; + case RD_KAFKA_EVENT_DESCRIBEUSERSCRAMCREDENTIALS_RESULT: + return "DescribeUserScramCredentials"; + case RD_KAFKA_EVENT_ALTERUSERSCRAMCREDENTIALS_RESULT: + return "AlterUserScramCredentials"; + case RD_KAFKA_EVENT_LISTOFFSETS_RESULT: + return "ListOffsetsResult"; + case RD_KAFKA_EVENT_ELECTLEADERS_RESULT: + return "ElectLeadersResult"; default: return "?unknown?"; } @@ -329,6 +344,15 @@ rd_kafka_event_AlterConfigs_result(rd_kafka_event_t *rkev) { return (const rd_kafka_AlterConfigs_result_t *)rkev; } +const rd_kafka_IncrementalAlterConfigs_result_t * +rd_kafka_event_IncrementalAlterConfigs_result(rd_kafka_event_t *rkev) { + if (!rkev || + rkev->rko_evtype != RD_KAFKA_EVENT_INCREMENTALALTERCONFIGS_RESULT) + return NULL; + else + return (const rd_kafka_IncrementalAlterConfigs_result_t *)rkev; +} + const rd_kafka_DescribeConfigs_result_t * rd_kafka_event_DescribeConfigs_result(rd_kafka_event_t *rkev) { @@ -364,6 +388,22 @@ rd_kafka_event_DescribeConsumerGroups_result(rd_kafka_event_t *rkev) { return (const rd_kafka_DescribeConsumerGroups_result_t *)rkev; } +const rd_kafka_DescribeTopics_result_t * +rd_kafka_event_DescribeTopics_result(rd_kafka_event_t *rkev) { + if (!rkev || rkev->rko_evtype != RD_KAFKA_EVENT_DESCRIBETOPICS_RESULT) + return NULL; + else + return (const rd_kafka_DescribeTopics_result_t *)rkev; +} + +const rd_kafka_DescribeCluster_result_t * +rd_kafka_event_DescribeCluster_result(rd_kafka_event_t *rkev) { + if (!rkev || rkev->rko_evtype != RD_KAFKA_EVENT_DESCRIBECLUSTER_RESULT) + return NULL; + else + return (const rd_kafka_DescribeCluster_result_t *)rkev; +} + const rd_kafka_DeleteGroups_result_t * rd_kafka_event_DeleteGroups_result(rd_kafka_event_t *rkev) { if (!rkev || rkev->rko_evtype != RD_KAFKA_EVENT_DELETEGROUPS_RESULT) @@ -416,6 +456,34 @@ rd_kafka_event_AlterConsumerGroupOffsets_result(rd_kafka_event_t *rkev) { const rd_kafka_AlterConsumerGroupOffsets_result_t *)rkev; } +const rd_kafka_DescribeUserScramCredentials_result_t * +rd_kafka_event_DescribeUserScramCredentials_result(rd_kafka_event_t *rkev) { + if (!rkev || rkev->rko_evtype != + RD_KAFKA_EVENT_DESCRIBEUSERSCRAMCREDENTIALS_RESULT) + return NULL; + else + return ( + const rd_kafka_DescribeUserScramCredentials_result_t *)rkev; +} + +const rd_kafka_AlterUserScramCredentials_result_t * +rd_kafka_event_AlterUserScramCredentials_result(rd_kafka_event_t *rkev) { + if 
(!rkev || + rkev->rko_evtype != RD_KAFKA_EVENT_ALTERUSERSCRAMCREDENTIALS_RESULT) + return NULL; + else + return ( + const rd_kafka_AlterUserScramCredentials_result_t *)rkev; +} + +const rd_kafka_ListOffsets_result_t * +rd_kafka_event_ListOffsets_result(rd_kafka_event_t *rkev) { + if (!rkev || rkev->rko_evtype != RD_KAFKA_EVENT_LISTOFFSETS_RESULT) + return NULL; + else + return (const rd_kafka_ListOffsets_result_t *)rkev; +} + const rd_kafka_ListConsumerGroupOffsets_result_t * rd_kafka_event_ListConsumerGroupOffsets_result(rd_kafka_event_t *rkev) { if (!rkev || @@ -424,3 +492,11 @@ rd_kafka_event_ListConsumerGroupOffsets_result(rd_kafka_event_t *rkev) { else return (const rd_kafka_ListConsumerGroupOffsets_result_t *)rkev; } + +const rd_kafka_ElectLeaders_result_t * +rd_kafka_event_ElectLeaders_result(rd_kafka_event_t *rkev) { + if (!rkev || rkev->rko_evtype != RD_KAFKA_EVENT_ELECTLEADERS_RESULT) + return NULL; + else + return (const rd_kafka_ElectLeaders_result_t *)rkev; +} diff --git a/src/third_party/librdkafka/dist/src/rdkafka_event.h b/src/third_party/librdkafka/dist/src/rdkafka_event.h index 3f9c22e34bb..cf63e414eb2 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_event.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_event.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2016 Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -98,10 +99,13 @@ static RD_UNUSED RD_INLINE int rd_kafka_event_setup(rd_kafka_t *rk, case RD_KAFKA_EVENT_DELETETOPICS_RESULT: case RD_KAFKA_EVENT_CREATEPARTITIONS_RESULT: case RD_KAFKA_EVENT_ALTERCONFIGS_RESULT: + case RD_KAFKA_EVENT_INCREMENTALALTERCONFIGS_RESULT: case RD_KAFKA_EVENT_DESCRIBECONFIGS_RESULT: case RD_KAFKA_EVENT_DELETERECORDS_RESULT: case RD_KAFKA_EVENT_LISTCONSUMERGROUPS_RESULT: case RD_KAFKA_EVENT_DESCRIBECONSUMERGROUPS_RESULT: + case RD_KAFKA_EVENT_DESCRIBETOPICS_RESULT: + case RD_KAFKA_EVENT_DESCRIBECLUSTER_RESULT: case RD_KAFKA_EVENT_DELETEGROUPS_RESULT: case RD_KAFKA_EVENT_DELETECONSUMERGROUPOFFSETS_RESULT: case RD_KAFKA_EVENT_CREATEACLS_RESULT: @@ -110,6 +114,10 @@ static RD_UNUSED RD_INLINE int rd_kafka_event_setup(rd_kafka_t *rk, case RD_KAFKA_EVENT_ALTERCONSUMERGROUPOFFSETS_RESULT: case RD_KAFKA_EVENT_LISTCONSUMERGROUPOFFSETS_RESULT: case RD_KAFKA_EVENT_OAUTHBEARER_TOKEN_REFRESH: + case RD_KAFKA_EVENT_DESCRIBEUSERSCRAMCREDENTIALS_RESULT: + case RD_KAFKA_EVENT_ALTERUSERSCRAMCREDENTIALS_RESULT: + case RD_KAFKA_EVENT_LISTOFFSETS_RESULT: + case RD_KAFKA_EVENT_ELECTLEADERS_RESULT: return 1; default: diff --git a/src/third_party/librdkafka/dist/src/rdkafka_feature.c b/src/third_party/librdkafka/dist/src/rdkafka_feature.c index c06059ace25..0c194d0d860 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_feature.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_feature.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2016, Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -110,17 +111,28 @@ static const struct rd_kafka_feature_map { }, }, { - /* @brief >=0.9.0: Broker-based balanced consumer groups. */ + /* @brief >=0.9.0: Broker-based balanced consumer groups (classic). 
*/ .feature = RD_KAFKA_FEATURE_BROKER_BALANCED_CONSUMER, .depends = { - {RD_KAFKAP_FindCoordinator, 0, 0}, - {RD_KAFKAP_OffsetCommit, 1, 2}, - {RD_KAFKAP_OffsetFetch, 1, 1}, - {RD_KAFKAP_JoinGroup, 0, 0}, - {RD_KAFKAP_SyncGroup, 0, 0}, - {RD_KAFKAP_Heartbeat, 0, 0}, - {RD_KAFKAP_LeaveGroup, 0, 0}, + {RD_KAFKAP_FindCoordinator, 0, INT16_MAX}, + {RD_KAFKAP_OffsetCommit, 1, INT16_MAX}, + {RD_KAFKAP_OffsetFetch, 1, INT16_MAX}, + {RD_KAFKAP_JoinGroup, 0, INT16_MAX}, + {RD_KAFKAP_SyncGroup, 0, INT16_MAX}, + {RD_KAFKAP_Heartbeat, 0, INT16_MAX}, + {RD_KAFKAP_LeaveGroup, 0, INT16_MAX}, + {-1}, + }, + }, + { + /* @brief Broker-based balanced consumer groups (KIP 848). */ + .feature = RD_KAFKA_FEATURE_BROKER_BALANCED_CONSUMER, + .depends = + { + {RD_KAFKAP_ConsumerGroupHeartbeat, 0, INT16_MAX}, + {RD_KAFKAP_OffsetCommit, 9, INT16_MAX}, + {RD_KAFKAP_OffsetFetch, 9, INT16_MAX}, {-1}, }, }, @@ -144,7 +156,18 @@ static const struct rd_kafka_feature_map { .feature = RD_KAFKA_FEATURE_SASL_GSSAPI, .depends = { - {RD_KAFKAP_JoinGroup, 0, 0}, + {RD_KAFKAP_JoinGroup, 0, INT16_MAX}, + {-1}, + }, + }, + { + /* @brief >=0.10.0: SASL (GSSAPI) authentication. + * Fallback in case JoinGroup is removed along with the + * "classic" consumer group protocol. */ + .feature = RD_KAFKA_FEATURE_SASL_GSSAPI, + .depends = + { + {RD_KAFKAP_SaslHandshake, 0, INT16_MAX}, {-1}, }, }, @@ -204,7 +227,7 @@ static const struct rd_kafka_feature_map { .depends = { {RD_KAFKAP_SaslHandshake, 1, 1}, - {RD_KAFKAP_SaslAuthenticate, 0, 0}, + {RD_KAFKAP_SaslAuthenticate, 0, 1}, {-1}, }, }, @@ -272,20 +295,20 @@ int rd_kafka_get_legacy_ApiVersions(const char *broker_version, struct rd_kafka_ApiVersion **apisp, size_t *api_cntp, const char *fallback) { +#define _VERMAP(PFX, APIS) \ + { PFX, APIS, RD_ARRAYSIZE(APIS) } static const struct { const char *pfx; struct rd_kafka_ApiVersion *apis; size_t api_cnt; - } vermap[] = { -#define _VERMAP(PFX, APIS) {PFX, APIS, RD_ARRAYSIZE(APIS)} - _VERMAP("0.9.0", rd_kafka_ApiVersion_0_9_0), - _VERMAP("0.8.2", rd_kafka_ApiVersion_0_8_2), - _VERMAP("0.8.1", rd_kafka_ApiVersion_0_8_1), - _VERMAP("0.8.0", rd_kafka_ApiVersion_0_8_0), - {"0.7.", NULL}, /* Unsupported */ - {"0.6.", NULL}, /* Unsupported */ - _VERMAP("", rd_kafka_ApiVersion_Queryable), - {NULL}}; + } vermap[] = {_VERMAP("0.9.0", rd_kafka_ApiVersion_0_9_0), + _VERMAP("0.8.2", rd_kafka_ApiVersion_0_8_2), + _VERMAP("0.8.1", rd_kafka_ApiVersion_0_8_1), + _VERMAP("0.8.0", rd_kafka_ApiVersion_0_8_0), + {"0.7.", NULL}, /* Unsupported */ + {"0.6.", NULL}, /* Unsupported */ + _VERMAP("", rd_kafka_ApiVersion_Queryable), + {NULL}}; int i; int fallback_i = -1; int ret = 0; diff --git a/src/third_party/librdkafka/dist/src/rdkafka_feature.h b/src/third_party/librdkafka/dist/src/rdkafka_feature.h index 266285cf30d..7de17596426 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_feature.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_feature.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2016, Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_fetcher.c b/src/third_party/librdkafka/dist/src/rdkafka_fetcher.c index 5003e3d8df8..e275ee5a88b 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_fetcher.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_fetcher.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2022 Magnus Edenhill + * Copyright (c) 2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -36,6 +37,7 @@ #include "rdkafka_offset.h" #include "rdkafka_msgset.h" #include "rdkafka_fetcher.h" +#include "rdkafka_request.h" /** @@ -51,15 +53,29 @@ static void rd_kafka_broker_fetch_backoff(rd_kafka_broker_t *rkb, /** * @brief Backoff the next Fetch for specific partition + * + * @returns the absolute backoff time (the current time for no backoff). */ -static void rd_kafka_toppar_fetch_backoff(rd_kafka_broker_t *rkb, - rd_kafka_toppar_t *rktp, - rd_kafka_resp_err_t err) { - int backoff_ms = rkb->rkb_rk->rk_conf.fetch_error_backoff_ms; +static rd_ts_t rd_kafka_toppar_fetch_backoff(rd_kafka_broker_t *rkb, + rd_kafka_toppar_t *rktp, + rd_kafka_resp_err_t err) { + int backoff_ms; /* Don't back off on reaching end of partition */ - if (err == RD_KAFKA_RESP_ERR__PARTITION_EOF) - return; + if (err == RD_KAFKA_RESP_ERR__PARTITION_EOF) { + rktp->rktp_ts_fetch_backoff = 0; + return rd_clock(); /* Immediate: No practical backoff */ + } + + if (err == RD_KAFKA_RESP_ERR__QUEUE_FULL) + backoff_ms = rkb->rkb_rk->rk_conf.fetch_queue_backoff_ms; + else + backoff_ms = rkb->rkb_rk->rk_conf.fetch_error_backoff_ms; + + if (unlikely(!backoff_ms)) { + rktp->rktp_ts_fetch_backoff = 0; + return rd_clock(); /* Immediate: No practical backoff */ + } /* Certain errors that may require manual intervention should have * a longer backoff time. */ @@ -73,8 +89,9 @@ static void rd_kafka_toppar_fetch_backoff(rd_kafka_broker_t *rkb, rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, backoff_ms, err ? ": " : "", err ? rd_kafka_err2str(err) : ""); -} + return rktp->rktp_ts_fetch_backoff; +} /** * @brief Handle preferred replica in fetch response. @@ -171,55 +188,89 @@ static void rd_kafka_fetch_reply_handle_partition_error( rd_kafka_resp_err_t err, int64_t HighwaterMarkOffset) { + rd_rkb_dbg(rkb, FETCH, "FETCHERR", + "%.*s [%" PRId32 "]: Fetch failed at %s: %s", + RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), + rktp->rktp_partition, + rd_kafka_fetch_pos2str(rktp->rktp_offsets.fetch_pos), + rd_kafka_err2name(err)); + /* Some errors should be passed to the * application while some handled by rdkafka */ switch (err) { /* Errors handled by rdkafka */ + case RD_KAFKA_RESP_ERR_OFFSET_NOT_AVAILABLE: case RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART: case RD_KAFKA_RESP_ERR_LEADER_NOT_AVAILABLE: - case RD_KAFKA_RESP_ERR_NOT_LEADER_FOR_PARTITION: + case RD_KAFKA_RESP_ERR_NOT_LEADER_OR_FOLLOWER: case RD_KAFKA_RESP_ERR_BROKER_NOT_AVAILABLE: case RD_KAFKA_RESP_ERR_REPLICA_NOT_AVAILABLE: case RD_KAFKA_RESP_ERR_KAFKA_STORAGE_ERROR: + case RD_KAFKA_RESP_ERR_UNKNOWN_LEADER_EPOCH: case RD_KAFKA_RESP_ERR_FENCED_LEADER_EPOCH: + case RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_ID: + if (err == RD_KAFKA_RESP_ERR_OFFSET_NOT_AVAILABLE) { + /* Occurs when: + * - Msg exists on broker but + * offset > HWM, or: + * - HWM is >= offset, but msg not + * yet available at that offset + * (replica is out of sync). + * - partition leader is out of sync. 
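Since rd_kafka_toppar_fetch_backoff() above now separates the queue-full case (`fetch.queue.backoff.ms`) from genuine fetch errors (`fetch.error.backoff.ms`), a consumer with tight queue thresholds can trade CPU for latency exactly as the property's own description suggests. An illustrative sketch, with values that are examples rather than recommendations:

    #include <librdkafka/rdkafka.h>

    /* Sketch: keep a small per-partition buffer and re-fetch quickly
     * once the application drains it; values are illustrative. */
    static void tune_fetch_queue(rd_kafka_conf_t *conf) {
            char errstr[256];

            rd_kafka_conf_set(conf, "queued.min.messages", "1000",
                              errstr, sizeof(errstr));
            rd_kafka_conf_set(conf, "fetch.queue.backoff.ms", "10",
                              errstr, sizeof(errstr));
    }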
+ * + * Handle by requesting metadata update, changing back + * to the leader, and then retrying FETCH + * (with backoff). + */ + rd_rkb_dbg(rkb, MSG, "FETCH", + "Topic %s [%" PRId32 + "]: %s not " + "available on broker %" PRId32 + " (leader %" PRId32 + "): updating metadata and retrying", + rktp->rktp_rkt->rkt_topic->str, + rktp->rktp_partition, + rd_kafka_fetch_pos2str( + rktp->rktp_offsets.fetch_pos), + rktp->rktp_broker_id, rktp->rktp_leader_id); + } + + if (err == RD_KAFKA_RESP_ERR_UNKNOWN_LEADER_EPOCH) { + rd_rkb_dbg(rkb, MSG | RD_KAFKA_DBG_CONSUMER, "FETCH", + "Topic %s [%" PRId32 + "]: Fetch failed at %s: %s: broker %" PRId32 + " has not yet caught up on latest metadata: " + "retrying", + rktp->rktp_rkt->rkt_topic->str, + rktp->rktp_partition, + rd_kafka_fetch_pos2str( + rktp->rktp_offsets.fetch_pos), + rd_kafka_err2str(err), rktp->rktp_broker_id); + } + + if (rktp->rktp_broker_id != rktp->rktp_leader_id) { + rd_kafka_toppar_delegate_to_leader(rktp); + } /* Request metadata information update */ rd_kafka_toppar_leader_unavailable(rktp, "fetch", err); break; - case RD_KAFKA_RESP_ERR_OFFSET_NOT_AVAILABLE: - /* Occurs when: - * - Msg exists on broker but - * offset > HWM, or: - * - HWM is >= offset, but msg not - * yet available at that offset - * (replica is out of sync). - * - * Handle by retrying FETCH (with backoff). - */ - rd_rkb_dbg(rkb, MSG, "FETCH", - "Topic %s [%" PRId32 "]: Offset %" PRId64 - " not " - "available on broker %" PRId32 " (leader %" PRId32 - "): retrying", - rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, - rktp->rktp_offsets.fetch_offset, - rktp->rktp_broker_id, rktp->rktp_leader_id); - break; - case RD_KAFKA_RESP_ERR_OFFSET_OUT_OF_RANGE: { - int64_t err_offset; + rd_kafka_fetch_pos_t err_pos; if (rktp->rktp_broker_id != rktp->rktp_leader_id && - rktp->rktp_offsets.fetch_offset > HighwaterMarkOffset) { + rktp->rktp_offsets.fetch_pos.offset > HighwaterMarkOffset) { rd_kafka_log(rkb->rkb_rk, LOG_WARNING, "FETCH", - "Topic %s [%" PRId32 "]: Offset %" PRId64 + "Topic %s [%" PRId32 + "]: %s" " out of range (HighwaterMark %" PRId64 " fetching from " "broker %" PRId32 " (leader %" PRId32 "): reverting to leader", rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, - rktp->rktp_offsets.fetch_offset, + rd_kafka_fetch_pos2str( + rktp->rktp_offsets.fetch_pos), HighwaterMarkOffset, rktp->rktp_broker_id, rktp->rktp_leader_id); @@ -232,9 +283,10 @@ static void rd_kafka_fetch_reply_handle_partition_error( } /* Application error */ - err_offset = rktp->rktp_offsets.fetch_offset; - rktp->rktp_offsets.fetch_offset = RD_KAFKA_OFFSET_INVALID; - rd_kafka_offset_reset(rktp, rd_kafka_broker_id(rkb), err_offset, + err_pos = rktp->rktp_offsets.fetch_pos; + rktp->rktp_offsets.fetch_pos.offset = RD_KAFKA_OFFSET_INVALID; + rktp->rktp_offsets.fetch_pos.leader_epoch = -1; + rd_kafka_offset_reset(rktp, rd_kafka_broker_id(rkb), err_pos, err, "fetch failed due to requested offset " "not available on the broker"); @@ -248,7 +300,7 @@ static void rd_kafka_fetch_reply_handle_partition_error( rd_kafka_consumer_err( rktp->rktp_fetchq, rd_kafka_broker_id(rkb), err, tver->version, NULL, rktp, - rktp->rktp_offsets.fetch_offset, + rktp->rktp_offsets.fetch_pos.offset, "Fetch from broker %" PRId32 " failed: %s", rd_kafka_broker_id(rkb), rd_kafka_err2str(err)); rktp->rktp_last_error = err; @@ -259,17 +311,17 @@ static void rd_kafka_fetch_reply_handle_partition_error( /* Application errors */ case RD_KAFKA_RESP_ERR__PARTITION_EOF: if (rkb->rkb_rk->rk_conf.enable_partition_eof) - 
rd_kafka_consumer_err(rktp->rktp_fetchq, - rd_kafka_broker_id(rkb), err, - tver->version, NULL, rktp, - rktp->rktp_offsets.fetch_offset, - "Fetch from broker %" PRId32 - " reached end of " - "partition at offset %" PRId64 - " (HighwaterMark %" PRId64 ")", - rd_kafka_broker_id(rkb), - rktp->rktp_offsets.fetch_offset, - HighwaterMarkOffset); + rd_kafka_consumer_err( + rktp->rktp_fetchq, rd_kafka_broker_id(rkb), err, + tver->version, NULL, rktp, + rktp->rktp_offsets.fetch_pos.offset, + "Fetch from broker %" PRId32 + " reached end of " + "partition at offset %" PRId64 + " (HighwaterMark %" PRId64 ")", + rd_kafka_broker_id(rkb), + rktp->rktp_offsets.fetch_pos.offset, + HighwaterMarkOffset); break; case RD_KAFKA_RESP_ERR_MSG_SIZE_TOO_LARGE: @@ -277,9 +329,12 @@ static void rd_kafka_fetch_reply_handle_partition_error( rd_dassert(tver->version > 0); rd_kafka_consumer_err( rktp->rktp_fetchq, rd_kafka_broker_id(rkb), err, - tver->version, NULL, rktp, rktp->rktp_offsets.fetch_offset, - "Fetch from broker %" PRId32 " failed: %s", - rd_kafka_broker_id(rkb), rd_kafka_err2str(err)); + tver->version, NULL, rktp, + rktp->rktp_offsets.fetch_pos.offset, + "Fetch from broker %" PRId32 " failed at %s: %s", + rd_kafka_broker_id(rkb), + rd_kafka_fetch_pos2str(rktp->rktp_offsets.fetch_pos), + rd_kafka_err2str(err)); break; } @@ -287,20 +342,173 @@ static void rd_kafka_fetch_reply_handle_partition_error( rd_kafka_toppar_fetch_backoff(rkb, rktp, err); } +static void rd_kafkap_Fetch_reply_tags_set_topic_cnt( + rd_kafkap_Fetch_reply_tags_t *reply_tags, + int32_t TopicCnt) { + reply_tags->TopicCnt = TopicCnt; + rd_dassert(!reply_tags->Topics); + reply_tags->Topics = rd_calloc(TopicCnt, sizeof(*reply_tags->Topics)); +} +static void +rd_kafkap_Fetch_reply_tags_set_topic(rd_kafkap_Fetch_reply_tags_t *reply_tags, + int TopicIdx, + rd_kafka_Uuid_t TopicId, + int32_t PartitionCnt) { + reply_tags->Topics[TopicIdx].TopicId = TopicId; + reply_tags->Topics[TopicIdx].PartitionCnt = PartitionCnt; + rd_dassert(!reply_tags->Topics[TopicIdx].Partitions); + reply_tags->Topics[TopicIdx].Partitions = rd_calloc( + PartitionCnt, sizeof(*reply_tags->Topics[TopicIdx].Partitions)); +} + + +static void +rd_kafkap_Fetch_reply_tags_destroy(rd_kafkap_Fetch_reply_tags_t *reply_tags) { + int i; + for (i = 0; i < reply_tags->TopicCnt; i++) { + RD_IF_FREE(reply_tags->Topics[i].Partitions, rd_free); + } + RD_IF_FREE(reply_tags->Topics, rd_free); + RD_IF_FREE(reply_tags->NodeEndpoints.NodeEndpoints, rd_free); +} + +static int rd_kafkap_Fetch_reply_tags_partition_parse( + rd_kafka_buf_t *rkbuf, + uint64_t tagtype, + uint64_t taglen, + rd_kafkap_Fetch_reply_tags_Topic_t *TopicTags, + rd_kafkap_Fetch_reply_tags_Partition_t *PartitionTags) { + switch (tagtype) { + case 1: /* CurrentLeader */ + if (rd_kafka_buf_read_CurrentLeader( + rkbuf, &PartitionTags->CurrentLeader) == -1) + goto err_parse; + TopicTags->partitions_with_leader_change_cnt++; + return 1; + default: + return 0; + } +err_parse: + return -1; +} + +static int +rd_kafkap_Fetch_reply_tags_parse(rd_kafka_buf_t *rkbuf, + uint64_t tagtype, + uint64_t taglen, + rd_kafkap_Fetch_reply_tags_t *tags) { + switch (tagtype) { + case 0: /* NodeEndpoints */ + if (rd_kafka_buf_read_NodeEndpoints(rkbuf, + &tags->NodeEndpoints) == -1) + goto err_parse; + return 1; + default: + return 0; + } +err_parse: + return -1; +} + +static void +rd_kafka_handle_Fetch_metadata_update(rd_kafka_broker_t *rkb, + rd_kafkap_Fetch_reply_tags_t *FetchTags) { + if (FetchTags->topics_with_leader_change_cnt && + 
FetchTags->NodeEndpoints.NodeEndpoints) { + rd_kafka_metadata_t *md = NULL; + rd_kafka_metadata_internal_t *mdi = NULL; + rd_tmpabuf_t tbuf; + int32_t nodeid; + rd_kafka_op_t *rko; + int i, changed_topic, changed_partition; + + rd_kafka_broker_lock(rkb); + nodeid = rkb->rkb_nodeid; + rd_kafka_broker_unlock(rkb); + + rd_tmpabuf_new(&tbuf, 0, rd_true /*assert on fail*/); + rd_tmpabuf_add_alloc(&tbuf, sizeof(*mdi)); + rd_kafkap_leader_discovery_tmpabuf_add_alloc_brokers( + &tbuf, &FetchTags->NodeEndpoints); + rd_kafkap_leader_discovery_tmpabuf_add_alloc_topics( + &tbuf, FetchTags->topics_with_leader_change_cnt); + for (i = 0; i < FetchTags->TopicCnt; i++) { + if (!FetchTags->Topics[i] + .partitions_with_leader_change_cnt) + continue; + rd_kafkap_leader_discovery_tmpabuf_add_alloc_topic( + &tbuf, NULL, + FetchTags->Topics[i] + .partitions_with_leader_change_cnt); + } + rd_tmpabuf_finalize(&tbuf); + + mdi = rd_tmpabuf_alloc(&tbuf, sizeof(*mdi)); + md = &mdi->metadata; + + rd_kafkap_leader_discovery_metadata_init(mdi, nodeid); + + rd_kafkap_leader_discovery_set_brokers( + &tbuf, mdi, &FetchTags->NodeEndpoints); + + rd_kafkap_leader_discovery_set_topic_cnt( + &tbuf, mdi, FetchTags->topics_with_leader_change_cnt); + + changed_topic = 0; + for (i = 0; i < FetchTags->TopicCnt; i++) { + int j; + if (!FetchTags->Topics[i] + .partitions_with_leader_change_cnt) + continue; + + rd_kafkap_leader_discovery_set_topic( + &tbuf, mdi, changed_topic, + FetchTags->Topics[i].TopicId, NULL, + FetchTags->Topics[i] + .partitions_with_leader_change_cnt); + + changed_partition = 0; + for (j = 0; j < FetchTags->Topics[i].PartitionCnt; + j++) { + if (FetchTags->Topics[i] + .Partitions[j] + .CurrentLeader.LeaderId < 0) + continue; + + rd_kafkap_Fetch_reply_tags_Partition_t + *Partition = + &FetchTags->Topics[i].Partitions[j]; + rd_kafkap_leader_discovery_set_CurrentLeader( + &tbuf, mdi, changed_topic, + changed_partition, Partition->Partition, + &Partition->CurrentLeader); + changed_partition++; + } + changed_topic++; + } + + rko = rd_kafka_op_new(RD_KAFKA_OP_METADATA_UPDATE); + rko->rko_u.metadata.md = md; + rko->rko_u.metadata.mdi = mdi; + rd_kafka_q_enq(rkb->rkb_rk->rk_ops, rko); + } +} /** * @brief Per-partition FetchResponse parsing and handling. * * @returns an error on buffer parse failure, else RD_KAFKA_RESP_ERR_NO_ERROR. 
*/ -static rd_kafka_resp_err_t -rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, - const rd_kafkap_str_t *topic, - rd_kafka_topic_t *rkt /*possibly NULL*/, - rd_kafka_buf_t *rkbuf, - rd_kafka_buf_t *request, - int16_t ErrorCode) { +static rd_kafka_resp_err_t rd_kafka_fetch_reply_handle_partition( + rd_kafka_broker_t *rkb, + const rd_kafkap_str_t *topic, + rd_kafka_topic_t *rkt /*possibly NULL*/, + rd_kafka_buf_t *rkbuf, + rd_kafka_buf_t *request, + int16_t ErrorCode, + rd_kafkap_Fetch_reply_tags_Topic_t *TopicTags, + rd_kafkap_Fetch_reply_tags_Partition_t *PartitionTags) { const int log_decode_errors = LOG_ERR; struct rd_kafka_toppar_ver *tver, tver_skel; rd_kafka_toppar_t *rktp = NULL; @@ -321,6 +529,8 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, rd_kafka_buf_read_i32(rkbuf, &hdr.Partition); rd_kafka_buf_read_i16(rkbuf, &hdr.ErrorCode); + if (PartitionTags) + PartitionTags->Partition = hdr.Partition; if (ErrorCode) hdr.ErrorCode = ErrorCode; rd_kafka_buf_read_i64(rkbuf, &hdr.HighwaterMarkOffset); @@ -331,11 +541,13 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, hdr.LogStartOffset = RD_KAFKA_OFFSET_INVALID; if (rd_kafka_buf_ApiVersion(request) >= 4) { int32_t AbortedTxnCnt; + int k; rd_kafka_buf_read_i64(rkbuf, &hdr.LastStableOffset); if (rd_kafka_buf_ApiVersion(request) >= 5) rd_kafka_buf_read_i64(rkbuf, &hdr.LogStartOffset); - rd_kafka_buf_read_i32(rkbuf, &AbortedTxnCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &AbortedTxnCnt, + RD_KAFKAP_ABORTED_TRANSACTIONS_MAX); if (rkb->rkb_rk->rk_conf.isolation_level == RD_KAFKA_READ_UNCOMMITTED) { @@ -350,9 +562,11 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, "fetch response: ignoring.", RD_KAFKAP_STR_PR(topic), hdr.Partition, AbortedTxnCnt); - - rd_kafka_buf_skip(rkbuf, - AbortedTxnCnt * (8 + 8)); + for (k = 0; k < AbortedTxnCnt; k++) { + rd_kafka_buf_skip(rkbuf, (8 + 8)); + /* AbortedTransaction tags */ + rd_kafka_buf_skip_tags(rkbuf); + } } } else { /* Older brokers may return LSO -1, @@ -361,17 +575,6 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, end_offset = hdr.LastStableOffset; if (AbortedTxnCnt > 0) { - int k; - - if (unlikely(AbortedTxnCnt > 1000000)) - rd_kafka_buf_parse_fail( - rkbuf, - "%.*s [%" PRId32 - "]: " - "invalid AbortedTxnCnt %" PRId32, - RD_KAFKAP_STR_PR(topic), - hdr.Partition, AbortedTxnCnt); - aborted_txns = rd_kafka_aborted_txns_new(AbortedTxnCnt); for (k = 0; k < AbortedTxnCnt; k++) { @@ -380,6 +583,8 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, rd_kafka_buf_read_i64(rkbuf, &PID); rd_kafka_buf_read_i64(rkbuf, &FirstOffset); + /* AbortedTransaction tags */ + rd_kafka_buf_skip_tags(rkbuf); rd_kafka_aborted_txns_add( aborted_txns, PID, FirstOffset); } @@ -392,8 +597,8 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, rd_kafka_buf_read_i32(rkbuf, &hdr.PreferredReadReplica); else hdr.PreferredReadReplica = -1; - - rd_kafka_buf_read_i32(rkbuf, &hdr.MessageSetSize); + /* Compact Records Array */ + rd_kafka_buf_read_arraycnt(rkbuf, &hdr.MessageSetSize, -1); if (unlikely(hdr.MessageSetSize < 0)) rd_kafka_buf_parse_fail( @@ -416,9 +621,7 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, hdr.ErrorCode, RD_KAFKAP_STR_PR(topic), hdr.Partition); rd_kafka_buf_skip(rkbuf, hdr.MessageSetSize); - if (aborted_txns) - rd_kafka_aborted_txns_destroy(aborted_txns); - return RD_KAFKA_RESP_ERR_NO_ERROR; + goto done; } rd_kafka_toppar_lock(rktp); @@ -446,11 +649,7 @@ 
rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, rktp->rktp_partition, hdr.MessageSetSize); rd_kafka_buf_skip(rkbuf, hdr.MessageSetSize); } - - if (aborted_txns) - rd_kafka_aborted_txns_destroy(aborted_txns); - rd_kafka_toppar_destroy(rktp); /* from get */ - return RD_KAFKA_RESP_ERR_NO_ERROR; + goto done; } rd_kafka_toppar_lock(rktp); @@ -464,11 +663,8 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, "]: partition broker has changed: " "discarding fetch response", RD_KAFKAP_STR_PR(topic), hdr.Partition); - rd_kafka_toppar_destroy(rktp); /* from get */ rd_kafka_buf_skip(rkbuf, hdr.MessageSetSize); - if (aborted_txns) - rd_kafka_aborted_txns_destroy(aborted_txns); - return RD_KAFKA_RESP_ERR_NO_ERROR; + goto done; } fetch_version = rktp->rktp_fetch_version; @@ -481,7 +677,7 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, * desynchronized clusters): if so ignore it. */ tver_skel.rktp = rktp; tver = rd_list_find(request->rkbuf_rktp_vers, &tver_skel, - rd_kafka_toppar_ver_cmp); + rd_kafka_toppar_ver_cmp); rd_kafka_assert(NULL, tver); if (tver->rktp != rktp || tver->version < fetch_version) { rd_rkb_dbg(rkb, MSG, "DROP", @@ -491,11 +687,8 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, tver->version, fetch_version); rd_atomic64_add(&rktp->rktp_c.rx_ver_drops, 1); - rd_kafka_toppar_destroy(rktp); /* from get */ rd_kafka_buf_skip(rkbuf, hdr.MessageSetSize); - if (aborted_txns) - rd_kafka_aborted_txns_destroy(aborted_txns); - return RD_KAFKA_RESP_ERR_NO_ERROR; + goto done; } rd_rkb_dbg(rkb, MSG, "FETCH", @@ -508,10 +701,10 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, /* If this is the last message of the queue, * signal EOF back to the application. */ - if (end_offset == rktp->rktp_offsets.fetch_offset && - rktp->rktp_offsets.eof_offset != rktp->rktp_offsets.fetch_offset) { + if (end_offset == rktp->rktp_offsets.fetch_pos.offset && + rktp->rktp_offsets.eof_offset != end_offset) { hdr.ErrorCode = RD_KAFKA_RESP_ERR__PARTITION_EOF; - rktp->rktp_offsets.eof_offset = rktp->rktp_offsets.fetch_offset; + rktp->rktp_offsets.eof_offset = end_offset; } if (unlikely(hdr.ErrorCode != RD_KAFKA_RESP_ERR_NO_ERROR)) { @@ -519,24 +712,15 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, rd_kafka_fetch_reply_handle_partition_error( rkb, rktp, tver, hdr.ErrorCode, hdr.HighwaterMarkOffset); - rd_kafka_toppar_destroy(rktp); /* from get()*/ - rd_kafka_buf_skip(rkbuf, hdr.MessageSetSize); - - if (aborted_txns) - rd_kafka_aborted_txns_destroy(aborted_txns); - return RD_KAFKA_RESP_ERR_NO_ERROR; + goto done; } /* No error, clear any previous fetch error. 
*/ rktp->rktp_last_error = RD_KAFKA_RESP_ERR_NO_ERROR; - if (unlikely(hdr.MessageSetSize <= 0)) { - rd_kafka_toppar_destroy(rktp); /*from get()*/ - if (aborted_txns) - rd_kafka_aborted_txns_destroy(aborted_txns); - return RD_KAFKA_RESP_ERR_NO_ERROR; - } + if (unlikely(hdr.MessageSetSize <= 0)) + goto done; /** * Parse MessageSet @@ -548,8 +732,6 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, /* Parse messages */ err = rd_kafka_msgset_parse(rkbuf, request, rktp, aborted_txns, tver); - if (aborted_txns) - rd_kafka_aborted_txns_destroy(aborted_txns); rd_slice_widen(&rkbuf->rkbuf_reader, &save_slice); /* Continue with next partition regardless of @@ -559,15 +741,31 @@ rd_kafka_fetch_reply_handle_partition(rd_kafka_broker_t *rkb, if (unlikely(err)) rd_kafka_toppar_fetch_backoff(rkb, rktp, err); - rd_kafka_toppar_destroy(rktp); /*from get()*/ - - return RD_KAFKA_RESP_ERR_NO_ERROR; + goto done; err_parse: + if (aborted_txns) + rd_kafka_aborted_txns_destroy(aborted_txns); if (rktp) rd_kafka_toppar_destroy(rktp); /*from get()*/ - return rkbuf->rkbuf_err; + +done: + if (aborted_txns) + rd_kafka_aborted_txns_destroy(aborted_txns); + if (likely(rktp != NULL)) + rd_kafka_toppar_destroy(rktp); /*from get()*/ + + if (PartitionTags) { + /* Set default LeaderId and LeaderEpoch */ + PartitionTags->CurrentLeader.LeaderId = -1; + PartitionTags->CurrentLeader.LeaderEpoch = -1; + } + rd_kafka_buf_read_tags(rkbuf, + rd_kafkap_Fetch_reply_tags_partition_parse, + TopicTags, PartitionTags); + + return RD_KAFKA_RESP_ERR_NO_ERROR; } /** @@ -580,9 +778,11 @@ rd_kafka_fetch_reply_handle(rd_kafka_broker_t *rkb, rd_kafka_buf_t *request) { int32_t TopicArrayCnt; int i; - const int log_decode_errors = LOG_ERR; - rd_kafka_topic_t *rkt = NULL; - int16_t ErrorCode = RD_KAFKA_RESP_ERR_NO_ERROR; + const int log_decode_errors = LOG_ERR; + rd_kafka_topic_t *rkt = NULL; + int16_t ErrorCode = RD_KAFKA_RESP_ERR_NO_ERROR; + rd_kafkap_Fetch_reply_tags_t FetchTags = RD_ZERO_INIT; + rd_bool_t has_fetch_tags = rd_false; if (rd_kafka_buf_ApiVersion(request) >= 1) { int32_t Throttle_Time; @@ -598,35 +798,69 @@ rd_kafka_fetch_reply_handle(rd_kafka_broker_t *rkb, rd_kafka_buf_read_i32(rkbuf, &SessionId); } - rd_kafka_buf_read_i32(rkbuf, &TopicArrayCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &TopicArrayCnt, RD_KAFKAP_TOPICS_MAX); /* Verify that TopicArrayCnt seems to be in line with remaining size */ rd_kafka_buf_check_len(rkbuf, TopicArrayCnt * (3 /*topic min size*/ + 4 /*PartitionArrayCnt*/ + 4 + 2 + 8 + 4 /*inner header*/)); + if (rd_kafka_buf_ApiVersion(request) >= 12) { + has_fetch_tags = rd_true; + rd_kafkap_Fetch_reply_tags_set_topic_cnt(&FetchTags, + TopicArrayCnt); + } + for (i = 0; i < TopicArrayCnt; i++) { - rd_kafkap_str_t topic; + rd_kafkap_str_t topic = RD_ZERO_INIT; + rd_kafka_Uuid_t topic_id = RD_KAFKA_UUID_ZERO; int32_t PartitionArrayCnt; int j; - rd_kafka_buf_read_str(rkbuf, &topic); - rd_kafka_buf_read_i32(rkbuf, &PartitionArrayCnt); + if (rd_kafka_buf_ApiVersion(request) > 12) { + rd_kafka_buf_read_uuid(rkbuf, &topic_id); + rkt = rd_kafka_topic_find_by_topic_id(rkb->rkb_rk, + topic_id); + if (rkt) + topic = *rkt->rkt_topic; + } else { + rd_kafka_buf_read_str(rkbuf, &topic); + rkt = rd_kafka_topic_find0(rkb->rkb_rk, &topic); + } - rkt = rd_kafka_topic_find0(rkb->rkb_rk, &topic); + rd_kafka_buf_read_arraycnt(rkbuf, &PartitionArrayCnt, + RD_KAFKAP_PARTITIONS_MAX); + if (rd_kafka_buf_ApiVersion(request) >= 12) { + rd_kafkap_Fetch_reply_tags_set_topic( + &FetchTags, i, topic_id, PartitionArrayCnt); + } for (j = 
0; j < PartitionArrayCnt; j++) { if (rd_kafka_fetch_reply_handle_partition( - rkb, &topic, rkt, rkbuf, request, ErrorCode) + rkb, &topic, rkt, rkbuf, request, ErrorCode, + has_fetch_tags ? &FetchTags.Topics[i] : NULL, + has_fetch_tags + ? &FetchTags.Topics[i].Partitions[j] + : NULL)) goto err_parse; } + if (has_fetch_tags && + FetchTags.Topics[i].partitions_with_leader_change_cnt) { + FetchTags.topics_with_leader_change_cnt++; + } if (rkt) { rd_kafka_topic_destroy0(rkt); rkt = NULL; } + /* Topic Tags */ + rd_kafka_buf_skip_tags(rkbuf); } + /* Top level tags */ + rd_kafka_buf_read_tags(rkbuf, rd_kafkap_Fetch_reply_tags_parse, + &FetchTags); + if (rd_kafka_buf_read_remain(rkbuf) != 0) { rd_kafka_buf_parse_fail(rkbuf, "Remaining data after message set " @@ -634,12 +868,15 @@ rd_kafka_fetch_reply_handle(rd_kafka_broker_t *rkb, rd_kafka_buf_read_remain(rkbuf)); RD_NOTREACHED(); } + rd_kafka_handle_Fetch_metadata_update(rkb, &FetchTags); + rd_kafkap_Fetch_reply_tags_destroy(&FetchTags); return 0; err_parse: if (rkt) rd_kafka_topic_destroy0(rkt); + rd_kafkap_Fetch_reply_tags_destroy(&FetchTags); rd_rkb_dbg(rkb, MSG, "BADMSG", "Bad message (Fetch v%d): " "is broker.version.fallback incorrectly set?", @@ -649,6 +886,11 @@ err_parse: +/** + * @brief FetchResponse handling. + * + * @locality broker thread (or any thread if err == __DESTROY). + */ static void rd_kafka_broker_fetch_reply(rd_kafka_t *rk, rd_kafka_broker_t *rkb, rd_kafka_resp_err_t err, @@ -677,6 +919,7 @@ static void rd_kafka_broker_fetch_reply(rd_kafka_t *rk, case RD_KAFKA_RESP_ERR_NOT_LEADER_FOR_PARTITION: case RD_KAFKA_RESP_ERR_BROKER_NOT_AVAILABLE: case RD_KAFKA_RESP_ERR_REPLICA_NOT_AVAILABLE: + case RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_ID: /* Request metadata information update */ rd_snprintf(tmp, sizeof(tmp), "FetchRequest failed: %s", rd_kafka_err2str(err)); @@ -700,7 +943,21 @@ static void rd_kafka_broker_fetch_reply(rd_kafka_t *rk, } } +/** + * @brief Check whether all active toppars have a non-zero topic id, i.e. + * whether topic ids can be used in Fetch requests.
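 *
 * Illustrative use, mirroring the caller in rd_kafka_broker_fetch_toppars()
 * below (editorial sketch, values hypothetical). When some topic id is
 * still unknown the request must fall back to topic names:
 * @code
 * ApiVersion = 16;
 * if (ApiVersion > 12 && !can_use_topic_ids(rkb))
 *         ApiVersion = 12;
 * @endcode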
+ * + */ +static rd_bool_t can_use_topic_ids(rd_kafka_broker_t *rkb) { + rd_kafka_toppar_t *rktp = rkb->rkb_active_toppar_next; + do { + if (RD_KAFKA_UUID_IS_ZERO(rktp->rktp_rkt->rkt_topic_id)) + return rd_false; + } while ((rktp = CIRCLEQ_LOOP_NEXT(&rkb->rkb_active_toppars, rktp, + rktp_activelink)) != + rkb->rkb_active_toppar_next); + return rd_true; +} /** * @brief Build and send a Fetch request message for all underflowed toppars @@ -734,21 +991,28 @@ int rd_kafka_broker_fetch_toppars(rd_kafka_broker_t *rkb, rd_ts_t now) { if (unlikely(rkb->rkb_active_toppar_cnt == 0)) return 0; - rkbuf = rd_kafka_buf_new_request( + ApiVersion = rd_kafka_broker_ApiVersion_supported(rkb, RD_KAFKAP_Fetch, + 0, 16, NULL); + + /* Fall back to version 12 if any topic id is null (zero), which can + * happen if inter.broker.protocol.version is < 2.8 */ + if (ApiVersion > 12 && !can_use_topic_ids(rkb)) + ApiVersion = 12; + + rkbuf = rd_kafka_buf_new_flexver_request( rkb, RD_KAFKAP_Fetch, 1, - /* ReplicaId+MaxWaitTime+MinBytes+MaxBytes+IsolationLevel+ + /* MaxWaitTime+MinBytes+MaxBytes+IsolationLevel+ * SessionId+Epoch+TopicCnt */ - 4 + 4 + 4 + 4 + 1 + 4 + 4 + 4 + + 4 + 4 + 4 + 1 + 4 + 4 + 4 + /* N x PartCnt+Partition+CurrentLeaderEpoch+FetchOffset+ - * LogStartOffset+MaxBytes+?TopicNameLen?*/ - (rkb->rkb_active_toppar_cnt * (4 + 4 + 4 + 8 + 8 + 4 + 40)) + + * LastFetchedEpoch+LogStartOffset+MaxBytes+?TopicNameLen?*/ + (rkb->rkb_active_toppar_cnt * + (4 + 4 + 4 + 8 + 4 + 8 + 4 + 40)) + /* ForgottenTopicsCnt */ 4 + /* N x ForgottenTopicsData */ - 0); - - ApiVersion = rd_kafka_broker_ApiVersion_supported(rkb, RD_KAFKAP_Fetch, - 0, 11, NULL); + 0, + ApiVersion >= 12); if (rkb->rkb_features & RD_KAFKA_FEATURE_MSGVER2) rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, @@ -762,8 +1026,10 @@ int rd_kafka_broker_fetch_toppars(rd_kafka_broker_t *rkb, rd_ts_t now) { /* FetchRequest header */ - /* ReplicaId */ - rd_kafka_buf_write_i32(rkbuf, -1); + if (rd_kafka_buf_ApiVersion(rkbuf) <= 14) + /* ReplicaId */ + rd_kafka_buf_write_i32(rkbuf, -1); + /* MaxWaitTime */ rd_kafka_buf_write_i32(rkbuf, rkb->rkb_rk->rk_conf.fetch_wait_max_ms); /* MinBytes */ @@ -787,7 +1053,7 @@ int rd_kafka_broker_fetch_toppars(rd_kafka_broker_t *rkb, rd_ts_t now) { } /* Write zero TopicArrayCnt but store pointer for later update */ - of_TopicArrayCnt = rd_kafka_buf_write_i32(rkbuf, 0); + of_TopicArrayCnt = rd_kafka_buf_write_arraycnt_pos(rkbuf); /* Prepare map for storing the fetch version for each partition, * this will later be checked in Fetch response to purge outdated @@ -806,19 +1072,31 @@ int rd_kafka_broker_fetch_toppars(rd_kafka_broker_t *rkb, rd_ts_t now) { if (rkt_last != rktp->rktp_rkt) { if (rkt_last != NULL) { /* Update PartitionArrayCnt */ - rd_kafka_buf_update_i32(rkbuf, - of_PartitionArrayCnt, - PartitionArrayCnt); + rd_kafka_buf_finalize_arraycnt( + rkbuf, of_PartitionArrayCnt, + PartitionArrayCnt); + /* Topic tags */ + rd_kafka_buf_write_tags_empty(rkbuf); + } + if (rd_kafka_buf_ApiVersion(rkbuf) > 12) { + /* Topic id must be non-zero here */ + rd_dassert(!RD_KAFKA_UUID_IS_ZERO( + rktp->rktp_rkt->rkt_topic_id)); + /* Topic ID */ + rd_kafka_buf_write_uuid( + rkbuf, &rktp->rktp_rkt->rkt_topic_id); + } else { + /* Topic name */ + rd_kafka_buf_write_kstr( + rkbuf, rktp->rktp_rkt->rkt_topic); } - /* Topic name */ - rd_kafka_buf_write_kstr(rkbuf, - rktp->rktp_rkt->rkt_topic); TopicArrayCnt++; rkt_last = rktp->rktp_rkt; /* Partition count */ - of_PartitionArrayCnt = rd_kafka_buf_write_i32(rkbuf, 0); - PartitionArrayCnt = 0; + of_PartitionArrayCnt = + 
rd_kafka_buf_write_arraycnt_pos(rkbuf); + PartitionArrayCnt = 0; } PartitionArrayCnt++; @@ -826,13 +1104,30 @@ int rd_kafka_broker_fetch_toppars(rd_kafka_broker_t *rkb, rd_ts_t now) { /* Partition */ rd_kafka_buf_write_i32(rkbuf, rktp->rktp_partition); - if (rd_kafka_buf_ApiVersion(rkbuf) >= 9) + if (rd_kafka_buf_ApiVersion(rkbuf) >= 9) { /* CurrentLeaderEpoch */ - rd_kafka_buf_write_i32(rkbuf, -1); - + if (rktp->rktp_leader_epoch < 0 && + rd_kafka_has_reliable_leader_epochs(rkb)) { + /* If current leader epoch is set to -1 and + * the broker has reliable leader epochs, + * send 0 instead, so that epoch is checked + * and optionally metadata is refreshed. + * This can happen if metadata is read initially + * without an existing topic (see + * rd_kafka_topic_metadata_update2). + */ + rd_kafka_buf_write_i32(rkbuf, 0); + } else { + rd_kafka_buf_write_i32(rkbuf, + rktp->rktp_leader_epoch); + } + } /* FetchOffset */ - rd_kafka_buf_write_i64(rkbuf, rktp->rktp_offsets.fetch_offset); - + rd_kafka_buf_write_i64(rkbuf, + rktp->rktp_offsets.fetch_pos.offset); + if (rd_kafka_buf_ApiVersion(rkbuf) >= 12) + /* LastFetchedEpoch - only used by follower replica */ + rd_kafka_buf_write_i32(rkbuf, -1); if (rd_kafka_buf_ApiVersion(rkbuf) >= 5) /* LogStartOffset - only used by follower replica */ rd_kafka_buf_write_i64(rkbuf, -1); @@ -840,16 +1135,21 @@ int rd_kafka_broker_fetch_toppars(rd_kafka_broker_t *rkb, rd_ts_t now) { /* MaxBytes */ rd_kafka_buf_write_i32(rkbuf, rktp->rktp_fetch_msg_max_bytes); + /* Partition tags */ + rd_kafka_buf_write_tags_empty(rkbuf); + rd_rkb_dbg(rkb, FETCH, "FETCH", "Fetch topic %.*s [%" PRId32 "] at offset %" PRId64 - " (v%d)", + " (leader epoch %" PRId32 + ", current leader epoch %" PRId32 ", v%d)", RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), rktp->rktp_partition, - rktp->rktp_offsets.fetch_offset, - rktp->rktp_fetch_version); + rktp->rktp_offsets.fetch_pos.offset, + rktp->rktp_offsets.fetch_pos.leader_epoch, + rktp->rktp_leader_epoch, rktp->rktp_fetch_version); /* We must have a valid fetch offset when we get here */ - rd_dassert(rktp->rktp_offsets.fetch_offset >= 0); + rd_dassert(rktp->rktp_offsets.fetch_pos.offset >= 0); /* Add toppar + op version mapping. */ tver = rd_list_add(rkbuf->rkbuf_rktp_vers, NULL); @@ -876,18 +1176,20 @@ int rd_kafka_broker_fetch_toppars(rd_kafka_broker_t *rkb, rd_ts_t now) { if (rkt_last != NULL) { /* Update last topic's PartitionArrayCnt */ - rd_kafka_buf_update_i32(rkbuf, of_PartitionArrayCnt, - PartitionArrayCnt); + rd_kafka_buf_finalize_arraycnt(rkbuf, of_PartitionArrayCnt, + PartitionArrayCnt); + /* Topic tags */ + rd_kafka_buf_write_tags_empty(rkbuf); } /* Update TopicArrayCnt */ - rd_kafka_buf_update_i32(rkbuf, of_TopicArrayCnt, TopicArrayCnt); + rd_kafka_buf_finalize_arraycnt(rkbuf, of_TopicArrayCnt, TopicArrayCnt); if (rd_kafka_buf_ApiVersion(rkbuf) >= 7) /* Length of the ForgottenTopics list (KIP-227). Broker * use only - not used by the consumer. */ - rd_kafka_buf_write_i32(rkbuf, 0); + rd_kafka_buf_write_arraycnt(rkbuf, 0); if (rd_kafka_buf_ApiVersion(rkbuf) >= 11) /* RackId */ @@ -913,7 +1215,45 @@ int rd_kafka_broker_fetch_toppars(rd_kafka_broker_t *rkb, rd_ts_t now) { return cnt; } +/** + * @brief Decide whether fetching should start from the next fetch start + * or continue from the current fetch pos. + * + * @param rktp The toppar + * + * @returns rd_true if fetching should start from the next fetch start, + * rd_false otherwise.
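 *
 * Illustrative cases (editorial sketch, hypothetical values):
 * @code
 * op_version 5 > fetch_version 4              -> restart (version barrier)
 * next_fetch_start != last_next_fetch_start   -> restart (e.g. offset.reset)
 * fetch_pos.offset == RD_KAFKA_OFFSET_INVALID -> restart (no position yet)
 * otherwise                                   -> continue from fetch_pos
 * @endcode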
+ * + * @locality any + * @locks toppar_lock() MUST be held + */ +static rd_bool_t rd_kafka_toppar_fetch_decide_start_from_next_fetch_start( + rd_kafka_toppar_t *rktp) { + return rktp->rktp_op_version > rktp->rktp_fetch_version || + rd_kafka_fetch_pos_cmp(&rktp->rktp_next_fetch_start, + &rktp->rktp_last_next_fetch_start) || + rktp->rktp_offsets.fetch_pos.offset == RD_KAFKA_OFFSET_INVALID; +} +/** + * @brief Return the position to start fetching from: the next fetch start + * if fetching should restart from there, else the current fetch pos. + * + * @param rktp The toppar + * + * @returns Next fetch start position + * + * @locality any + * @locks toppar_lock() MUST be held + */ +rd_kafka_fetch_pos_t +rd_kafka_toppar_fetch_decide_next_fetch_start_pos(rd_kafka_toppar_t *rktp) { + if (rd_kafka_toppar_fetch_decide_start_from_next_fetch_start(rktp)) + return rktp->rktp_next_fetch_start; + else + return rktp->rktp_offsets.fetch_pos; +} /** * @brief Decide whether this toppar should be on the fetch list or not. @@ -943,7 +1283,7 @@ rd_ts_t rd_kafka_toppar_fetch_decide(rd_kafka_toppar_t *rktp, rd_interval(&rktp->rktp_lease_intvl, 5 * 60 * 1000 * 1000 /*5 minutes*/, 0) > 0; if (lease_expired) { - /* delete_to_leader() requires no locks to be held */ + /* delegate_to_leader() requires no locks to be held */ rd_kafka_toppar_unlock(rktp); rd_kafka_toppar_delegate_to_leader(rktp); rd_kafka_toppar_lock(rktp); @@ -975,9 +1315,7 @@ rd_ts_t rd_kafka_toppar_fetch_decide(rd_kafka_toppar_t *rktp, /* Update broker thread's fetch op version */ version = rktp->rktp_op_version; - if (version > rktp->rktp_fetch_version || - rktp->rktp_next_offset != rktp->rktp_last_next_offset || - rktp->rktp_offsets.fetch_offset == RD_KAFKA_OFFSET_INVALID) { + if (rd_kafka_toppar_fetch_decide_start_from_next_fetch_start(rktp)) { /* New version barrier, something was modified from the * control plane. Reset and start over. * Alternatively only the next_offset changed but not the * barrier, which is the case when automatically triggering * offset.reset (such as on PARTITION_EOF or * OFFSET_OUT_OF_RANGE).
*/ - rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "FETCHDEC", - "Topic %s [%" PRId32 - "]: fetch decide: " - "updating to version %d (was %d) at " - "offset %" PRId64 " (was %" PRId64 ")", - rktp->rktp_rkt->rkt_topic->str, - rktp->rktp_partition, version, - rktp->rktp_fetch_version, rktp->rktp_next_offset, - rktp->rktp_offsets.fetch_offset); + rd_kafka_dbg( + rktp->rktp_rkt->rkt_rk, TOPIC, "FETCHDEC", + "Topic %s [%" PRId32 + "]: fetch decide: " + "updating to version %d (was %d) at %s " + "(was %s)", + rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, + version, rktp->rktp_fetch_version, + rd_kafka_fetch_pos2str(rktp->rktp_next_fetch_start), + rd_kafka_fetch_pos2str(rktp->rktp_offsets.fetch_pos)); rd_kafka_offset_stats_reset(&rktp->rktp_offsets); /* New start offset */ - rktp->rktp_offsets.fetch_offset = rktp->rktp_next_offset; - rktp->rktp_last_next_offset = rktp->rktp_next_offset; + rktp->rktp_offsets.fetch_pos = rktp->rktp_next_fetch_start; + rktp->rktp_last_next_fetch_start = rktp->rktp_next_fetch_start; rktp->rktp_fetch_version = version; @@ -1016,25 +1355,28 @@ rd_ts_t rd_kafka_toppar_fetch_decide(rd_kafka_toppar_t *rktp, should_fetch = 0; reason = "paused"; - } else if (RD_KAFKA_OFFSET_IS_LOGICAL(rktp->rktp_next_offset)) { + } else if (RD_KAFKA_OFFSET_IS_LOGICAL( + rktp->rktp_next_fetch_start.offset)) { should_fetch = 0; reason = "no concrete offset"; - + } else if (rktp->rktp_ts_fetch_backoff > rd_clock()) { + reason = "fetch backed off"; + ts_backoff = rktp->rktp_ts_fetch_backoff; + should_fetch = 0; } else if (rd_kafka_q_len(rktp->rktp_fetchq) >= rkb->rkb_rk->rk_conf.queued_min_msgs) { /* Skip toppars who's local message queue is already above * the lower threshold. */ - reason = "queued.min.messages exceeded"; + reason = "queued.min.messages exceeded"; + ts_backoff = rd_kafka_toppar_fetch_backoff( + rkb, rktp, RD_KAFKA_RESP_ERR__QUEUE_FULL); should_fetch = 0; } else if ((int64_t)rd_kafka_q_size(rktp->rktp_fetchq) >= rkb->rkb_rk->rk_conf.queued_max_msg_bytes) { - reason = "queued.max.messages.kbytes exceeded"; - should_fetch = 0; - - } else if (rktp->rktp_ts_fetch_backoff > rd_clock()) { - reason = "fetch backed off"; - ts_backoff = rktp->rktp_ts_fetch_backoff; + reason = "queued.max.messages.kbytes exceeded"; + ts_backoff = rd_kafka_toppar_fetch_backoff( + rkb, rktp, RD_KAFKA_RESP_ERR__QUEUE_FULL); should_fetch = 0; } @@ -1046,13 +1388,13 @@ done: rd_rkb_dbg( rkb, FETCH, "FETCH", "Topic %s [%" PRId32 - "] in state %s at offset %s " + "] in state %s at %s " "(%d/%d msgs, %" PRId64 "/%d kb queued, " "opv %" PRId32 ") is %s%s", rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, rd_kafka_fetch_states[rktp->rktp_fetch_state], - rd_kafka_offset2str(rktp->rktp_next_offset), + rd_kafka_fetch_pos2str(rktp->rktp_next_fetch_start), rd_kafka_q_len(rktp->rktp_fetchq), rkb->rkb_rk->rk_conf.queued_min_msgs, rd_kafka_q_size(rktp->rktp_fetchq) / 1024, diff --git a/src/third_party/librdkafka/dist/src/rdkafka_fetcher.h b/src/third_party/librdkafka/dist/src/rdkafka_fetcher.h index 0e3af82bb24..e304f1369f6 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_fetcher.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_fetcher.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2022 Magnus Edenhill + * Copyright (c) 2022, Magnus Edenhill * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -33,6 +33,9 @@ int rd_kafka_broker_fetch_toppars(rd_kafka_broker_t *rkb, rd_ts_t now); +rd_kafka_fetch_pos_t +rd_kafka_toppar_fetch_decide_next_fetch_start_pos(rd_kafka_toppar_t *rktp); + rd_ts_t rd_kafka_toppar_fetch_decide(rd_kafka_toppar_t *rktp, rd_kafka_broker_t *rkb, int force_remove); diff --git a/src/third_party/librdkafka/dist/src/rdkafka_header.c b/src/third_party/librdkafka/dist/src/rdkafka_header.c index 98359b424c9..eb3024c51ed 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_header.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_header.c @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_header.h b/src/third_party/librdkafka/dist/src/rdkafka_header.h index bd6b0e9593a..6d6747ea669 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_header.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_header.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_idempotence.c b/src/third_party/librdkafka/dist/src/rdkafka_idempotence.c index 3245e856ed1..1c189f5c872 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_idempotence.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_idempotence.c @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_idempotence.h b/src/third_party/librdkafka/dist/src/rdkafka_idempotence.h index 5be8d606d5c..87de3b97a01 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_idempotence.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_idempotence.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_int.h b/src/third_party/librdkafka/dist/src/rdkafka_int.h index 0b9939128eb..386de857dc2 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_int.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_int.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2013, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -78,10 +79,34 @@ struct rd_kafka_topic_s; struct rd_kafka_msg_s; struct rd_kafka_broker_s; struct rd_kafka_toppar_s; - +typedef struct rd_kafka_metadata_internal_s rd_kafka_metadata_internal_t; +typedef struct rd_kafka_toppar_s rd_kafka_toppar_t; typedef struct rd_kafka_lwtopic_s rd_kafka_lwtopic_t; +/** + * Protocol level sanity + */ +#define RD_KAFKAP_BROKERS_MAX 10000 +#define RD_KAFKAP_TOPICS_MAX 1000000 +#define RD_KAFKAP_PARTITIONS_MAX 100000 + + +#define RD_KAFKA_OFFSET_IS_LOGICAL(OFF) ((OFF) < 0) + + +/** + * @struct Represents a fetch position: + * an offset and a partition leader epoch (if known, else -1). + */ +typedef struct rd_kafka_fetch_pos_s { + int64_t offset; + int32_t leader_epoch; + rd_bool_t validated; +} rd_kafka_fetch_pos_t; + + + #include "rdkafka_op.h" #include "rdkafka_queue.h" #include "rdkafka_msg.h" @@ -102,11 +127,12 @@ typedef struct rd_kafka_lwtopic_s rd_kafka_lwtopic_t; /** * Protocol level sanity */ -#define RD_KAFKAP_BROKERS_MAX 10000 -#define RD_KAFKAP_TOPICS_MAX 1000000 -#define RD_KAFKAP_PARTITIONS_MAX 100000 -#define RD_KAFKAP_GROUPS_MAX 100000 - +#define RD_KAFKAP_BROKERS_MAX 10000 +#define RD_KAFKAP_TOPICS_MAX 1000000 +#define RD_KAFKAP_PARTITIONS_MAX 100000 +#define RD_KAFKAP_GROUPS_MAX 100000 +#define RD_KAFKAP_CONFIGS_MAX 10000 +#define RD_KAFKAP_ABORTED_TRANSACTIONS_MAX 1000000 #define RD_KAFKA_OFFSET_IS_LOGICAL(OFF) ((OFF) < 0) @@ -208,8 +234,51 @@ rd_kafka_txn_state2str(rd_kafka_txn_state_t state) { return names[state]; } +/** + * @enum Telemetry States + */ +typedef enum { + /** Initial state, awaiting telemetry broker to be assigned */ + RD_KAFKA_TELEMETRY_AWAIT_BROKER, + /** Telemetry broker assigned and GetSubscriptions scheduled */ + RD_KAFKA_TELEMETRY_GET_SUBSCRIPTIONS_SCHEDULED, + /** GetSubscriptions request sent to the assigned broker */ + RD_KAFKA_TELEMETRY_GET_SUBSCRIPTIONS_SENT, + /** PushTelemetry scheduled to send */ + RD_KAFKA_TELEMETRY_PUSH_SCHEDULED, + /** PushTelemetry sent to the assigned broker */ + RD_KAFKA_TELEMETRY_PUSH_SENT, + /** Client is being terminated and last PushTelemetry is scheduled to + * send */ + RD_KAFKA_TELEMETRY_TERMINATING_PUSH_SCHEDULED, + /** Client is being terminated and last PushTelemetry is sent */ + RD_KAFKA_TELEMETRY_TERMINATING_PUSH_SENT, + /** Telemetry is terminated */ + RD_KAFKA_TELEMETRY_TERMINATED, +} rd_kafka_telemetry_state_t; +static RD_UNUSED const char * +rd_kafka_telemetry_state2str(rd_kafka_telemetry_state_t state) { + static const char *names[] = {"AwaitBroker", + "GetSubscriptionsScheduled", + "GetSubscriptionsSent", + "PushScheduled", + "PushSent", + "TerminatingPushScheduled", + "TerminatingPushSent", + "Terminated"}; + return names[state]; +} + +static RD_UNUSED const char *rd_kafka_type2str(rd_kafka_type_t type) { + static const char *types[] = { + [RD_KAFKA_PRODUCER] = "producer", + [RD_KAFKA_CONSUMER] = "consumer", + }; + return types[type]; +} + /** * Kafka handle, internal representation of the application's rd_kafka_t. */ @@ -221,17 +290,25 @@ struct rd_kafka_s { TAILQ_HEAD(, rd_kafka_broker_s) rk_brokers; rd_list_t rk_broker_by_id; /* Fast id lookups. */ rd_atomic32_t rk_broker_cnt; - /**< Number of brokers in state >= UP */ - rd_atomic32_t rk_broker_up_cnt; - /**< Number of logical brokers in state >= UP, this is a sub-set - * of rk_broker_up_cnt.
*/ - rd_atomic32_t rk_logical_broker_up_cnt; - /**< Number of brokers that are down, only includes brokers - * that have had at least one connection attempt. */ - rd_atomic32_t rk_broker_down_cnt; - /**< Logical brokers currently without an address. + /** Logical brokers count. * Used for calculating ERR__ALL_BROKERS_DOWN. */ - rd_atomic32_t rk_broker_addrless_cnt; + rd_atomic32_t rk_logical_broker_cnt; + /** Number of configured or learned brokers in state >= UP */ + rd_atomic32_t rk_broker_up_cnt; + /** Number of brokers that are down; only includes brokers + * that have had at least one connection attempt + * and are configured or learned. */ + rd_atomic32_t rk_broker_down_cnt; + + /**< Additional bootstrap servers list. + * Contains all brokers added through rd_kafka_brokers_add(). + * Doesn't contain the initially configured bootstrap brokers. */ + rd_list_t additional_brokerlists; + + /** Decommissioned threads to await */ + rd_list_t wait_decommissioned_thrds; + /** Decommissioned brokers to await */ + rd_list_t wait_decommissioned_brokers; mtx_t rk_internal_rkb_lock; rd_kafka_broker_t *rk_internal_rkb; @@ -311,7 +388,11 @@ struct rd_kafka_s { * (or equivalent). * Used to enforce * max.poll.interval.ms. - * Only relevant for consumer. */ + * Set to INT64_MAX while polling + * to avoid reaching + * max.poll.interval.ms during that time + * frame. Only relevant for consumer. */ + /* First fatal error. */ struct { rd_atomic32_t err; /**< rd_kafka_resp_err_t */ @@ -327,17 +408,20 @@ struct rd_kafka_s { rd_ts_t rk_ts_metadata; /* Timestamp of most recent * metadata. */ - struct rd_kafka_metadata *rk_full_metadata; /* Last full metadata. */ - rd_ts_t rk_ts_full_metadata; /* Timesstamp of .. */ + rd_ts_t rk_ts_full_metadata; /* Timestamp of most + * recent full + * metadata */ struct rd_kafka_metadata_cache rk_metadata_cache; /* Metadata cache */ char *rk_clusterid; /* ClusterId from metadata */ int32_t rk_controllerid; /* ControllerId from metadata */ /**< Producer: Delivery report mode */ - enum { RD_KAFKA_DR_MODE_NONE, /**< No delivery reports */ RD_KAFKA_DR_MODE_CB, /**< Delivery reports through callback */ RD_KAFKA_DR_MODE_EVENT, /**< Delivery reports through event API*/ + enum { + RD_KAFKA_DR_MODE_NONE, /**< No delivery reports */ + RD_KAFKA_DR_MODE_CB, /**< Delivery reports through callback */ + RD_KAFKA_DR_MODE_EVENT, /**< Delivery reports through event + API*/ } rk_drmode; /* Simple consumer count: @@ -525,6 +609,16 @@ struct rd_kafka_s { } rk_curr_msgs; rd_kafka_timers_t rk_timers; + + /** Metadata refresh timer */ + rd_kafka_timer_t metadata_refresh_tmr; + /** 1s interval timer */ + rd_kafka_timer_t one_s_tmr; + /** Rebootstrap timer. + * Will add bootstrap brokers again + * when it's fired. */ + rd_kafka_timer_t rebootstrap_tmr; + thrd_t rk_thread; int rk_initialized; /**< Will be > 0 when the rd_kafka_t @@ -569,6 +663,12 @@ struct rd_kafka_s { * Use 10 < reconnect.backoff.jitter.ms / 2 < 1000. */ rd_interval_t sparse_connect_random; + + /** Sparse connection timer: fires after remaining time of + * `sparse_connect_random` interval + 1ms. + */ + rd_kafka_timer_t sparse_connect_random_tmr; + /**< Lock for sparse_connect_random */ mtx_t sparse_connect_lock; @@ -592,6 +692,64 @@ struct rd_kafka_s { rd_kafka_q_t *callback_q; /**< SASL callback queue, if any. */ } rk_sasl; + struct { + /* Fields for the control flow - unless guarded by lock, only + * accessed from main thread. */ + /**< Current state of the telemetry state machine.
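 *
 * Expected happy-path flow, sketched from the enum documentation above
 * (editorial illustration, not part of the upstream patch):
 * @code
 * AWAIT_BROKER -> GET_SUBSCRIPTIONS_SCHEDULED -> GET_SUBSCRIPTIONS_SENT
 *   -> PUSH_SCHEDULED -> PUSH_SENT -> (repeat pushes) ->
 *   TERMINATING_PUSH_SCHEDULED -> TERMINATING_PUSH_SENT -> TERMINATED
 * @endcode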
*/ + rd_kafka_telemetry_state_t state; + /**< Preferred broker for sending telemetry (Lock protected). */ + rd_kafka_broker_t *preferred_broker; + /**< Timer for all the requests we schedule. */ + rd_kafka_timer_t request_timer; + /**< Lock for preferred telemetry broker and state. */ + mtx_t lock; + /**< Used to wait for termination (Lock protected). */ + cnd_t termination_cnd; + + /* Fields obtained from broker as a result of GetSubscriptions - + * only accessed from main thread. + */ + rd_kafka_Uuid_t client_instance_id; + int32_t subscription_id; + rd_kafka_compression_t *accepted_compression_types; + size_t accepted_compression_types_cnt; + int32_t push_interval_ms; + int32_t telemetry_max_bytes; + rd_bool_t delta_temporality; + char **requested_metrics; + size_t requested_metrics_cnt; + /* TODO: Use rd_list_t to store the metrics */ + int *matched_metrics; + size_t matched_metrics_cnt; + + struct { + rd_ts_t ts_last; /**< Timestamp of last push */ + rd_ts_t ts_start; /**< Timestamp from when collection + * started */ + /** Total rebalance latency (ms) up to previous push */ + uint64_t rebalance_latency_total; + } rk_historic_c; + + struct { + rd_avg_t rk_avg_poll_idle_ratio; + rd_avg_t rk_avg_commit_latency; /**< Current commit + * latency avg */ + rd_avg_t + rk_avg_rebalance_latency; /**< Current rebalance + * latency avg */ + } rd_avg_current; + + struct { + rd_avg_t rk_avg_poll_idle_ratio; + rd_avg_t rk_avg_commit_latency; /**< Rolled over commit + * latency avg */ + rd_avg_t + rk_avg_rebalance_latency; /**< Rolled over rebalance + * latency avg */ + } rd_avg_rollover; + + } rk_telemetry; + /* Test mocks */ struct { rd_kafka_mock_cluster_t *cluster; /**< Mock cluster, created @@ -736,15 +894,13 @@ rd_kafka_curr_msgs_wait_zero(rd_kafka_t *rk, int timeout_ms, unsigned int *curr_msgsp) { unsigned int cnt; - struct timespec tspec; - - rd_timeout_init_timespec(&tspec, timeout_ms); + rd_ts_t abs_timeout = rd_timeout_init(timeout_ms); mtx_lock(&rk->rk_curr_msgs.lock); while ((cnt = rk->rk_curr_msgs.cnt) > 0) { if (cnd_timedwait_abs(&rk->rk_curr_msgs.cnd, &rk->rk_curr_msgs.lock, - &tspec) == thrd_timedout) + abs_timeout) == thrd_timedout) break; } mtx_unlock(&rk->rk_curr_msgs.lock); @@ -753,6 +909,9 @@ rd_kafka_curr_msgs_wait_zero(rd_kafka_t *rk, return cnt == 0; } +void rd_kafka_decommissioned_broker_thread_join(rd_kafka_t *rk, + void *rkb_decommissioned); + void rd_kafka_destroy_final(rd_kafka_t *rk); void rd_kafka_global_init(void); @@ -833,9 +992,12 @@ const char *rd_kafka_purge_flags2str(int flags); #define RD_KAFKA_DBG_MOCK 0x10000 #define RD_KAFKA_DBG_ASSIGNOR 0x20000 #define RD_KAFKA_DBG_CONF 0x40000 +#define RD_KAFKA_DBG_TELEMETRY 0x80000 #define RD_KAFKA_DBG_ALL 0xfffff #define RD_KAFKA_DBG_NONE 0x0 +/* Jitter Percent for exponential retry backoff */ +#define RD_KAFKA_RETRY_JITTER_PERCENT 20 void rd_kafka_log0(const rd_kafka_conf_t *conf, const rd_kafka_t *rk, @@ -850,9 +1012,14 @@ void rd_kafka_log0(const rd_kafka_conf_t *conf, rd_kafka_log0(&rk->rk_conf, rk, NULL, level, RD_KAFKA_DBG_NONE, fac, \ __VA_ARGS__) +#define rd_kafka_conf_is_dbg(conf, ctx) \ + unlikely((conf).debug &(RD_KAFKA_DBG_##ctx)) + +#define rd_kafka_is_dbg(rk, ctx) (rd_kafka_conf_is_dbg(rk->rk_conf, ctx)) + #define rd_kafka_dbg(rk, ctx, fac, ...) 
\ do { \ - if (unlikely((rk)->rk_conf.debug & (RD_KAFKA_DBG_##ctx))) \ + if (rd_kafka_is_dbg(rk, ctx)) \ rd_kafka_log0(&rk->rk_conf, rk, NULL, LOG_DEBUG, \ (RD_KAFKA_DBG_##ctx), fac, __VA_ARGS__); \ } while (0) @@ -860,7 +1027,7 @@ void rd_kafka_log0(const rd_kafka_conf_t *conf, /* dbg() not requiring an rk, just the conf object, for early logging */ #define rd_kafka_dbg0(conf, ctx, fac, ...) \ do { \ - if (unlikely((conf)->debug & (RD_KAFKA_DBG_##ctx))) \ + if (rd_kafka_conf_is_dbg(*conf, ctx)) \ rd_kafka_log0(conf, NULL, NULL, LOG_DEBUG, \ (RD_KAFKA_DBG_##ctx), fac, __VA_ARGS__); \ } while (0) @@ -880,10 +1047,11 @@ void rd_kafka_log0(const rd_kafka_conf_t *conf, #define rd_rkb_log(rkb, level, fac, ...) \ rd_rkb_log0(rkb, level, RD_KAFKA_DBG_NONE, fac, __VA_ARGS__) +#define rd_rkb_is_dbg(rkb, ctx) rd_kafka_is_dbg((rkb)->rkb_rk, ctx) + #define rd_rkb_dbg(rkb, ctx, fac, ...) \ do { \ - if (unlikely((rkb)->rkb_rk->rk_conf.debug & \ - (RD_KAFKA_DBG_##ctx))) { \ + if (rd_rkb_is_dbg(rkb, ctx)) { \ rd_rkb_log0(rkb, LOG_DEBUG, (RD_KAFKA_DBG_##ctx), fac, \ __VA_ARGS__); \ } \ @@ -920,14 +1088,25 @@ int rd_kafka_set_fatal_error0(rd_kafka_t *rk, rd_kafka_error_t *rd_kafka_get_fatal_error(rd_kafka_t *rk); +#define rd_kafka_producer_can_have_fatal_errors(rk) \ + (rk->rk_type == RD_KAFKA_PRODUCER && rk->rk_conf.eos.idempotence) + +#define rd_kafka_consumer_can_have_fatal_errors(rk) \ + (rk->rk_type == RD_KAFKA_CONSUMER && \ + (rk->rk_conf.group_instance_id || \ + rk->rk_conf.group_protocol == RD_KAFKA_GROUP_PROTOCOL_CONSUMER)) + static RD_INLINE RD_UNUSED rd_kafka_resp_err_t rd_kafka_fatal_error_code(rd_kafka_t *rk) { /* This is an optimization to avoid an atomic read, which is costly * on some platforms: - * Fatal errors are currently only raised by the idempotent producer - * and static consumers (group.instance.id). */ - if ((rk->rk_type == RD_KAFKA_PRODUCER && rk->rk_conf.eos.idempotence) || - (rk->rk_type == RD_KAFKA_CONSUMER && rk->rk_conf.group_instance_id)) + * Fatal errors are currently raised by: + * 1) the idempotent producer + * 2) static consumers (group.instance.id) + * 3) groups using the consumer protocol (introduced in KIP-848). See exact + * errors in rd_kafka_cgrp_handle_ConsumerGroupHeartbeat() */ + if (rd_kafka_producer_can_have_fatal_errors(rk) || + rd_kafka_consumer_can_have_fatal_errors(rk)) return rd_atomic32_get(&rk->rk_fatal.err); return RD_KAFKA_RESP_ERR_NO_ERROR; @@ -971,7 +1150,7 @@ static RD_INLINE RD_UNUSED int rd_kafka_max_poll_exceeded(rd_kafka_t *rk) { last_poll = rd_atomic64_get(&rk->rk_ts_last_poll); /* Application is blocked in librdkafka function, see - * rd_kafka_app_poll_blocking(). */ + * rd_kafka_app_poll_start().
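 *
 * Illustrative timeline (editorial sketch; passing 0 for `now` lets the
 * callee take rd_clock() itself):
 * @code
 * rd_kafka_app_poll_start(rk, rkq, 0, rd_true); // rk_ts_last_poll = INT64_MAX
 * // ... application blocked inside poll ...
 * rd_kafka_app_polled(rk, rkq);                 // rk_ts_last_poll = now
 * @endcode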
*/ if (last_poll == INT64_MAX) return 0; @@ -997,9 +1176,32 @@ static RD_INLINE RD_UNUSED int rd_kafka_max_poll_exceeded(rd_kafka_t *rk) { * @locality any * @locks none */ -static RD_INLINE RD_UNUSED void rd_kafka_app_poll_blocking(rd_kafka_t *rk) { - if (rk->rk_type == RD_KAFKA_CONSUMER) +static RD_INLINE RD_UNUSED void rd_kafka_app_poll_start(rd_kafka_t *rk, + rd_kafka_q_t *rkq, + rd_ts_t now, + rd_bool_t is_blocking) { + if (rk->rk_type != RD_KAFKA_CONSUMER) + return; + + if (!now) + now = rd_clock(); + if (is_blocking) rd_atomic64_set(&rk->rk_ts_last_poll, INT64_MAX); + if (rkq->rkq_ts_last_poll_end) { + int64_t poll_idle_ratio = 0; + rd_ts_t poll_interval = now - rkq->rkq_ts_last_poll_start; + if (poll_interval) { + rd_ts_t idle_interval = rkq->rkq_ts_last_poll_end - + rkq->rkq_ts_last_poll_start; + poll_idle_ratio = + idle_interval * 1000000 / poll_interval; + } + rd_avg_add( + &rk->rk_telemetry.rd_avg_current.rk_avg_poll_idle_ratio, + poll_idle_ratio); + rkq->rkq_ts_last_poll_start = now; + rkq->rkq_ts_last_poll_end = 0; + } } /** @@ -1010,9 +1212,25 @@ static RD_INLINE RD_UNUSED void rd_kafka_app_poll_blocking(rd_kafka_t *rk) { * @locality any * @locks none */ -static RD_INLINE RD_UNUSED void rd_kafka_app_polled(rd_kafka_t *rk) { - if (rk->rk_type == RD_KAFKA_CONSUMER) - rd_atomic64_set(&rk->rk_ts_last_poll, rd_clock()); +static RD_INLINE RD_UNUSED void rd_kafka_app_polled(rd_kafka_t *rk, + rd_kafka_q_t *rkq) { + if (rk->rk_type == RD_KAFKA_CONSUMER) { + rd_ts_t now = rd_clock(); + rd_atomic64_set(&rk->rk_ts_last_poll, now); + if (unlikely(rk->rk_cgrp && + rk->rk_cgrp->rkcg_group_protocol == + RD_KAFKA_GROUP_PROTOCOL_CONSUMER && + rk->rk_cgrp->rkcg_flags & + RD_KAFKA_CGRP_F_MAX_POLL_EXCEEDED)) { + rd_kafka_cgrp_consumer_expedite_next_heartbeat( + rk->rk_cgrp, + "app polled after poll interval exceeded"); + } + if (!rkq->rkq_ts_last_poll_end) + rkq->rkq_ts_last_poll_end = now; + rd_dassert(rkq->rkq_ts_last_poll_end >= + rkq->rkq_ts_last_poll_start); + } } @@ -1027,5 +1245,8 @@ rd_kafka_resp_err_t rd_kafka_background_thread_create(rd_kafka_t *rk, char *errstr, size_t errstr_size); +void rd_kafka_rebootstrap(rd_kafka_t *rk); + +void rd_kafka_rebootstrap_tmr_restart(rd_kafka_t *rk); #endif /* _RDKAFKA_INT_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_interceptor.c b/src/third_party/librdkafka/dist/src/rdkafka_interceptor.c index c962d2d99e7..b5bacece3cb 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_interceptor.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_interceptor.c @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_interceptor.h b/src/third_party/librdkafka/dist/src/rdkafka_interceptor.h index 85f061ba914..d9aa4153262 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_interceptor.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_interceptor.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_lz4.c b/src/third_party/librdkafka/dist/src/rdkafka_lz4.c index dcb415b2c7c..152c3d3aac3 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_lz4.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_lz4.c @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_lz4.h b/src/third_party/librdkafka/dist/src/rdkafka_lz4.h index e148b40c152..8d996a58871 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_lz4.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_lz4.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_metadata.c b/src/third_party/librdkafka/dist/src/rdkafka_metadata.c index e647afe5fcc..77c8befd42b 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_metadata.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_metadata.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2013, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,6 +39,49 @@ #include <string.h> #include <stdarg.h> +/** + * @brief Id comparator for rd_kafka_metadata_broker_internal_t + */ +int rd_kafka_metadata_broker_internal_cmp(const void *_a, const void *_b) { + const rd_kafka_metadata_broker_internal_t *a = _a; + const rd_kafka_metadata_broker_internal_t *b = _b; + return RD_CMP(a->id, b->id); +} + + +/** + * @brief Id comparator for struct rd_kafka_metadata_broker* + */ +int rd_kafka_metadata_broker_cmp(const void *_a, const void *_b) { + const struct rd_kafka_metadata_broker *a = _a; + const struct rd_kafka_metadata_broker *b = _b; + return RD_CMP(a->id, b->id); } + + +/** + * @brief Id comparator for rd_kafka_metadata_partition_internal_t + */ +static int rd_kafka_metadata_partition_internal_cmp(const void *_a, + const void *_b) { + const rd_kafka_metadata_partition_internal_t *a = _a; + const rd_kafka_metadata_partition_internal_t *b = _b; + return RD_CMP(a->id, b->id); +} + +/** + * @brief Helper function to clear a rd_kafka_metadata_partition. + * + * @note Does not deallocate the rd_kafka_metadata_partition itself. + * @note Should not be used if there is a metadata struct allocated with + * tmpabuf in which rd_kafka_metadata_partition is contained.
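 *
 * Hypothetical usage sketch (editorial, not part of the upstream patch;
 * `partition_copy` is an assumed caller-owned struct whose isrs/replicas
 * arrays were individually allocated):
 * @code
 * struct rd_kafka_metadata_partition partition_copy = some_copy;
 * rd_kafka_metadata_partition_clear(&partition_copy); // frees arrays only
 * @endcode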
+ */ +void rd_kafka_metadata_partition_clear( + struct rd_kafka_metadata_partition *rkmp) { + RD_IF_FREE(rkmp->isrs, rd_free); + RD_IF_FREE(rkmp->replicas, rd_free); +} + rd_kafka_resp_err_t rd_kafka_metadata(rd_kafka_t *rk, @@ -48,73 +92,91 @@ rd_kafka_metadata(rd_kafka_t *rk, rd_kafka_q_t *rkq; rd_kafka_broker_t *rkb; rd_kafka_op_t *rko; + rd_kafka_resp_err_t err; rd_ts_t ts_end = rd_timeout_init(timeout_ms); rd_list_t topics; rd_bool_t allow_auto_create_topics = rk->rk_conf.allow_auto_create_topics; - /* Query any broker that is up, and if none are up pick the first one, - * if we're lucky it will be up before the timeout */ - rkb = rd_kafka_broker_any_usable(rk, timeout_ms, RD_DO_LOCK, 0, - "application metadata request"); - if (!rkb) - return RD_KAFKA_RESP_ERR__TRANSPORT; + do { + /* Query any broker that is up, and if none are up pick the + * first one, if we're lucky it will be up before the timeout. + * Previous decommissioning brokers won't be returned by the + * function after receiving the _DESTROY_BROKER error + * below. */ + rkb = + rd_kafka_broker_any_usable(rk, timeout_ms, RD_DO_LOCK, 0, + "application metadata request"); + if (!rkb) + return RD_KAFKA_RESP_ERR__TRANSPORT; - rkq = rd_kafka_q_new(rk); + rkq = rd_kafka_q_new(rk); - rd_list_init(&topics, 0, rd_free); - if (!all_topics) { - if (only_rkt) - rd_list_add(&topics, + rd_list_init(&topics, 0, rd_free); + if (!all_topics) { + if (only_rkt) + rd_list_add( + &topics, rd_strdup(rd_kafka_topic_name(only_rkt))); - else { - int cache_cnt; - rd_kafka_local_topics_to_list(rkb->rkb_rk, &topics, - &cache_cnt); - /* Don't trigger auto-create for cached topics */ - if (rd_list_cnt(&topics) == cache_cnt) - allow_auto_create_topics = rd_true; + else { + int cache_cnt; + rd_kafka_local_topics_to_list( + rkb->rkb_rk, &topics, &cache_cnt); + /* Don't trigger auto-create + * for cached topics */ + if (rd_list_cnt(&topics) == cache_cnt) + allow_auto_create_topics = rd_true; + } } - } - /* Async: request metadata */ - rko = rd_kafka_op_new(RD_KAFKA_OP_METADATA); - rd_kafka_op_set_replyq(rko, rkq, 0); - rko->rko_u.metadata.force = 1; /* Force metadata request regardless - * of outstanding metadata requests. */ - rd_kafka_MetadataRequest(rkb, &topics, "application requested", - allow_auto_create_topics, - /* cgrp_update: - * Only update consumer group state - * on response if this lists all - * topics in the cluster, since a - * partial request may make it seem - * like some subscribed topics are missing. */ - all_topics ? rd_true : rd_false, rko); + /* Async: request metadata */ + rko = rd_kafka_op_new(RD_KAFKA_OP_METADATA); + rd_kafka_op_set_replyq(rko, rkq, 0); + rko->rko_u.metadata.force = + 1; /* Force metadata request regardless + * of outstanding metadata requests. */ + rd_kafka_MetadataRequest( + rkb, &topics, NULL, "application requested", + allow_auto_create_topics, + /* cgrp_update: + * Only update consumer group state + * on response if this lists all + * topics in the cluster, since a + * partial request may make it seem + * like some subscribed topics are missing. */ + all_topics ? 
rd_true : rd_false, + -1 /* same subscription version */, + rd_false /* force_racks */, rko); - rd_list_destroy(&topics); - rd_kafka_broker_destroy(rkb); + rd_list_destroy(&topics); + rd_kafka_broker_destroy(rkb); - /* Wait for reply (or timeout) */ - rko = rd_kafka_q_pop(rkq, rd_timeout_remains_us(ts_end), 0); + /* Wait for reply (or timeout) */ + rko = rd_kafka_q_pop(rkq, rd_timeout_remains_us(ts_end), 0); - rd_kafka_q_destroy_owner(rkq); + rd_kafka_q_destroy_owner(rkq); - /* Timeout */ - if (!rko) - return RD_KAFKA_RESP_ERR__TIMED_OUT; + /* Timeout */ + if (!rko) + return RD_KAFKA_RESP_ERR__TIMED_OUT; - /* Error */ - if (rko->rko_err) { - rd_kafka_resp_err_t err = rko->rko_err; - rd_kafka_op_destroy(rko); - return err; - } + /* Error */ + err = rko->rko_err; + if (err) { + rd_kafka_op_destroy(rko); + if (err != RD_KAFKA_RESP_ERR__DESTROY_BROKER) + return err; + } + + /* In case selected broker was decommissioned, + * try again with a different broker. */ + } while (err == RD_KAFKA_RESP_ERR__DESTROY_BROKER); /* Reply: pass metadata pointer to application who now owns it*/ rd_kafka_assert(rk, rko->rko_u.metadata.md); - *metadatap = rko->rko_u.metadata.md; - rko->rko_u.metadata.md = NULL; + *metadatap = rko->rko_u.metadata.md; + rko->rko_u.metadata.md = NULL; + rko->rko_u.metadata.mdi = NULL; rd_kafka_op_destroy(rko); return RD_KAFKA_RESP_ERR_NO_ERROR; @@ -127,12 +189,13 @@ void rd_kafka_metadata_destroy(const struct rd_kafka_metadata *metadata) { } -/** - * @returns a newly allocated copy of metadata \p src of size \p size - */ -struct rd_kafka_metadata * -rd_kafka_metadata_copy(const struct rd_kafka_metadata *src, size_t size) { +static rd_kafka_metadata_internal_t *rd_kafka_metadata_copy_internal( + const rd_kafka_metadata_internal_t *src_internal, + size_t size, + rd_bool_t populate_racks) { struct rd_kafka_metadata *md; + rd_kafka_metadata_internal_t *mdi; + const struct rd_kafka_metadata *src = &src_internal->metadata; rd_tmpabuf_t tbuf; int i; @@ -142,24 +205,39 @@ rd_kafka_metadata_copy(const struct rd_kafka_metadata *src, size_t size) { * Because of this we copy all the structs verbatim but * any pointer fields needs to be copied explicitly to update * the pointer address. 
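 *
 * Pattern sketch (editorial illustration using the helpers visible below):
 * @code
 * md = rd_tmpabuf_write(&tbuf, src, sizeof(*md));        // bytes verbatim
 * md->brokers[i].host =
 *     rd_tmpabuf_write_str(&tbuf, src->brokers[i].host); // re-point string
 * @endcode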
*/ - rd_tmpabuf_new(&tbuf, size, 1 /*assert on fail*/); - md = rd_tmpabuf_write(&tbuf, src, sizeof(*md)); + rd_tmpabuf_new(&tbuf, size, rd_true /*assert on fail*/); + rd_tmpabuf_finalize(&tbuf); + mdi = rd_tmpabuf_write(&tbuf, src, sizeof(*mdi)); + md = &mdi->metadata; rd_tmpabuf_write_str(&tbuf, src->orig_broker_name); /* Copy Brokers */ md->brokers = rd_tmpabuf_write(&tbuf, src->brokers, - md->broker_cnt * sizeof(*md->brokers)); + src->broker_cnt * sizeof(*src->brokers)); + /* Copy internal Brokers */ + mdi->brokers = + rd_tmpabuf_write(&tbuf, src_internal->brokers, + src->broker_cnt * sizeof(*src_internal->brokers)); - for (i = 0; i < md->broker_cnt; i++) + for (i = 0; i < md->broker_cnt; i++) { md->brokers[i].host = rd_tmpabuf_write_str(&tbuf, src->brokers[i].host); + if (src_internal->brokers[i].rack_id) { + mdi->brokers[i].rack_id = rd_tmpabuf_write_str( + &tbuf, src_internal->brokers[i].rack_id); + } + } /* Copy TopicMetadata */ md->topics = rd_tmpabuf_write(&tbuf, src->topics, md->topic_cnt * sizeof(*md->topics)); + /* Copy internal TopicMetadata */ + mdi->topics = + rd_tmpabuf_write(&tbuf, src_internal->topics, + md->topic_cnt * sizeof(*src_internal->topics)); for (i = 0; i < md->topic_cnt; i++) { int j; @@ -173,8 +251,17 @@ rd_kafka_metadata_copy(const struct rd_kafka_metadata *src, size_t size) { rd_tmpabuf_write(&tbuf, src->topics[i].partitions, md->topics[i].partition_cnt * sizeof(*md->topics[i].partitions)); + /* Copy internal partitions */ + mdi->topics[i].partitions = rd_tmpabuf_write( + &tbuf, src_internal->topics[i].partitions, + md->topics[i].partition_cnt * + sizeof(*src_internal->topics[i].partitions)); for (j = 0; j < md->topics[i].partition_cnt; j++) { + int k; + char *rack; + rd_list_t *curr_list; + /* Copy replicas and ISRs */ md->topics[i].partitions[j].replicas = rd_tmpabuf_write( &tbuf, src->topics[i].partitions[j].replicas, @@ -185,6 +272,59 @@ rd_kafka_metadata_copy(const struct rd_kafka_metadata *src, size_t size) { &tbuf, src->topics[i].partitions[j].isrs, md->topics[i].partitions[j].isr_cnt * sizeof(*md->topics[i].partitions[j].isrs)); + + mdi->topics[i].partitions[j].racks_cnt = 0; + mdi->topics[i].partitions[j].racks = NULL; + + /* Iterate through replicas and populate racks, if + * needed. */ + if (!populate_racks) + continue; + + /* This is quite possibly a recomputation, because we've + * already done this for the src_internal. However, + * since the racks need to point inside the tmpabuf, we + * make this calculation again. Since this is done only + * in the case of a full metadata refresh, this will be + * fairly rare. */ + curr_list = rd_list_new(0, NULL); + for (k = 0; k < md->topics[i].partitions[j].replica_cnt; + k++) { + rd_kafka_metadata_broker_internal_t key = { + .id = md->topics[i] + .partitions[j] + .replicas[k]}; + rd_kafka_metadata_broker_internal_t *found = + bsearch( + &key, mdi->brokers, md->broker_cnt, + sizeof( + rd_kafka_metadata_broker_internal_t), + rd_kafka_metadata_broker_internal_cmp); + if (!found || !found->rack_id) + continue; + rd_list_add(curr_list, found->rack_id); + } + + if (!rd_list_cnt(curr_list)) { + rd_list_destroy(curr_list); + continue; + } + + rd_list_deduplicate(&curr_list, rd_strcmp2); + + mdi->topics[i].partitions[j].racks_cnt = + rd_list_cnt(curr_list); + mdi->topics[i].partitions[j].racks = rd_tmpabuf_alloc( + &tbuf, sizeof(char *) * rd_list_cnt(curr_list)); + RD_LIST_FOREACH(rack, curr_list, k) { + /* We don't copy here; `rack` points to memory + * inside `mdi` already, and it's allocated + * within a tmpabuf.
So, the lifetime of + * mdi->topics[i].partitions[j].racks[k] is the + * same as the lifetime of the outer `mdi`. */ + mdi->topics[i].partitions[j].racks[k] = rack; + } + rd_list_destroy(curr_list); } } @@ -192,51 +332,274 @@ rd_kafka_metadata_copy(const struct rd_kafka_metadata *src, size_t size) { if (rd_tmpabuf_failed(&tbuf)) rd_kafka_assert(NULL, !*"metadata copy failed"); - /* Delibarely not destroying the tmpabuf since we return + /* Deliberately not destroying the tmpabuf since we return * its allocated memory. */ - return md; + return mdi; } +/** + * @returns a newly allocated copy of metadata \p src of size \p size + */ +rd_kafka_metadata_internal_t * +rd_kafka_metadata_copy(const rd_kafka_metadata_internal_t *src_internal, + size_t size) { + return rd_kafka_metadata_copy_internal(src_internal, size, rd_false); +} + /** - * @brief Handle a Metadata response message. - * - * @param topics are the requested topics (may be NULL) - * - * The metadata will be marshalled into 'struct rd_kafka_metadata*' structs. - * - * The marshalled metadata is returned in \p *mdp, (NULL on error). + * @returns a newly allocated copy of metadata \p src of size \p size, with + * partition racks included. + */ +rd_kafka_metadata_internal_t *rd_kafka_metadata_copy_add_racks( + const rd_kafka_metadata_internal_t *src_internal, + size_t size) { + return rd_kafka_metadata_copy_internal(src_internal, size, rd_true); +} - * @returns an error code on parse failure, else NO_ERRRO. +/** + * @brief Update topic state and information based on topic metadata. + * + * @param mdt Topic metadata. + * @param mdit Topic internal metadata. + * + * @locality rdkafka main thread + * @locks_acquired rd_kafka_wrlock(rk) + */ +static void rd_kafka_parse_Metadata_update_topic( + rd_kafka_broker_t *rkb, + const rd_kafka_metadata_topic_t *mdt, + const rd_kafka_metadata_topic_internal_t *mdit) { + + rd_rkb_dbg(rkb, METADATA, "METADATA", + /* The indent below is intentional */ + " Topic %s with %i partitions%s%s", mdt->topic, + mdt->partition_cnt, mdt->err ? ": " : "", + mdt->err ? rd_kafka_err2str(mdt->err) : ""); + + /* Ignore metadata completely for temporary errors. (issue #513) + * LEADER_NOT_AVAILABLE: Broker is rebalancing + */ + if (mdt->err == RD_KAFKA_RESP_ERR_LEADER_NOT_AVAILABLE && + mdt->partition_cnt == 0) { + rd_rkb_dbg(rkb, TOPIC, "METADATA", + "Temporary error in metadata reply for " + "topic %s (PartCnt %i): %s: ignoring", + mdt->topic, mdt->partition_cnt, + rd_kafka_err2str(mdt->err)); + } else { + /* Update local topic & partition state based + * on metadata */ + rd_kafka_topic_metadata_update2(rkb, mdt, mdit); + } +} + +/** + * @brief Only brokers with Metadata version >= 9 have reliable leader + * epochs. Before that version, leader epoch must be treated + * as missing (-1). + * + * @param rkb The broker + * @return Is this a broker version with reliable leader epochs? * * @locality rdkafka main thread */ -rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, - rd_kafka_buf_t *request, - rd_kafka_buf_t *rkbuf, - struct rd_kafka_metadata **mdp) { +rd_bool_t rd_kafka_has_reliable_leader_epochs(rd_kafka_broker_t *rkb) { + int features; + int16_t ApiVersion = 0; + + ApiVersion = rd_kafka_broker_ApiVersion_supported( + rkb, RD_KAFKAP_Metadata, 0, 9, &features); + + return ApiVersion >= 9; +} + +/* Populates the topic partition to rack mapping for the the topic given by + * `topic_idx` in the `mdi`. It's assumed that the internal broker metadata is + * already populated. 
*/ +static void +rd_kafka_populate_metadata_topic_racks(rd_tmpabuf_t *tbuf, + size_t topic_idx, + rd_kafka_metadata_internal_t *mdi) { + rd_kafka_metadata_broker_internal_t *brokers_internal; + size_t broker_cnt; + int i; + rd_kafka_metadata_topic_t *mdt; + rd_kafka_metadata_topic_internal_t *mdti; + + rd_dassert(mdi->brokers); + rd_dassert(mdi->metadata.topic_cnt > (int)topic_idx); + + brokers_internal = mdi->brokers; + broker_cnt = mdi->metadata.broker_cnt; + + mdt = &mdi->metadata.topics[topic_idx]; + mdti = &mdi->topics[topic_idx]; + + for (i = 0; i < mdt->partition_cnt; i++) { + int j; + rd_kafka_metadata_partition_t *mdp = &mdt->partitions[i]; + rd_kafka_metadata_partition_internal_t *mdpi = + &mdti->partitions[i]; + + rd_list_t *curr_list; + char *rack; + + if (mdp->replica_cnt == 0) + continue; + + curr_list = + rd_list_new(0, NULL); /* use a list for de-duplication */ + for (j = 0; j < mdp->replica_cnt; j++) { + rd_kafka_metadata_broker_internal_t key = { + .id = mdp->replicas[j]}; + rd_kafka_metadata_broker_internal_t *broker = + bsearch(&key, brokers_internal, broker_cnt, + sizeof(rd_kafka_metadata_broker_internal_t), + rd_kafka_metadata_broker_internal_cmp); + if (!broker || !broker->rack_id) + continue; + rd_list_add(curr_list, broker->rack_id); + } + rd_list_deduplicate(&curr_list, rd_strcmp2); + + mdpi->racks_cnt = rd_list_cnt(curr_list); + mdpi->racks = + rd_tmpabuf_alloc(tbuf, sizeof(char *) * mdpi->racks_cnt); + RD_LIST_FOREACH(rack, curr_list, j) { + mdpi->racks[j] = rack; /* Don't copy, rack points inside + tbuf already*/ + } + rd_list_destroy(curr_list); + } +} + +/** + * @brief Decommission brokers that are not in the metadata. + */ +static void rd_kafka_metadata_decommission_unavailable_brokers( + rd_kafka_t *rk, + rd_kafka_metadata_t *md, + rd_kafka_broker_t *rkb_current) { + rd_kafka_broker_t *rkb; + rd_bool_t has_learned_brokers = rd_false; + rd_list_t brokers_to_decommission; + int i; + + rd_kafka_wrlock(rk); + TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { + if (rkb->rkb_source == RD_KAFKA_LEARNED) { + has_learned_brokers = rd_true; + break; + } + } + if (!has_learned_brokers) { + rd_kafka_wrunlock(rk); + return; + } + + rd_list_init(&brokers_to_decommission, + rd_atomic32_get(&rk->rk_broker_cnt), NULL); + TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { + rd_bool_t purge_broker; + + if (rkb->rkb_source == RD_KAFKA_LOGICAL) + continue; + + purge_broker = rd_true; + if (rkb->rkb_source == RD_KAFKA_LEARNED) { + /* Don't purge the broker if it's available in + * metadata. */ + for (i = 0; i < md->broker_cnt; i++) { + if (md->brokers[i].id == rkb->rkb_nodeid) { + purge_broker = rd_false; + break; + } + } + } + + if (!purge_broker) + continue; + + /* Don't try to decommission already decommissioning brokers + * otherwise they could be already destroyed when + * `rd_kafka_broker_decommission` is called below. */ + if (rd_list_find(&rk->wait_decommissioned_brokers, rkb, + rd_list_cmp_ptr) != NULL) + continue; + + rd_list_add(&brokers_to_decommission, rkb); + } + RD_LIST_FOREACH(rkb, &brokers_to_decommission, i) { + rd_kafka_broker_decommission(rk, rkb, + &rk->wait_decommissioned_thrds); + rd_list_add(&rk->wait_decommissioned_brokers, rkb); + } + rd_list_destroy(&brokers_to_decommission); + rd_kafka_wrunlock(rk); +} + +/* Internal implementation for parsing Metadata. 
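 *
 * Editorial note: with flexible request versions (KIP-482) array lengths
 * arrive as compact varint counts, so this parser reads them with
 * rd_kafka_buf_read_arraycnt() bounded by the RD_KAFKAP_*_MAX sanity
 * limits, e.g.:
 * @code
 * rd_kafka_buf_read_arraycnt(rkbuf, &md->broker_cnt, RD_KAFKAP_BROKERS_MAX);
 * @endcode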
*/ +static rd_kafka_resp_err_t +rd_kafka_parse_Metadata0(rd_kafka_broker_t *rkb, + rd_kafka_buf_t *request, + rd_kafka_buf_t *rkbuf, + rd_kafka_metadata_internal_t **mdip, + rd_list_t *request_topics, + const char *reason) { rd_kafka_t *rk = rkb->rkb_rk; int i, j, k; rd_tmpabuf_t tbuf; - struct rd_kafka_metadata *md; + rd_kafka_metadata_internal_t *mdi = NULL; + rd_kafka_metadata_t *md = NULL; size_t rkb_namelen; - const int log_decode_errors = LOG_ERR; - rd_list_t *missing_topics = NULL; - const rd_list_t *requested_topics = request->rkbuf_u.Metadata.topics; - rd_bool_t all_topics = request->rkbuf_u.Metadata.all_topics; - rd_bool_t cgrp_update = - request->rkbuf_u.Metadata.cgrp_update && rk->rk_cgrp; - const char *reason = request->rkbuf_u.Metadata.reason - ? request->rkbuf_u.Metadata.reason - : "(no reason)"; - int ApiVersion = request->rkbuf_reqhdr.ApiVersion; - rd_kafkap_str_t cluster_id = RD_ZERO_INIT; - int32_t controller_id = -1; - rd_kafka_resp_err_t err = RD_KAFKA_RESP_ERR_NO_ERROR; - int broker_changes = 0; - int cache_changes = 0; + const int log_decode_errors = LOG_ERR; + rd_list_t *missing_topics = NULL; + rd_list_t *missing_topic_ids = NULL; + + const rd_list_t *requested_topics = request_topics; + const rd_list_t *requested_topic_ids = NULL; + rd_bool_t all_topics = rd_false; + rd_bool_t cgrp_update = rd_false; + rd_bool_t has_reliable_leader_epochs = + rd_kafka_has_reliable_leader_epochs(rkb); + int ApiVersion = rkbuf->rkbuf_reqhdr.ApiVersion; + rd_kafkap_str_t cluster_id = RD_ZERO_INIT; + int32_t controller_id = -1; + rd_kafka_resp_err_t err = RD_KAFKA_RESP_ERR_NO_ERROR; + int broker_changes = 0; + int cache_changes = 0; + int cgrp_subscription_version = -1; + int16_t ErrorCode = 0; + + /* If client rack is present, the metadata cache (topic or full) needs + * to contain the partition to rack map. */ + rd_bool_t has_client_rack = rk->rk_conf.client_rack && + RD_KAFKAP_STR_LEN(rk->rk_conf.client_rack); + rd_bool_t compute_racks = has_client_rack; + + if (request) { + requested_topics = request->rkbuf_u.Metadata.topics; + requested_topic_ids = request->rkbuf_u.Metadata.topic_ids; + all_topics = request->rkbuf_u.Metadata.all_topics; + cgrp_update = + request->rkbuf_u.Metadata.cgrp_update && rk->rk_cgrp; + compute_racks |= request->rkbuf_u.Metadata.force_racks; + cgrp_subscription_version = + request->rkbuf_u.Metadata.cgrp_subscription_version; + } + + /* If reason is NULL, set it to a human-readable string. */ + if (!reason) + reason = "(no reason)"; + + /* Ignore metadata updates when terminating */ + if (rd_kafka_terminating(rkb->rkb_rk)) { + err = RD_KAFKA_RESP_ERR__DESTROY; + goto done; + } rd_kafka_assert(NULL, thrd_is_current(rk->rk_thread)); @@ -244,21 +607,31 @@ rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, if (requested_topics) missing_topics = rd_list_copy(requested_topics, rd_list_string_copy, NULL); + if (requested_topic_ids) + missing_topic_ids = + rd_list_copy(requested_topic_ids, rd_list_Uuid_copy, NULL); rd_kafka_broker_lock(rkb); rkb_namelen = strlen(rkb->rkb_name) + 1; /* We assume that the marshalled representation is - * no more than 4 times larger than the wire representation. */ - rd_tmpabuf_new(&tbuf, - sizeof(*md) + rkb_namelen + (rkbuf->rkbuf_totlen * 4), - 0 /*dont assert on fail*/); + * no more than 4 times larger than the wire representation. + * This is increased to 5 times in case we want to compute partition + * to rack mapping.
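 *
 * The tmpabuf is used in two phases: every allocation is declared up
 * front with rd_tmpabuf_add_alloc(), the backing buffer is sized once
 * by rd_tmpabuf_finalize(), and only then are pieces carved out with
 * rd_tmpabuf_alloc(). A minimal sketch of the same pattern (sizes are
 * illustrative):
 *
 *   rd_kafka_metadata_internal_t *mdi;
 *   char *name;
 *   rd_tmpabuf_t buf;
 *
 *   rd_tmpabuf_new(&buf, 0, rd_false);
 *   rd_tmpabuf_add_alloc(&buf, sizeof(*mdi));
 *   rd_tmpabuf_add_alloc(&buf, 64);
 *   rd_tmpabuf_finalize(&buf);
 *   mdi  = rd_tmpabuf_alloc(&buf, sizeof(*mdi));
 *   name = rd_tmpabuf_alloc(&buf, 64);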
*/ + rd_tmpabuf_new(&tbuf, 0, rd_false /*dont assert on fail*/); + rd_tmpabuf_add_alloc(&tbuf, sizeof(*mdi)); + rd_tmpabuf_add_alloc(&tbuf, rkb_namelen); + rd_tmpabuf_add_alloc(&tbuf, rkbuf->rkbuf_totlen * + (4 + (compute_racks ? 1 : 0))); - if (!(md = rd_tmpabuf_alloc(&tbuf, sizeof(*md)))) { + rd_tmpabuf_finalize(&tbuf); + + if (!(mdi = rd_tmpabuf_alloc(&tbuf, sizeof(*mdi)))) { rd_kafka_broker_unlock(rkb); err = RD_KAFKA_RESP_ERR__CRIT_SYS_RESOURCE; goto err; } + md = &mdi->metadata; md->orig_broker_id = rkb->rkb_nodeid; md->orig_broker_name = rd_tmpabuf_write(&tbuf, rkb->rkb_name, rkb_namelen); @@ -268,10 +641,8 @@ rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, rd_kafka_buf_read_throttle_time(rkbuf); /* Read Brokers */ - rd_kafka_buf_read_i32a(rkbuf, md->broker_cnt); - if (md->broker_cnt > RD_KAFKAP_BROKERS_MAX) - rd_kafka_buf_parse_fail(rkbuf, "Broker_cnt %i > BROKERS_MAX %i", - md->broker_cnt, RD_KAFKAP_BROKERS_MAX); + rd_kafka_buf_read_arraycnt(rkbuf, &md->broker_cnt, + RD_KAFKAP_BROKERS_MAX); if (!(md->brokers = rd_tmpabuf_alloc(&tbuf, md->broker_cnt * sizeof(*md->brokers)))) @@ -279,63 +650,93 @@ rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, "%d brokers: tmpabuf memory shortage", md->broker_cnt); + if (!(mdi->brokers = rd_tmpabuf_alloc( + &tbuf, md->broker_cnt * sizeof(*mdi->brokers)))) + rd_kafka_buf_parse_fail( + rkbuf, "%d internal brokers: tmpabuf memory shortage", + md->broker_cnt); + + if (!(mdi->brokers_sorted = rd_tmpabuf_alloc( + &tbuf, md->broker_cnt * sizeof(*mdi->brokers_sorted)))) + rd_kafka_buf_parse_fail( + rkbuf, "%d sorted brokers: tmpabuf memory shortage", + md->broker_cnt); + for (i = 0; i < md->broker_cnt; i++) { rd_kafka_buf_read_i32a(rkbuf, md->brokers[i].id); rd_kafka_buf_read_str_tmpabuf(rkbuf, &tbuf, md->brokers[i].host); rd_kafka_buf_read_i32a(rkbuf, md->brokers[i].port); + mdi->brokers[i].id = md->brokers[i].id; if (ApiVersion >= 1) { - rd_kafkap_str_t rack; - rd_kafka_buf_read_str(rkbuf, &rack); + rd_kafka_buf_read_str_tmpabuf(rkbuf, &tbuf, + mdi->brokers[i].rack_id); + } else { + mdi->brokers[i].rack_id = NULL; } + + rd_kafka_buf_skip_tags(rkbuf); } - if (ApiVersion >= 2) + mdi->cluster_id = NULL; + if (ApiVersion >= 2) { rd_kafka_buf_read_str(rkbuf, &cluster_id); + if (cluster_id.str) + mdi->cluster_id = + rd_tmpabuf_write_str(&tbuf, cluster_id.str); + } + mdi->controller_id = -1; if (ApiVersion >= 1) { rd_kafka_buf_read_i32(rkbuf, &controller_id); + mdi->controller_id = controller_id; rd_rkb_dbg(rkb, METADATA, "METADATA", "ClusterId: %.*s, ControllerId: %" PRId32, RD_KAFKAP_STR_PR(&cluster_id), controller_id); } - + qsort(mdi->brokers, md->broker_cnt, sizeof(mdi->brokers[i]), + rd_kafka_metadata_broker_internal_cmp); + memcpy(mdi->brokers_sorted, md->brokers, + sizeof(*mdi->brokers_sorted) * md->broker_cnt); + qsort(mdi->brokers_sorted, md->broker_cnt, sizeof(*mdi->brokers_sorted), + rd_kafka_metadata_broker_cmp); /* Read TopicMetadata */ - rd_kafka_buf_read_i32a(rkbuf, md->topic_cnt); + rd_kafka_buf_read_arraycnt(rkbuf, &md->topic_cnt, RD_KAFKAP_TOPICS_MAX); rd_rkb_dbg(rkb, METADATA, "METADATA", "%i brokers, %i topics", md->broker_cnt, md->topic_cnt); - if (md->topic_cnt > RD_KAFKAP_TOPICS_MAX) - rd_kafka_buf_parse_fail( - rkbuf, "TopicMetadata_cnt %" PRId32 " > TOPICS_MAX %i", - md->topic_cnt, RD_KAFKAP_TOPICS_MAX); - if (!(md->topics = rd_tmpabuf_alloc(&tbuf, md->topic_cnt * sizeof(*md->topics)))) rd_kafka_buf_parse_fail( rkbuf, "%d topics: tmpabuf memory shortage", md->topic_cnt); + if (!(mdi->topics = 
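/* The internal broker array above is qsorted by broker id so later
 * rack lookups can use bsearch(). A self-contained sketch of that
 * sorted-index lookup (hypothetical data, libc only):
 *
 *   struct broker { int32_t id; const char *rack; };
 *
 *   static int broker_cmp(const void *a, const void *b) {
 *           const struct broker *x = a, *y = b;
 *           return (x->id > y->id) - (x->id < y->id);
 *   }
 *
 *   struct broker brokers[] = {{3, "west"}, {1, "east"}, {2, "west"}};
 *   struct broker key = {2, NULL}, *found;
 *   qsort(brokers, 3, sizeof(*brokers), broker_cmp);
 *   found = bsearch(&key, brokers, 3, sizeof(*brokers), broker_cmp);
 *   // found->rack == "west"
 */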
rd_tmpabuf_alloc(&tbuf, md->topic_cnt * + sizeof(*mdi->topics)))) + rd_kafka_buf_parse_fail( + rkbuf, "%d internal topics: tmpabuf memory shortage", + md->topic_cnt); + for (i = 0; i < md->topic_cnt; i++) { rd_kafka_buf_read_i16a(rkbuf, md->topics[i].err); rd_kafka_buf_read_str_tmpabuf(rkbuf, &tbuf, md->topics[i].topic); - if (ApiVersion >= 1) { - int8_t is_internal; - rd_kafka_buf_read_i8(rkbuf, &is_internal); + + if (ApiVersion >= 10) { + rd_kafka_buf_read_uuid(rkbuf, &mdi->topics[i].topic_id); + } else { + mdi->topics[i].topic_id = RD_KAFKA_UUID_ZERO; } + if (ApiVersion >= 1) + rd_kafka_buf_read_bool(rkbuf, + &mdi->topics[i].is_internal); + /* PartitionMetadata */ - rd_kafka_buf_read_i32a(rkbuf, md->topics[i].partition_cnt); - if (md->topics[i].partition_cnt > RD_KAFKAP_PARTITIONS_MAX) - rd_kafka_buf_parse_fail(rkbuf, - "TopicMetadata[%i]." - "PartitionMetadata_cnt %i " - "> PARTITIONS_MAX %i", - i, md->topics[i].partition_cnt, - RD_KAFKAP_PARTITIONS_MAX); + rd_kafka_buf_read_arraycnt(rkbuf, &md->topics[i].partition_cnt, + RD_KAFKAP_PARTITIONS_MAX); if (!(md->topics[i].partitions = rd_tmpabuf_alloc( &tbuf, md->topics[i].partition_cnt * @@ -346,6 +747,16 @@ rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, md->topics[i].topic, md->topics[i].partition_cnt); + if (!(mdi->topics[i].partitions = rd_tmpabuf_alloc( + &tbuf, md->topics[i].partition_cnt * + sizeof(*mdi->topics[i].partitions)))) + rd_kafka_buf_parse_fail(rkbuf, + "%s: %d internal partitions: " + "tmpabuf memory shortage", + md->topics[i].topic, + md->topics[i].partition_cnt); + + for (j = 0; j < md->topics[i].partition_cnt; j++) { rd_kafka_buf_read_i16a(rkbuf, md->topics[i].partitions[j].err); @@ -354,20 +765,26 @@ rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, rd_kafka_buf_read_i32a( rkbuf, md->topics[i].partitions[j].leader); - /* Replicas */ - rd_kafka_buf_read_i32a( - rkbuf, md->topics[i].partitions[j].replica_cnt); - if (md->topics[i].partitions[j].replica_cnt > - RD_KAFKAP_BROKERS_MAX) - rd_kafka_buf_parse_fail( + mdi->topics[i].partitions[j].id = + md->topics[i].partitions[j].id; + if (ApiVersion >= 7) { + rd_kafka_buf_read_i32( rkbuf, - "TopicMetadata[%i]." - "PartitionMetadata[%i]." - "Replica_cnt " - "%i > BROKERS_MAX %i", - i, j, - md->topics[i].partitions[j].replica_cnt, - RD_KAFKAP_BROKERS_MAX); + &mdi->topics[i].partitions[j].leader_epoch); + if (!has_reliable_leader_epochs) + mdi->topics[i] + .partitions[j] + .leader_epoch = -1; + } else { + mdi->topics[i].partitions[j].leader_epoch = -1; + } + mdi->topics[i].partitions[j].racks_cnt = 0; + mdi->topics[i].partitions[j].racks = NULL; + + /* Replicas */ + rd_kafka_buf_read_arraycnt( + rkbuf, &md->topics[i].partitions[j].replica_cnt, + RD_KAFKAP_BROKERS_MAX); if (!(md->topics[i].partitions[j].replicas = rd_tmpabuf_alloc( @@ -393,18 +810,9 @@ rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, md->topics[i].partitions[j].replicas[k]); /* Isrs */ - rd_kafka_buf_read_i32a( - rkbuf, md->topics[i].partitions[j].isr_cnt); - if (md->topics[i].partitions[j].isr_cnt > - RD_KAFKAP_BROKERS_MAX) - rd_kafka_buf_parse_fail( - rkbuf, - "TopicMetadata[%i]." - "PartitionMetadata[%i]." 
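/* rd_kafka_buf_read_arraycnt() subsumes the open-coded count reads and
 * bounds checks being deleted here: it reads either a fixed or compact
 * (flexible-version) array count and fails the parse if the count
 * exceeds the given maximum. Conceptually (simplified sketch, not the
 * real buffer API):
 *
 *   static int read_arraycnt(int32_t decoded, int32_t max,
 *                            int32_t *cntp) {
 *           if (decoded < 0 || decoded > max)
 *                   return -1;   // triggers rd_kafka_buf_parse_fail
 *           *cntp = decoded;
 *           return 0;
 *   }
 */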
- "Isr_cnt " - "%i > BROKERS_MAX %i", - i, j, md->topics[i].partitions[j].isr_cnt, - RD_KAFKAP_BROKERS_MAX); + rd_kafka_buf_read_arraycnt( + rkbuf, &md->topics[i].partitions[j].isr_cnt, + RD_KAFKAP_BROKERS_MAX); if (!(md->topics[i] .partitions[j] @@ -427,23 +835,62 @@ rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, k++) rd_kafka_buf_read_i32a( rkbuf, md->topics[i].partitions[j].isrs[k]); + + if (ApiVersion >= 5) { + /* OfflineReplicas int32 array (ignored) */ + int32_t offline_replicas_cnt; + + /* #OfflineReplicas */ + rd_kafka_buf_read_arraycnt( + rkbuf, &offline_replicas_cnt, + RD_KAFKAP_BROKERS_MAX); + rd_kafka_buf_skip(rkbuf, offline_replicas_cnt * + sizeof(int32_t)); + } + + rd_kafka_buf_skip_tags(rkbuf); } - /* Sort partitions by partition id */ - qsort(md->topics[i].partitions, md->topics[i].partition_cnt, - sizeof(*md->topics[i].partitions), - rd_kafka_metadata_partition_id_cmp); + mdi->topics[i].topic_authorized_operations = -1; + if (ApiVersion >= 8) { + int32_t TopicAuthorizedOperations; + /* TopicAuthorizedOperations */ + rd_kafka_buf_read_i32(rkbuf, + &TopicAuthorizedOperations); + mdi->topics[i].topic_authorized_operations = + TopicAuthorizedOperations; + } + + rd_kafka_buf_skip_tags(rkbuf); + } + + mdi->cluster_authorized_operations = -1; + if (ApiVersion >= 8 && ApiVersion <= 10) { + int32_t ClusterAuthorizedOperations; + /* ClusterAuthorizedOperations */ + rd_kafka_buf_read_i32(rkbuf, &ClusterAuthorizedOperations); + mdi->cluster_authorized_operations = + ClusterAuthorizedOperations; + } + + if (ApiVersion >= 13) { + rd_kafka_buf_read_i16(rkbuf, &ErrorCode); + } + + rd_kafka_buf_skip_tags(rkbuf); + + if (ErrorCode) { + rd_rkb_dbg(rkb, METADATA, "METADATA", + "Metadata response: received top level " + "error code %" PRId16 ": %s", + ErrorCode, rd_kafka_err2str(ErrorCode)); + err = ErrorCode; + goto err; } /* Entire Metadata response now parsed without errors: * update our internal state according to the response. */ - /* Avoid metadata updates when we're terminating. */ - if (rd_kafka_terminating(rkb->rkb_rk)) { - err = RD_KAFKA_RESP_ERR__DESTROY; - goto done; - } - if (md->broker_cnt == 0 && md->topic_cnt == 0) { rd_rkb_dbg(rkb, METADATA, "METADATA", "No brokers or topics in metadata: should retry"); @@ -461,63 +908,70 @@ rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, &md->brokers[i], NULL); } - /* Update partition count and leader for each topic we know about */ + rd_kafka_metadata_decommission_unavailable_brokers(rk, md, rkb); + for (i = 0; i < md->topic_cnt; i++) { - rd_kafka_metadata_topic_t *mdt = &md->topics[i]; - rd_rkb_dbg(rkb, METADATA, "METADATA", - " Topic #%i/%i: %s with %i partitions%s%s", i, - md->topic_cnt, mdt->topic, mdt->partition_cnt, - mdt->err ? ": " : "", - mdt->err ? rd_kafka_err2str(mdt->err) : ""); /* Ignore topics in blacklist */ if (rkb->rkb_rk->rk_conf.topic_blacklist && rd_kafka_pattern_match(rkb->rkb_rk->rk_conf.topic_blacklist, - mdt->topic)) { - rd_rkb_dbg(rkb, TOPIC, "BLACKLIST", + md->topics[i].topic)) { + rd_rkb_dbg(rkb, TOPIC | RD_KAFKA_DBG_METADATA, + "BLACKLIST", "Ignoring blacklisted topic \"%s\" " "in metadata", - mdt->topic); + md->topics[i].topic); continue; } - /* Ignore metadata completely for temporary errors. 
(issue #513) - * LEADER_NOT_AVAILABLE: Broker is rebalancing - */ - if (mdt->err == RD_KAFKA_RESP_ERR_LEADER_NOT_AVAILABLE && - mdt->partition_cnt == 0) { - rd_rkb_dbg(rkb, TOPIC, "METADATA", - "Temporary error in metadata reply for " - "topic %s (PartCnt %i): %s: ignoring", - mdt->topic, mdt->partition_cnt, - rd_kafka_err2str(mdt->err)); - } else { - /* Update local topic & partition state based - * on metadata */ - rd_kafka_topic_metadata_update2(rkb, mdt); - } + /* Sort partitions by partition id */ + qsort(md->topics[i].partitions, md->topics[i].partition_cnt, + sizeof(*md->topics[i].partitions), + rd_kafka_metadata_partition_id_cmp); + qsort(mdi->topics[i].partitions, md->topics[i].partition_cnt, + sizeof(*mdi->topics[i].partitions), + rd_kafka_metadata_partition_internal_cmp); - if (requested_topics) { + if (compute_racks) + rd_kafka_populate_metadata_topic_racks(&tbuf, i, mdi); + + /* Update topic state based on the topic metadata */ + rd_kafka_parse_Metadata_update_topic(rkb, &md->topics[i], + &mdi->topics[i]); + + if (requested_topics) rd_list_free_cb(missing_topics, rd_list_remove_cmp(missing_topics, - mdt->topic, + md->topics[i].topic, (void *)strcmp)); - if (!all_topics) { - rd_kafka_wrlock(rk); - rd_kafka_metadata_cache_topic_update( - rk, mdt, rd_false /*propagate later*/); - cache_changes++; - rd_kafka_wrunlock(rk); - } - } + if (requested_topic_ids) + rd_list_free_cb( + missing_topic_ids, + rd_list_remove_cmp(missing_topic_ids, + &mdi->topics[i].topic_id, + (void *)rd_kafka_Uuid_ptr_cmp)); + /* Only update cache when not asking + * for all topics or cache entry + * already exists. */ + rd_kafka_wrlock(rk); + cache_changes += rd_kafka_metadata_cache_topic_update( + rk, &md->topics[i], &mdi->topics[i], + rd_false /*propagate later*/, + /* use has_client_rack rather than + compute_racks. We need cached rack ids + only in case we need to rejoin the group + if they change and client.rack is set + (KIP-881). */ + has_client_rack, rd_kafka_has_reliable_leader_epochs(rkb)); + rd_kafka_wrunlock(rk); } - /* Requested topics not seen in metadata? Propogate to topic code. */ if (missing_topics) { char *topic; rd_rkb_dbg(rkb, TOPIC, "METADATA", - "%d/%d requested topic(s) seen in metadata", + "%d/%d requested topic(s) seen in metadata" + " (lookup by name)", rd_list_cnt(requested_topics) - rd_list_cnt(missing_topics), rd_list_cnt(requested_topics)); @@ -544,11 +998,48 @@ rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, } } } + if (missing_topic_ids) { + rd_kafka_Uuid_t *topic_id; + rd_rkb_dbg(rkb, TOPIC, "METADATA", + "%d/%d requested topic(s) seen in metadata" + " (lookup by id)", + rd_list_cnt(requested_topic_ids) - + rd_list_cnt(missing_topic_ids), + rd_list_cnt(requested_topic_ids)); + for (i = 0; i < rd_list_cnt(missing_topic_ids); i++) { + rd_kafka_Uuid_t *missing_topic_id = + missing_topic_ids->rl_elems[i]; + rd_rkb_dbg(rkb, TOPIC, "METADATA", "wanted %s", + rd_kafka_Uuid_base64str(missing_topic_id)); + } + RD_LIST_FOREACH(topic_id, missing_topic_ids, i) { + rd_kafka_topic_t *rkt; + + rd_kafka_rdlock(rk); + rkt = rd_kafka_topic_find_by_topic_id(rkb->rkb_rk, + *topic_id); + rd_kafka_rdunlock(rk); + if (rkt) { + /* Received metadata response contained no + * information about topic 'rkt' and thus + * indicates the topic is not available in the + * cluster. 
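 *
 * The bookkeeping behind this works by subtraction: requested topic
 * names/ids are copied into "missing" lists up front, each topic seen
 * in the response is removed, and whatever remains is propagated as
 * unknown. In miniature (hypothetical "seen_topic"):
 *
 *   rd_list_t *missing =
 *       rd_list_copy(requested, rd_list_string_copy, NULL);
 *   // ...for each topic in the response:
 *   rd_list_free_cb(missing, rd_list_remove_cmp(missing, seen_topic,
 *                                               (void *)strcmp));
 *   // rd_list_cnt(missing) > 0  =>  topics missing from the response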
+ * Mark the topic as non-existent */ + rd_kafka_topic_wrlock(rkt); + rd_kafka_topic_set_notexists( + rkt, RD_KAFKA_RESP_ERR__UNKNOWN_TOPIC); + rd_kafka_topic_wrunlock(rkt); + + rd_kafka_topic_destroy0(rkt); + } + } + } rd_kafka_wrlock(rkb->rkb_rk); rkb->rkb_rk->rk_ts_metadata = rd_clock(); + rd_kafka_rebootstrap_tmr_restart(rkb->rkb_rk); /* Update cached cluster id. */ if (RD_KAFKAP_STR_LEN(&cluster_id) > 0 && @@ -586,28 +1077,23 @@ rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, } if (all_topics) { - rd_kafka_metadata_cache_update(rkb->rkb_rk, md, - 1 /*abs update*/); - - if (rkb->rkb_rk->rk_full_metadata) - rd_kafka_metadata_destroy( - rkb->rkb_rk->rk_full_metadata); - rkb->rkb_rk->rk_full_metadata = - rd_kafka_metadata_copy(md, tbuf.of); rkb->rkb_rk->rk_ts_full_metadata = rkb->rkb_rk->rk_ts_metadata; rd_rkb_dbg(rkb, METADATA, "METADATA", - "Caching full metadata with " - "%d broker(s) and %d topic(s): %s", - md->broker_cnt, md->topic_cnt, reason); - } else { - if (cache_changes) - rd_kafka_metadata_cache_propagate_changes(rk); - rd_kafka_metadata_cache_expiry_start(rk); + "Cached full metadata with " + " %d topic(s): %s", + md->topic_cnt, reason); } - /* Remove cache hints for the originally requested topics. */ if (requested_topics) rd_kafka_metadata_cache_purge_hints(rk, requested_topics); + if (requested_topic_ids) + rd_kafka_metadata_cache_purge_hints_by_id(rk, + requested_topic_ids); + + if (cache_changes) { + rd_kafka_metadata_cache_propagate_changes(rk); + rd_kafka_metadata_cache_expiry_start(rk); + } rd_kafka_wrunlock(rkb->rkb_rk); @@ -623,10 +1109,18 @@ rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, * which may contain only a sub-set of the subscribed topics (namely * the effective subscription of available topics) as to not * propagate non-included topics as non-existent. */ - if (cgrp_update && (requested_topics || all_topics)) + if (cgrp_update && + (all_topics || + ((requested_topics || requested_topic_ids) && + rd_kafka_cgrp_same_subscription_version( + rkb->rkb_rk->rk_cgrp, cgrp_subscription_version)))) rd_kafka_cgrp_metadata_update_check(rkb->rkb_rk->rk_cgrp, rd_true /*do join*/); + if (rk->rk_type == RD_KAFKA_CONSUMER && rk->rk_cgrp && + rk->rk_cgrp->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CLASSIC) + rd_interval_reset(&rk->rk_cgrp->rkcg_join_intvl); + /* Try to acquire a Producer ID from this broker if we * don't have one. */ if (rd_kafka_is_idempotent(rkb->rkb_rk)) { @@ -638,6 +1132,8 @@ rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, done: if (missing_topics) rd_list_destroy(missing_topics); + if (missing_topic_ids) + rd_list_destroy(missing_topic_ids); /* This metadata request was triggered by someone wanting * the metadata information back as a reply, so send that reply now. @@ -645,7 +1141,7 @@ done: * the requestee will do. * The tbuf is explicitly not destroyed as we return its memory * to the caller. */ - *mdp = md; + *mdip = mdi; return RD_KAFKA_RESP_ERR_NO_ERROR; @@ -659,16 +1155,75 @@ err: rd_kafka_metadata_cache_purge_hints(rk, requested_topics); rd_kafka_wrunlock(rkb->rkb_rk); } + if (requested_topic_ids) { + /* Failed requests shall purge cache hints for + * the requested topics. 
*/ + rd_kafka_wrlock(rkb->rkb_rk); + rd_kafka_metadata_cache_purge_hints_by_id(rk, + requested_topic_ids); + rd_kafka_wrunlock(rkb->rkb_rk); + } if (missing_topics) rd_list_destroy(missing_topics); - + if (missing_topic_ids) + rd_list_destroy(missing_topic_ids); rd_tmpabuf_destroy(&tbuf); return err; } +/** + * @brief Handle a Metadata response message. + * + * @param request Initial Metadata request, containing the topic information. + * Must not be NULL. + * We require the topic information while parsing to make sure + * that there are no missing topics. + * @param mdip A pointer to (rd_kafka_metadata_internal_t *) into which the + * metadata will be marshalled (set to NULL on error.) + * + * @returns an error code on parse failure, else NO_ERROR. + * + * @locality rdkafka main thread + */ +rd_kafka_resp_err_t +rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, + rd_kafka_buf_t *request, + rd_kafka_buf_t *rkbuf, + rd_kafka_metadata_internal_t **mdip) { + const char *reason = request->rkbuf_u.Metadata.reason; + return rd_kafka_parse_Metadata0(rkb, request, rkbuf, mdip, NULL, + reason); +} + +/** + * @brief Handle a Metadata response message for admin requests. + * + * @param request_topics List containing topics in Metadata request. Must not + * be NULL. It is more convenient in the Admin flow to + * preserve the topic names rather than the initial + * Metadata request. + * We require the topic information while parsing to make + * sure that there are no missing topics. + * @param mdip A pointer to (rd_kafka_metadata_internal_t *) into which the + * metadata will be marshalled (set to NULL on error.) + * + * @returns an error code on parse failure, else NO_ERROR. + * + * @locality rdkafka main thread + */ +rd_kafka_resp_err_t +rd_kafka_parse_Metadata_admin(rd_kafka_broker_t *rkb, + rd_kafka_buf_t *rkbuf, + rd_list_t *request_topics, + rd_kafka_metadata_internal_t **mdip) { + return rd_kafka_parse_Metadata0(rkb, NULL, rkbuf, mdip, request_topics, + "(admin request)"); +} + + /** * @brief Add all topics in current cached full metadata * that matches the topics in \p match @@ -690,16 +1245,11 @@ rd_kafka_metadata_topic_match(rd_kafka_t *rk, rd_kafka_topic_partition_list_t *errored) { int ti, i; size_t cnt = 0; - const struct rd_kafka_metadata *metadata; rd_kafka_topic_partition_list_t *unmatched; + rd_list_t cached_topics; + const char *topic; rd_kafka_rdlock(rk); - metadata = rk->rk_full_metadata; - if (!metadata) { - rd_kafka_rdunlock(rk); - return 0; - } - /* To keep track of which patterns and topics in `match` that * did not match any topic (or matched an errored topic), we * create a set of all topics to match in `unmatched` and then @@ -710,8 +1260,15 @@ rd_kafka_metadata_topic_match(rd_kafka_t *rk, /* For each topic in the cluster, scan through the match list * to find matching topic. 
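 *
 * A match-list entry starting with "^" is treated as a regular
 * expression, anything else as a literal topic name (the same
 * convention wildcard subscriptions use). A rough standalone
 * illustration with POSIX regex, not the actual rd_kafka_pattern_match
 * implementation:
 *
 *   #include <regex.h>
 *   #include <string.h>
 *
 *   static int topic_matches(const char *pattern, const char *topic) {
 *           regex_t re;
 *           int hit;
 *           if (*pattern != '^')
 *                   return !strcmp(pattern, topic);
 *           if (regcomp(&re, pattern, REG_EXTENDED | REG_NOSUB))
 *                   return 0;
 *           hit = regexec(&re, topic, 0, NULL, 0) == 0;
 *           regfree(&re);
 *           return hit;
 *   }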
*/ - for (ti = 0; ti < metadata->topic_cnt; ti++) { - const char *topic = metadata->topics[ti].topic; + rd_list_init(&cached_topics, rk->rk_metadata_cache.rkmc_cnt, rd_free); + rd_kafka_metadata_cache_topics_to_list(rk, &cached_topics, rd_false); + RD_LIST_FOREACH(topic, &cached_topics, ti) { + const rd_kafka_metadata_topic_internal_t *mdti; + const rd_kafka_metadata_topic_t *mdt = + rd_kafka_metadata_cache_topic_get(rk, topic, &mdti, + rd_true /* valid */); + if (!mdt) + continue; /* Ignore topics in blacklist */ if (rk->rk_conf.topic_blacklist && @@ -729,17 +1286,16 @@ rd_kafka_metadata_topic_match(rd_kafka_t *rk, unmatched, match->elems[i].topic, RD_KAFKA_PARTITION_UA); - if (metadata->topics[ti].err) { + if (mdt->err) { rd_kafka_topic_partition_list_add( errored, topic, RD_KAFKA_PARTITION_UA) - ->err = metadata->topics[ti].err; + ->err = mdt->err; continue; /* Skip errored topics */ } - rd_list_add( - tinfos, - rd_kafka_topic_info_new( - topic, metadata->topics[ti].partition_cnt)); + rd_list_add(tinfos, rd_kafka_topic_info_new_with_rack( + topic, mdt->partition_cnt, + mdti->partitions)); cnt++; } @@ -757,6 +1313,7 @@ rd_kafka_metadata_topic_match(rd_kafka_t *rk, } rd_kafka_topic_partition_list_destroy(unmatched); + rd_list_destroy(&cached_topics); return cnt; } @@ -784,16 +1341,18 @@ rd_kafka_metadata_topic_filter(rd_kafka_t *rk, rd_kafka_rdlock(rk); /* For each topic in match, look up the topic in the cache. */ for (i = 0; i < match->cnt; i++) { - const char *topic = match->elems[i].topic; - const rd_kafka_metadata_topic_t *mtopic; + const char *topic = match->elems[i].topic; + const rd_kafka_metadata_topic_t *mtopic = NULL; /* Ignore topics in blacklist */ if (rk->rk_conf.topic_blacklist && rd_kafka_pattern_match(rk->rk_conf.topic_blacklist, topic)) continue; - mtopic = - rd_kafka_metadata_cache_topic_get(rk, topic, 1 /*valid*/); + struct rd_kafka_metadata_cache_entry *rkmce = + rd_kafka_metadata_cache_find(rk, topic, 1 /* valid */); + if (rkmce) + mtopic = &rkmce->rkmce_mtopic; if (!mtopic) rd_kafka_topic_partition_list_add(errored, topic, @@ -804,8 +1363,11 @@ rd_kafka_metadata_topic_filter(rd_kafka_t *rk, RD_KAFKA_PARTITION_UA) ->err = mtopic->err; else { - rd_list_add(tinfos, rd_kafka_topic_info_new( - topic, mtopic->partition_cnt)); + rd_list_add(tinfos, + rd_kafka_topic_info_new_with_rack( + topic, mtopic->partition_cnt, + rkmce->rkmce_metadata_internal_topic + .partitions)); cnt++; } @@ -869,6 +1431,7 @@ rd_kafka_metadata_refresh_topics(rd_kafka_t *rk, rd_bool_t force, rd_bool_t allow_auto_create, rd_bool_t cgrp_update, + int32_t cgrp_subscription_version, const char *reason) { rd_list_t q_topics; int destroy_rkb = 0; @@ -887,8 +1450,7 @@ rd_kafka_metadata_refresh_topics(rd_kafka_t *rk, * these topics so that they will be included in * a future all known_topics query. */ rd_kafka_metadata_cache_hint(rk, topics, NULL, - RD_KAFKA_RESP_ERR__NOENT, - 0 /*dont replace*/); + RD_KAFKA_RESP_ERR__NOENT); rd_kafka_wrunlock(rk); rd_kafka_dbg(rk, METADATA, "METADATA", @@ -909,8 +1471,7 @@ rd_kafka_metadata_refresh_topics(rd_kafka_t *rk, * out any topics that are already being requested. * q_topics will contain remaining topics to query. 
*/ rd_kafka_metadata_cache_hint(rk, topics, &q_topics, - RD_KAFKA_RESP_ERR__WAIT_CACHE, - rd_false /*dont replace*/); + RD_KAFKA_RESP_ERR__WAIT_CACHE); rd_kafka_wrunlock(rk); if (rd_list_cnt(&q_topics) == 0) { @@ -935,8 +1496,9 @@ rd_kafka_metadata_refresh_topics(rd_kafka_t *rk, "Requesting metadata for %d/%d topics: %s", rd_list_cnt(&q_topics), rd_list_cnt(topics), reason); - rd_kafka_MetadataRequest(rkb, &q_topics, reason, allow_auto_create, - cgrp_update, NULL); + rd_kafka_MetadataRequest( + rkb, &q_topics, NULL, reason, allow_auto_create, cgrp_update, + cgrp_subscription_version, rd_false /* force_racks */, NULL); rd_list_destroy(&q_topics); @@ -985,7 +1547,7 @@ rd_kafka_metadata_refresh_known_topics(rd_kafka_t *rk, else err = rd_kafka_metadata_refresh_topics( rk, rkb, &topics, force, allow_auto_create_topics, - rd_false /*!cgrp_update*/, reason); + rd_false /*!cgrp_update*/, -1, reason); rd_list_destroy(&topics); @@ -1025,7 +1587,8 @@ rd_kafka_metadata_refresh_consumer_topics(rd_kafka_t *rk, rkcg = rk->rk_cgrp; rd_assert(rkcg != NULL); - if (rkcg->rkcg_flags & RD_KAFKA_CGRP_F_WILDCARD_SUBSCRIPTION) { + if (rkcg->rkcg_group_protocol == RD_KAFKA_GROUP_PROTOCOL_CLASSIC && + rkcg->rkcg_flags & RD_KAFKA_CGRP_F_WILDCARD_SUBSCRIPTION) { /* If there is a wildcard subscription we need to request * all topics in the cluster so that we can perform * regexp matching. */ @@ -1051,7 +1614,8 @@ rd_kafka_metadata_refresh_consumer_topics(rd_kafka_t *rk, else err = rd_kafka_metadata_refresh_topics( rk, rkb, &topics, rd_true /*force*/, - allow_auto_create_topics, rd_true /*cgrp_update*/, reason); + allow_auto_create_topics, rd_true /*cgrp_update*/, + rd_atomic32_get(&rkcg->rkcg_subscription_version), reason); rd_list_destroy(&topics); @@ -1078,8 +1642,9 @@ rd_kafka_resp_err_t rd_kafka_metadata_refresh_brokers(rd_kafka_t *rk, const char *reason) { return rd_kafka_metadata_request(rk, rkb, NULL /*brokers only*/, rd_false /*!allow auto create topics*/, - rd_false /*no cgrp update */, reason, - NULL); + rd_false /*no cgrp update */, + -1 /* same subscription version */, + reason, NULL); } @@ -1111,9 +1676,10 @@ rd_kafka_resp_err_t rd_kafka_metadata_refresh_all(rd_kafka_t *rk, } rd_list_init(&topics, 0, NULL); /* empty list = all topics */ - rd_kafka_MetadataRequest(rkb, &topics, reason, - rd_false /*no auto create*/, - rd_true /*cgrp update*/, NULL); + rd_kafka_MetadataRequest( + rkb, &topics, NULL, reason, rd_false /*no auto create*/, + rd_true /*cgrp update*/, -1 /* same subscription version */, + rd_false /* force_rack */, NULL); rd_list_destroy(&topics); if (destroy_rkb) @@ -1139,6 +1705,7 @@ rd_kafka_metadata_request(rd_kafka_t *rk, const rd_list_t *topics, rd_bool_t allow_auto_create_topics, rd_bool_t cgrp_update, + int32_t cgrp_subscription_version, const char *reason, rd_kafka_op_t *rko) { int destroy_rkb = 0; @@ -1150,8 +1717,9 @@ rd_kafka_metadata_request(rd_kafka_t *rk, destroy_rkb = 1; } - rd_kafka_MetadataRequest(rkb, topics, reason, allow_auto_create_topics, - cgrp_update, rko); + rd_kafka_MetadataRequest( + rkb, topics, NULL, reason, allow_auto_create_topics, cgrp_update, + cgrp_subscription_version, rd_false /* force racks */, rko); if (destroy_rkb) rd_kafka_broker_destroy(rkb); @@ -1215,16 +1783,14 @@ static void rd_kafka_metadata_leader_query_tmr_cb(rd_kafka_timers_t *rkts, rd_kafka_metadata_refresh_topics( rk, NULL, &topics, rd_true /*force*/, rk->rk_conf.allow_auto_create_topics, - rd_false /*!cgrp_update*/, "partition leader query"); - /* Back off next query exponentially until we reach 
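The replacement just below swaps this stop-at-refresh-interval behavior for rd_kafka_timer_exp_backoff() capped at retry.backoff.max.ms with jitter. A minimal self-contained sketch of such a capped, jittered backoff computation (the constants and rand() are illustrative, not librdkafka's implementation):

    #include <stdint.h>
    #include <stdlib.h>

    /* Double the interval up to a cap, then apply +/- jitter_pct%. */
    static int64_t next_backoff_us(int64_t cur_us, int64_t max_us,
                                   int jitter_pct) {
            int64_t next = cur_us * 2;
            if (next > max_us)
                    next = max_us;
            return next *
                   (100 - jitter_pct + rand() % (2 * jitter_pct + 1)) /
                   100;
    }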
- * the standard query interval - then stop the timer - * since the intervalled querier will do the job for us. */ - if (rk->rk_conf.metadata_refresh_interval_ms > 0 && - rtmr->rtmr_interval * 2 / 1000 >= - rk->rk_conf.metadata_refresh_interval_ms) - rd_kafka_timer_stop(rkts, rtmr, 1 /*lock*/); - else - rd_kafka_timer_exp_backoff(rkts, rtmr); + rd_false /*!cgrp_update*/, -1, "partition leader query"); + + /* Back off next query exponentially till we reach + * the retry backoff max ms */ + rd_kafka_timer_exp_backoff( + rkts, rtmr, rk->rk_conf.retry_backoff_ms * 1000, + rk->rk_conf.retry_backoff_max_ms * 1000, + RD_KAFKA_RETRY_JITTER_PERCENT); } rd_list_destroy(&topics); @@ -1238,23 +1804,26 @@ static void rd_kafka_metadata_leader_query_tmr_cb(rd_kafka_timers_t *rkts, * exponentially increased intervals until no topics are missing * leaders. * + * @param force If true, run the query immediately without waiting for the + * interval. + * * @locks none * @locality any */ -void rd_kafka_metadata_fast_leader_query(rd_kafka_t *rk) { +void rd_kafka_metadata_fast_leader_query(rd_kafka_t *rk, rd_bool_t force) { rd_ts_t next; - /* Restart the timer if it will speed things up. */ + /* Restart the timer if it will speed things up, or if forced. */ next = rd_kafka_timer_next( &rk->rk_timers, &rk->rk_metadata_cache.rkmc_query_tmr, 1 /*lock*/); - if (next == -1 /* not started */ || + if (force || next == -1 /* not started */ || next > (rd_ts_t)rk->rk_conf.metadata_refresh_fast_interval_ms * 1000) { rd_kafka_dbg(rk, METADATA | RD_KAFKA_DBG_TOPIC, "FASTQUERY", "Starting fast leader query"); rd_kafka_timer_start( &rk->rk_timers, &rk->rk_metadata_cache.rkmc_query_tmr, - rk->rk_conf.metadata_refresh_fast_interval_ms * 1000, + 0 /* First request should be tried immediately */, rd_kafka_metadata_leader_query_tmr_cb, NULL); } } @@ -1266,44 +1835,71 @@ void rd_kafka_metadata_fast_leader_query(rd_kafka_t *rk) { * * @param topics elements are checked for .topic and .partition_cnt * @param topic_cnt is the number of topic elements in \p topics. + * @param replication_factor is the number of replicas of each partition (set to + * -1 to ignore). + * @param num_brokers is the number of brokers in the cluster. * * @returns a newly allocated metadata object that must be freed with * rd_kafka_metadata_destroy(). * + * @note \p replication_factor and \p num_brokers must be used together for + * setting replicas of each partition. + * * @sa rd_kafka_metadata_copy() */ rd_kafka_metadata_t * rd_kafka_metadata_new_topic_mock(const rd_kafka_metadata_topic_t *topics, - size_t topic_cnt) { + size_t topic_cnt, + int replication_factor, + int num_brokers) { + rd_kafka_metadata_internal_t *mdi; rd_kafka_metadata_t *md; rd_tmpabuf_t tbuf; - size_t topic_names_size = 0; - int total_partition_cnt = 0; size_t i; + int curr_broker = 0; + + /* If the replication factor is given, num_brokers must also be given */ + rd_assert(replication_factor <= 0 || num_brokers > 0); + + /* Allocate contiguous buffer which will back all the memory + * needed by the final metadata_t object */ + rd_tmpabuf_new(&tbuf, sizeof(*mdi), rd_true /*assert on fail*/); + + rd_tmpabuf_add_alloc(&tbuf, topic_cnt * sizeof(*md->topics)); + rd_tmpabuf_add_alloc(&tbuf, topic_cnt * sizeof(*mdi->topics)); + rd_tmpabuf_add_alloc(&tbuf, num_brokers * sizeof(*md->brokers)); /* Calculate total partition count and topic names size before * allocating memory. 
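 *
 * These mock helpers are test-only conveniences. A plausible usage
 * sketch of the variadic form, with invented topic names and counts
 * (replication factor 3 across 4 mock brokers, then 3 topic tuples):
 *
 *   rd_kafka_metadata_t *md =
 *       rd_kafka_metadata_new_topic_with_partition_replicas_mockv(
 *           3, 4, 3, "topic_a", 4, "topic_b", 2, "topic_c", 1);
 *   rd_kafka_metadata_destroy(md);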
*/ for (i = 0; i < topic_cnt; i++) { - topic_names_size += 1 + strlen(topics[i].topic); - total_partition_cnt += topics[i].partition_cnt; + rd_tmpabuf_add_alloc(&tbuf, 1 + strlen(topics[i].topic)); + rd_tmpabuf_add_alloc(&tbuf, + topics[i].partition_cnt * + sizeof(*md->topics[i].partitions)); + rd_tmpabuf_add_alloc(&tbuf, + topics[i].partition_cnt * + sizeof(*mdi->topics[i].partitions)); + if (replication_factor > 0) + rd_tmpabuf_add_alloc_times( + &tbuf, replication_factor * sizeof(int), + topics[i].partition_cnt); } + rd_tmpabuf_finalize(&tbuf); - /* Allocate contiguous buffer which will back all the memory - * needed by the final metadata_t object */ - rd_tmpabuf_new( - &tbuf, - sizeof(*md) + (sizeof(*md->topics) * topic_cnt) + topic_names_size + - (64 /*topic name size..*/ * topic_cnt) + - (sizeof(*md->topics[0].partitions) * total_partition_cnt), - 1 /*assert on fail*/); - - md = rd_tmpabuf_alloc(&tbuf, sizeof(*md)); - memset(md, 0, sizeof(*md)); + mdi = rd_tmpabuf_alloc(&tbuf, sizeof(*mdi)); + memset(mdi, 0, sizeof(*mdi)); + md = &mdi->metadata; md->topic_cnt = (int)topic_cnt; md->topics = rd_tmpabuf_alloc(&tbuf, md->topic_cnt * sizeof(*md->topics)); + mdi->topics = + rd_tmpabuf_alloc(&tbuf, md->topic_cnt * sizeof(*mdi->topics)); + + md->broker_cnt = num_brokers; + mdi->brokers = + rd_tmpabuf_alloc(&tbuf, md->broker_cnt * sizeof(*mdi->brokers)); for (i = 0; i < (size_t)md->topic_cnt; i++) { int j; @@ -1316,12 +1912,42 @@ rd_kafka_metadata_new_topic_mock(const rd_kafka_metadata_topic_t *topics, md->topics[i].partitions = rd_tmpabuf_alloc( &tbuf, md->topics[i].partition_cnt * sizeof(*md->topics[i].partitions)); + mdi->topics[i].partitions = rd_tmpabuf_alloc( + &tbuf, md->topics[i].partition_cnt * + sizeof(*mdi->topics[i].partitions)); for (j = 0; j < md->topics[i].partition_cnt; j++) { + int k; memset(&md->topics[i].partitions[j], 0, sizeof(md->topics[i].partitions[j])); - md->topics[i].partitions[j].id = j; + memset(&mdi->topics[i].partitions[j], 0, + sizeof(mdi->topics[i].partitions[j])); + md->topics[i].partitions[j].id = j; + mdi->topics[i].partitions[j].id = j; + mdi->topics[i].partitions[j].leader_epoch = -1; + mdi->topics[i].partitions[j].racks_cnt = 0; + mdi->topics[i].partitions[j].racks = NULL; + md->topics[i].partitions[j].id = j; + + /* In case replication_factor is not given, don't set + * replicas. 
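 *
 * When replicas are requested, placement is a simple rotation:
 * partition j gets replicas (j + k + curr_broker) % num_brokers for
 * k = 0..replication_factor-1, and curr_broker advances by each
 * topic's partition count. Worked example (illustrative): 4 brokers,
 * replication factor 2, curr_broker = 0 gives partition 0 -> {0, 1},
 * partition 1 -> {1, 2}, partition 2 -> {2, 3}, partition 3 -> {3, 0}.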
*/ + if (replication_factor <= 0) + continue; + + md->topics[i].partitions[j].replicas = rd_tmpabuf_alloc( + &tbuf, replication_factor * sizeof(int)); + md->topics[i].partitions[j].leader = curr_broker; + md->topics[i].partitions[j].replica_cnt = + replication_factor; + for (k = 0; k < replication_factor; k++) { + md->topics[i].partitions[j].replicas[k] = + (j + k + curr_broker) % num_brokers; + } } + if (num_brokers > 0) + curr_broker = + (curr_broker + md->topics[i].partition_cnt) % + num_brokers; } /* Check for tmpabuf errors */ @@ -1333,6 +1959,24 @@ rd_kafka_metadata_new_topic_mock(const rd_kafka_metadata_topic_t *topics, return md; } +/* Implementation for rd_kafka_metadata_new_topic*mockv() */ +static rd_kafka_metadata_t * +rd_kafka_metadata_new_topic_mockv_internal(size_t topic_cnt, + int replication_factor, + int num_brokers, + va_list args) { + rd_kafka_metadata_topic_t *topics; + size_t i; + + topics = rd_alloca(sizeof(*topics) * topic_cnt); + for (i = 0; i < topic_cnt; i++) { + topics[i].topic = va_arg(args, char *); + topics[i].partition_cnt = va_arg(args, int); + } + + return rd_kafka_metadata_new_topic_mock( + topics, topic_cnt, replication_factor, num_brokers); +} /** * @brief Create mock Metadata (for testing) based on the @@ -1346,18 +1990,227 @@ rd_kafka_metadata_new_topic_mock(const rd_kafka_metadata_topic_t *topics, * @sa rd_kafka_metadata_new_topic_mock() */ rd_kafka_metadata_t *rd_kafka_metadata_new_topic_mockv(size_t topic_cnt, ...) { - rd_kafka_metadata_topic_t *topics; + rd_kafka_metadata_t *metadata; va_list ap; + + va_start(ap, topic_cnt); + metadata = + rd_kafka_metadata_new_topic_mockv_internal(topic_cnt, -1, 0, ap); + va_end(ap); + + return metadata; +} + +/** + * @brief Create mock Metadata (for testing) based on the + * var-arg tuples of (const char *topic, int partition_cnt). + * + * @param replication_factor is the number of replicas of each partition. + * @param num_brokers is the number of brokers in the cluster. + * @param topic_cnt is the number of topic,partition_cnt tuples. + * + * @returns a newly allocated metadata object that must be freed with + * rd_kafka_metadata_destroy(). + * + * @sa rd_kafka_metadata_new_topic_mock() + */ +rd_kafka_metadata_t *rd_kafka_metadata_new_topic_with_partition_replicas_mockv( + int replication_factor, + int num_brokers, + size_t topic_cnt, + ...) { + rd_kafka_metadata_t *metadata; + va_list ap; + + va_start(ap, topic_cnt); + metadata = rd_kafka_metadata_new_topic_mockv_internal( + topic_cnt, replication_factor, num_brokers, ap); + va_end(ap); + + return metadata; +} + +/** + * @brief Create mock Metadata (for testing) based on arrays topic_names and + * partition_cnts. + * + * @param replication_factor is the number of replicas of each partition. + * @param num_brokers is the number of brokers in the cluster. + * @param topic_names names of topics. + * @param partition_cnts number of partitions in each topic. + * @param topic_cnt number of topics. 
+ * + * @return rd_kafka_metadata_t* + * + * @sa rd_kafka_metadata_new_topic_mock() + */ +rd_kafka_metadata_t * +rd_kafka_metadata_new_topic_with_partition_replicas_mock(int replication_factor, + int num_brokers, + char *topic_names[], + int *partition_cnts, + size_t topic_cnt) { + rd_kafka_metadata_topic_t *topics; size_t i; topics = rd_alloca(sizeof(*topics) * topic_cnt); - - va_start(ap, topic_cnt); for (i = 0; i < topic_cnt; i++) { - topics[i].topic = va_arg(ap, char *); - topics[i].partition_cnt = va_arg(ap, int); + topics[i].topic = topic_names[i]; + topics[i].partition_cnt = partition_cnts[i]; } - va_end(ap); - return rd_kafka_metadata_new_topic_mock(topics, topic_cnt); + return rd_kafka_metadata_new_topic_mock( + topics, topic_cnt, replication_factor, num_brokers); +} + +/** + * @brief Handle update of metadata received in the produce or fetch tags. + * + * @param rk Client instance. + * @param rko Metadata update operation. + * + * @locality main thread + * @locks none + * + * @return always RD_KAFKA_OP_RES_HANDLED + */ +rd_kafka_op_res_t +rd_kafka_metadata_update_op(rd_kafka_t *rk, rd_kafka_metadata_internal_t *mdi) { + int i, j; + rd_kafka_metadata_t *md = &mdi->metadata; + rd_bool_t cache_updated = rd_false; + rd_kafka_secproto_t rkb_proto = rk->rk_conf.security_protocol; + + + for (i = 0; i < md->broker_cnt; i++) { + rd_kafka_broker_update(rk, rkb_proto, &md->brokers[i], NULL); + } + + for (i = 0; i < md->topic_cnt; i++) { + struct rd_kafka_metadata_cache_entry *rkmce; + int32_t partition_cache_changes = 0; + rd_bool_t by_id = + !RD_KAFKA_UUID_IS_ZERO(mdi->topics[i].topic_id); + rd_kafka_Uuid_t topic_id = RD_KAFKA_UUID_ZERO; + char *topic = NULL; + + if (by_id) { + rkmce = rd_kafka_metadata_cache_find_by_id( + rk, mdi->topics[i].topic_id, 1); + topic_id = mdi->topics[i].topic_id; + } else { + rkmce = rd_kafka_metadata_cache_find( + rk, md->topics[i].topic, 1); + topic = md->topics[i].topic; + } + + if (!rkmce) { + if (by_id) { + rd_kafka_log( + rk, LOG_WARNING, "METADATAUPDATE", + "Topic id %s not found in cache", + rd_kafka_Uuid_base64str(&topic_id)); + } else { + rd_kafka_log(rk, LOG_WARNING, "METADATAUPDATE", + "Topic %s not found in cache", + topic); + } + continue; + } + topic = rkmce->rkmce_mtopic.topic; + topic_id = rkmce->rkmce_metadata_internal_topic.topic_id; + + for (j = 0; j < md->topics[i].partition_cnt; j++) { + rd_kafka_broker_t *rkb; + rd_kafka_metadata_partition_t *mdp = + &md->topics[i].partitions[j]; + ; + rd_kafka_metadata_partition_internal_t *mdpi = + &mdi->topics[i].partitions[j]; + int32_t part = mdp->id, current_leader_epoch; + + if (part >= rkmce->rkmce_mtopic.partition_cnt) { + rd_kafka_log(rk, LOG_WARNING, "METADATAUPDATE", + "Partition %s(%s)[%" PRId32 + "]: not found " + "in cache", + topic, + rd_kafka_Uuid_base64str(&topic_id), + part); + + continue; + } + + rkb = rd_kafka_broker_find_by_nodeid(rk, mdp->leader); + if (!rkb) { + rd_kafka_log(rk, LOG_WARNING, "METADATAUPDATE", + "Partition %s(%s)[%" PRId32 + "]: new leader" + "%" PRId32 " not found in cache", + topic, + rd_kafka_Uuid_base64str(&topic_id), + part, mdp->leader); + continue; + } + + current_leader_epoch = + rkmce->rkmce_metadata_internal_topic + .partitions[part] + .leader_epoch; + + if (mdpi->leader_epoch != -1 && + current_leader_epoch > mdpi->leader_epoch) { + rd_kafka_broker_destroy(rkb); + rd_kafka_dbg( + rk, METADATA, "METADATAUPDATE", + "Partition %s(%s)[%" PRId32 + "]: leader epoch " + "is " + "not newer %" PRId32 " >= %" PRId32, + topic, rd_kafka_Uuid_base64str(&topic_id), + 
part, current_leader_epoch, + mdpi->leader_epoch); + continue; + } + partition_cache_changes++; + + /* Need to acquire the write lock to avoid dirty reads + * from other threads acquiring read locks. */ + rd_kafka_wrlock(rk); + rkmce->rkmce_metadata_internal_topic.partitions[part] + .leader_epoch = mdpi->leader_epoch; + rkmce->rkmce_mtopic.partitions[part].leader = + mdp->leader; + rd_kafka_wrunlock(rk); + rd_kafka_broker_destroy(rkb); + + rd_kafka_dbg(rk, METADATA, "METADATAUPDATE", + "Partition %s(%s)[%" PRId32 + "]:" + " updated with leader %" PRId32 + " and epoch %" PRId32, + topic, rd_kafka_Uuid_base64str(&topic_id), + part, mdp->leader, mdpi->leader_epoch); + } + + if (partition_cache_changes > 0) { + cache_updated = rd_true; + rd_kafka_topic_metadata_update2( + rk->rk_internal_rkb, &rkmce->rkmce_mtopic, + &rkmce->rkmce_metadata_internal_topic); + } + } + + if (!cache_updated) { + rd_kafka_dbg(rk, METADATA, "METADATAUPDATE", + "Cache was not updated"); + return RD_KAFKA_OP_RES_HANDLED; + } + + rd_kafka_dbg(rk, METADATA, "METADATAUPDATE", + "Metadata cache updated, propagating changes"); + rd_kafka_metadata_cache_propagate_changes(rk); + rd_kafka_metadata_cache_expiry_start(rk); + + return RD_KAFKA_OP_RES_HANDLED; } diff --git a/src/third_party/librdkafka/dist/src/rdkafka_metadata.h b/src/third_party/librdkafka/dist/src/rdkafka_metadata.h index b77bc19ed7c..7916dcea777 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_metadata.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_metadata.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,13 +32,96 @@ #include "rdavl.h" -rd_kafka_resp_err_t rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, - rd_kafka_buf_t *request, - rd_kafka_buf_t *rkbuf, - struct rd_kafka_metadata **mdp); +/** + * @brief Metadata partition internal container + */ +typedef struct rd_kafka_metadata_partition_internal_s { + /** Partition Id */ + int32_t id; + /** Partition leader epoch */ + int32_t leader_epoch; + /* Racks for this partition. Sorted and de-duplicated. */ + char **racks; + /* Count of the racks */ + size_t racks_cnt; +} rd_kafka_metadata_partition_internal_t; -struct rd_kafka_metadata * -rd_kafka_metadata_copy(const struct rd_kafka_metadata *md, size_t size); +/** + * @brief Metadata topic internal container + */ +typedef struct rd_kafka_metadata_topic_internal_s { + /** Internal metadata partition structs. + * same count as metadata.topics[i].partition_cnt. + * Sorted by Partition Id. */ + rd_kafka_metadata_partition_internal_t *partitions; + rd_kafka_Uuid_t topic_id; + int32_t topic_authorized_operations; /**< ACL operations allowed + * for topic, -1 if not + * supported by broker */ + rd_bool_t is_internal; /**< Is topic internal to Kafka? */ +} rd_kafka_metadata_topic_internal_t; + + +/** + * @brief Metadata broker internal container + */ +typedef struct rd_kafka_metadata_broker_internal_s { + /** Broker Id. */ + int32_t id; + /** Rack Id (optional). */ + char *rack_id; +} rd_kafka_metadata_broker_internal_t; + +/** + * @brief Metadata internal container + */ +typedef struct rd_kafka_metadata_internal_s { + rd_kafka_metadata_t + metadata; /**< Public metadata struct. 
Must + be kept the first field so the pointer + can be cast to *rd_kafka_metadata_internal_t + when needed */ + /* Identical to metadata->brokers, but sorted by broker id. */ + struct rd_kafka_metadata_broker *brokers_sorted; + /* Internal metadata brokers. Same count as metadata.broker_cnt. + * Sorted by broker id. */ + rd_kafka_metadata_broker_internal_t *brokers; + /* Internal metadata topics. Same count as metadata.topic_cnt. */ + rd_kafka_metadata_topic_internal_t *topics; + char *cluster_id; /**< Cluster id (optionally populated)*/ + int controller_id; /**< current controller id for cluster, -1 if not + * supported by broker. */ + int32_t cluster_authorized_operations; /**< ACL operations allowed + * for cluster, -1 if not + * supported by broker */ +} rd_kafka_metadata_internal_t; + +/** + * @brief The internal metadata type corresponding to the + * public one. + */ +#define rd_kafka_metadata_get_internal(md) ((rd_kafka_metadata_internal_t *)md) + +rd_bool_t rd_kafka_has_reliable_leader_epochs(rd_kafka_broker_t *rkb); + +rd_kafka_resp_err_t +rd_kafka_parse_Metadata(rd_kafka_broker_t *rkb, + rd_kafka_buf_t *request, + rd_kafka_buf_t *rkbuf, + rd_kafka_metadata_internal_t **mdip); + +rd_kafka_resp_err_t +rd_kafka_parse_Metadata_admin(rd_kafka_broker_t *rkb, + rd_kafka_buf_t *rkbuf, + rd_list_t *request_topics, + rd_kafka_metadata_internal_t **mdip); + +rd_kafka_metadata_internal_t * +rd_kafka_metadata_copy(const rd_kafka_metadata_internal_t *mdi, size_t size); + +rd_kafka_metadata_internal_t * +rd_kafka_metadata_copy_add_racks(const rd_kafka_metadata_internal_t *mdi, + size_t size); size_t rd_kafka_metadata_topic_match(rd_kafka_t *rk, @@ -63,6 +147,7 @@ rd_kafka_metadata_refresh_topics(rd_kafka_t *rk, rd_bool_t force, rd_bool_t allow_auto_create, rd_bool_t cgrp_update, + int32_t cgrp_subscription_version, const char *reason); rd_kafka_resp_err_t rd_kafka_metadata_refresh_known_topics(rd_kafka_t *rk, @@ -86,6 +171,7 @@ rd_kafka_metadata_request(rd_kafka_t *rk, const rd_list_t *topics, rd_bool_t allow_auto_create_topics, rd_bool_t cgrp_update, + int32_t cgrp_subscription_version, const char *reason, rd_kafka_op_t *rko); @@ -93,11 +179,40 @@ rd_kafka_metadata_request(rd_kafka_t *rk, int rd_kafka_metadata_partition_id_cmp(const void *_a, const void *_b); +int rd_kafka_metadata_broker_internal_cmp(const void *_a, const void *_b); + +int rd_kafka_metadata_broker_cmp(const void *_a, const void *_b); + +void rd_kafka_metadata_partition_clear( + struct rd_kafka_metadata_partition *rkmp); + +#define rd_kafka_metadata_broker_internal_find(mdi, broker_id, broker) \ + do { \ + rd_kafka_metadata_broker_internal_t __key = {.id = broker_id}; \ + broker = \ + bsearch(&__key, mdi->brokers, mdi->metadata.broker_cnt, \ + sizeof(rd_kafka_metadata_broker_internal_t), \ + rd_kafka_metadata_broker_internal_cmp); \ + } while (0) + + rd_kafka_metadata_t * rd_kafka_metadata_new_topic_mock(const rd_kafka_metadata_topic_t *topics, - size_t topic_cnt); + size_t topic_cnt, + int replication_factor, + int num_brokers); rd_kafka_metadata_t *rd_kafka_metadata_new_topic_mockv(size_t topic_cnt, ...); - +rd_kafka_metadata_t *rd_kafka_metadata_new_topic_with_partition_replicas_mockv( + int replication_factor, + int num_brokers, + size_t topic_cnt, + ...); +rd_kafka_metadata_t * +rd_kafka_metadata_new_topic_with_partition_replicas_mock(int replication_factor, + int num_brokers, + char *topic_names[], + int *partition_cnts, + size_t topic_cnt); /** * @{ @@ -106,12 +221,17 @@ rd_kafka_metadata_t 
*rd_kafka_metadata_new_topic_mockv(size_t topic_cnt, ...); */ struct rd_kafka_metadata_cache_entry { - rd_avl_node_t rkmce_avlnode; /* rkmc_avl */ + rd_avl_node_t rkmce_avlnode; /* rkmc_avl */ + rd_avl_node_t rkmce_avlnode_by_id; /* rkmc_avl_by_id */ TAILQ_ENTRY(rd_kafka_metadata_cache_entry) rkmce_link; /* rkmc_expiry */ rd_ts_t rkmce_ts_expires; /* Expire time */ rd_ts_t rkmce_ts_insert; /* Insert time */ + /** Last known leader epochs array (same size as the partition count), + * or NULL if not known. */ rd_kafka_metadata_topic_t rkmce_mtopic; /* Cached topic metadata */ - /* rkmce_partitions memory points here. */ + /* Cached internal topic metadata */ + rd_kafka_metadata_topic_internal_t rkmce_metadata_internal_topic; + /* rkmce_topics.partitions memory points here. */ }; @@ -126,6 +246,7 @@ struct rd_kafka_metadata_cache_entry { struct rd_kafka_metadata_cache { rd_avl_t rkmc_avl; + rd_avl_t rkmc_avl_by_id; TAILQ_HEAD(, rd_kafka_metadata_cache_entry) rkmc_expiry; rd_kafka_timer_t rkmc_expiry_tmr; int rkmc_cnt; @@ -151,36 +272,49 @@ struct rd_kafka_metadata_cache { +int rd_kafka_metadata_cache_delete_by_name(rd_kafka_t *rk, const char *topic); +int rd_kafka_metadata_cache_delete_by_topic_id(rd_kafka_t *rk, + const rd_kafka_Uuid_t topic_id); void rd_kafka_metadata_cache_expiry_start(rd_kafka_t *rk); -void rd_kafka_metadata_cache_topic_update(rd_kafka_t *rk, - const rd_kafka_metadata_topic_t *mdt, - rd_bool_t propagate); -void rd_kafka_metadata_cache_update(rd_kafka_t *rk, - const rd_kafka_metadata_t *md, - int abs_update); +int rd_kafka_metadata_cache_purge_all_hints(rd_kafka_t *rk); +int rd_kafka_metadata_cache_topic_update( + rd_kafka_t *rk, + const rd_kafka_metadata_topic_t *mdt, + const rd_kafka_metadata_topic_internal_t *mdit, + rd_bool_t propagate, + rd_bool_t include_metadata, + rd_bool_t has_reliable_leader_epochs); void rd_kafka_metadata_cache_propagate_changes(rd_kafka_t *rk); struct rd_kafka_metadata_cache_entry * rd_kafka_metadata_cache_find(rd_kafka_t *rk, const char *topic, int valid); +struct rd_kafka_metadata_cache_entry * +rd_kafka_metadata_cache_find_by_id(rd_kafka_t *rk, + const rd_kafka_Uuid_t topic_id, + int valid); void rd_kafka_metadata_cache_purge_hints(rd_kafka_t *rk, const rd_list_t *topics); +void rd_kafka_metadata_cache_purge_hints_by_id(rd_kafka_t *rk, + const rd_list_t *topic_ids); int rd_kafka_metadata_cache_hint(rd_kafka_t *rk, const rd_list_t *topics, rd_list_t *dst, - rd_kafka_resp_err_t err, - rd_bool_t replace); + rd_kafka_resp_err_t err); int rd_kafka_metadata_cache_hint_rktparlist( rd_kafka_t *rk, const rd_kafka_topic_partition_list_t *rktparlist, - rd_list_t *dst, - int replace); + rd_list_t *dst); -const rd_kafka_metadata_topic_t * -rd_kafka_metadata_cache_topic_get(rd_kafka_t *rk, const char *topic, int valid); +const rd_kafka_metadata_topic_t *rd_kafka_metadata_cache_topic_get( + rd_kafka_t *rk, + const char *topic, + const rd_kafka_metadata_topic_internal_t **mdtip, + int valid); int rd_kafka_metadata_cache_topic_partition_get( rd_kafka_t *rk, const rd_kafka_metadata_topic_t **mtopicp, const rd_kafka_metadata_partition_t **mpartp, + const rd_kafka_metadata_partition_internal_t **mdpip, const char *topic, int32_t partition, int valid); @@ -189,7 +323,7 @@ int rd_kafka_metadata_cache_topics_count_exists(rd_kafka_t *rk, const rd_list_t *topics, int *metadata_agep); -void rd_kafka_metadata_fast_leader_query(rd_kafka_t *rk); +void rd_kafka_metadata_fast_leader_query(rd_kafka_t *rk, rd_bool_t force); void rd_kafka_metadata_cache_init(rd_kafka_t *rk); 
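Since the cache is now indexed both by topic name and by topic id, id-based lookups follow the same pattern as the name-based ones. A short usage sketch under the documented locking rules (topic_id here would come from a parsed Metadata response):

    struct rd_kafka_metadata_cache_entry *rkmce;

    rd_kafka_rdlock(rk);
    rkmce = rd_kafka_metadata_cache_find_by_id(rk, topic_id, 1 /*valid*/);
    if (rkmce)
            partition_cnt = rkmce->rkmce_mtopic.partition_cnt;
    rd_kafka_rdunlock(rk);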
void rd_kafka_metadata_cache_destroy(rd_kafka_t *rk); @@ -197,11 +331,15 @@ void rd_kafka_metadata_cache_purge(rd_kafka_t *rk, rd_bool_t purge_observers); int rd_kafka_metadata_cache_wait_change(rd_kafka_t *rk, int timeout_ms); void rd_kafka_metadata_cache_dump(FILE *fp, rd_kafka_t *rk); -int rd_kafka_metadata_cache_topics_to_list(rd_kafka_t *rk, rd_list_t *topics); +int rd_kafka_metadata_cache_topics_to_list(rd_kafka_t *rk, + rd_list_t *topics, + rd_bool_t exclude_valid); void rd_kafka_metadata_cache_wait_state_change_async( rd_kafka_t *rk, rd_kafka_enq_once_t *eonce); +rd_kafka_op_res_t +rd_kafka_metadata_update_op(rd_kafka_t *rk, rd_kafka_metadata_internal_t *mdi); /**@}*/ #endif /* _RDKAFKA_METADATA_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_metadata_cache.c b/src/third_party/librdkafka/dist/src/rdkafka_metadata_cache.c index 822d0cb2f19..157a90b20e9 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_metadata_cache.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_metadata_cache.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2013, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -79,8 +80,14 @@ static RD_INLINE void rd_kafka_metadata_cache_delete(rd_kafka_t *rk, struct rd_kafka_metadata_cache_entry *rkmce, int unlink_avl) { - if (unlink_avl) + if (unlink_avl) { RD_AVL_REMOVE_ELM(&rk->rk_metadata_cache.rkmc_avl, rkmce); + if (!RD_KAFKA_UUID_IS_ZERO( + rkmce->rkmce_metadata_internal_topic.topic_id)) { + RD_AVL_REMOVE_ELM(&rk->rk_metadata_cache.rkmc_avl_by_id, + rkmce); + } + } TAILQ_REMOVE(&rk->rk_metadata_cache.rkmc_expiry, rkmce, rkmce_link); rd_kafka_assert(NULL, rk->rk_metadata_cache.rkmc_cnt > 0); rk->rk_metadata_cache.rkmc_cnt--; @@ -93,8 +100,7 @@ rd_kafka_metadata_cache_delete(rd_kafka_t *rk, * @locks rd_kafka_wrlock() * @returns 1 if entry was found and removed, else 0. */ -static int rd_kafka_metadata_cache_delete_by_name(rd_kafka_t *rk, - const char *topic) { +int rd_kafka_metadata_cache_delete_by_name(rd_kafka_t *rk, const char *topic) { struct rd_kafka_metadata_cache_entry *rkmce; rkmce = rd_kafka_metadata_cache_find(rk, topic, 1); @@ -103,6 +109,21 @@ static int rd_kafka_metadata_cache_delete_by_name(rd_kafka_t *rk, return rkmce ? 1 : 0; } +/** + * @brief Delete cache entry by topic id + * @locks rd_kafka_wrlock() + * @returns 1 if entry was found and removed, else 0. + */ +int rd_kafka_metadata_cache_delete_by_topic_id(rd_kafka_t *rk, + const rd_kafka_Uuid_t topic_id) { + struct rd_kafka_metadata_cache_entry *rkmce; + + rkmce = rd_kafka_metadata_cache_find_by_id(rk, topic_id, 1); + if (rkmce) + rd_kafka_metadata_cache_delete(rk, rkmce, 1); + return rkmce ? 1 : 0; +} + static int rd_kafka_metadata_cache_evict(rd_kafka_t *rk); /** @@ -126,7 +147,7 @@ static void rd_kafka_metadata_cache_evict_tmr_cb(rd_kafka_timers_t *rkts, * * @returns the number of entries evicted. * - * @locks rd_kafka_wrlock() + * @locks_required rd_kafka_wrlock() */ static int rd_kafka_metadata_cache_evict(rd_kafka_t *rk) { int cnt = 0; @@ -160,6 +181,32 @@ static int rd_kafka_metadata_cache_evict(rd_kafka_t *rk) { } +/** + * @brief Remove all cache hints. + * This is done when the Metadata response has been parsed and + * replaced hints with existing topic information, so this only + * removes unmatched topics from the cache.
+ * + * @returns the number of purged hints + * + * @locks_required rd_kafka_wrlock() + */ +int rd_kafka_metadata_cache_purge_all_hints(rd_kafka_t *rk) { + int cnt = 0; + struct rd_kafka_metadata_cache_entry *rkmce, *tmp; + + TAILQ_FOREACH_SAFE(rkmce, &rk->rk_metadata_cache.rkmc_expiry, + rkmce_link, tmp) { + if (!RD_KAFKA_METADATA_CACHE_VALID(rkmce)) { + rd_kafka_metadata_cache_delete(rk, rkmce, 1); + cnt++; + } + } + + return cnt; +} + + /** * @brief Find cache entry by topic name * @@ -177,6 +224,25 @@ rd_kafka_metadata_cache_find(rd_kafka_t *rk, const char *topic, int valid) { return NULL; } +/** + * @brief Find cache entry by topic id + * + * @param valid: entry must be valid (not hint) + * + * @locks rd_kafka_*lock() + */ +struct rd_kafka_metadata_cache_entry * +rd_kafka_metadata_cache_find_by_id(rd_kafka_t *rk, + const rd_kafka_Uuid_t topic_id, + int valid) { + struct rd_kafka_metadata_cache_entry skel, *rkmce; + skel.rkmce_metadata_internal_topic.topic_id = topic_id; + rkmce = RD_AVL_FIND(&rk->rk_metadata_cache.rkmc_avl_by_id, &skel); + if (rkmce && (!valid || RD_KAFKA_METADATA_CACHE_VALID(rkmce))) + return rkmce; + return NULL; +} + /** * @brief Partition (id) comparator @@ -186,21 +252,20 @@ int rd_kafka_metadata_partition_id_cmp(const void *_a, const void *_b) { return RD_CMP(a->id, b->id); } - /** - * @brief Add (and replace) cache entry for topic. + * @brief Creates a new metadata cache entry. * - * This makes a copy of \p topic + * @param mdt Topic to insert in the cache entry. + * @param mdti Topic to insert in the cache entry (internal structure). + * @param include_racks Include partition racks. * - * @locks_required rd_kafka_wrlock() + * @return The new metadata cache entry, to free with `rd_free`. */ -static struct rd_kafka_metadata_cache_entry * -rd_kafka_metadata_cache_insert(rd_kafka_t *rk, - const rd_kafka_metadata_topic_t *mtopic, - rd_ts_t now, - rd_ts_t ts_expires) { - struct rd_kafka_metadata_cache_entry *rkmce, *old; - size_t topic_len; +static struct rd_kafka_metadata_cache_entry *rd_kafka_metadata_cache_entry_new( + const rd_kafka_metadata_topic_t *mtopic, + const rd_kafka_metadata_topic_internal_t *metadata_internal_topic, + rd_bool_t include_racks) { + struct rd_kafka_metadata_cache_entry *rkmce; rd_tmpabuf_t tbuf; int i; @@ -209,19 +274,41 @@ rd_kafka_metadata_cache_insert(rd_kafka_t *rk, * rd_tmpabuf_t provides the infrastructure to do this. * Because of this we copy all the structs verbatim but * any pointer fields needs to be copied explicitly to update - * the pointer address. */ - topic_len = strlen(mtopic->topic) + 1; - rd_tmpabuf_new(&tbuf, - RD_ROUNDUP(sizeof(*rkmce), 8) + - RD_ROUNDUP(topic_len, 8) + - (mtopic->partition_cnt * - RD_ROUNDUP(sizeof(*mtopic->partitions), 8)), - 1 /*assert on fail*/); + * the pointer address. + * See also rd_kafka_metadata_cache_delete which frees this. 
*/ + rd_tmpabuf_new(&tbuf, 0, rd_true /*assert on fail*/); + + rd_tmpabuf_add_alloc(&tbuf, sizeof(*rkmce)); + rd_tmpabuf_add_alloc(&tbuf, strlen(mtopic->topic) + 1); + rd_tmpabuf_add_alloc(&tbuf, mtopic->partition_cnt * + sizeof(*mtopic->partitions)); + rd_tmpabuf_add_alloc(&tbuf, + mtopic->partition_cnt * + sizeof(*metadata_internal_topic->partitions)); + + for (i = 0; include_racks && i < mtopic->partition_cnt; i++) { + size_t j; + rd_tmpabuf_add_alloc( + &tbuf, metadata_internal_topic->partitions[i].racks_cnt * + sizeof(char *)); + for (j = 0; + j < metadata_internal_topic->partitions[i].racks_cnt; + j++) { + rd_tmpabuf_add_alloc( + &tbuf, strlen(metadata_internal_topic->partitions[i] + .racks[j]) + + 1); + } + } + + rd_tmpabuf_finalize(&tbuf); rkmce = rd_tmpabuf_alloc(&tbuf, sizeof(*rkmce)); rkmce->rkmce_mtopic = *mtopic; + rkmce->rkmce_metadata_internal_topic = *metadata_internal_topic; + /* Copy topic name and update pointer */ rkmce->rkmce_mtopic.topic = rd_tmpabuf_write_str(&tbuf, mtopic->topic); @@ -230,6 +317,41 @@ rd_kafka_metadata_cache_insert(rd_kafka_t *rk, &tbuf, mtopic->partitions, mtopic->partition_cnt * sizeof(*mtopic->partitions)); + /* Copy partition array (internal) and update pointer */ + rkmce->rkmce_metadata_internal_topic.partitions = + rd_tmpabuf_write(&tbuf, metadata_internal_topic->partitions, + mtopic->partition_cnt * + sizeof(*metadata_internal_topic->partitions)); + + + /* Sort partitions for future bsearch() lookups. */ + qsort(rkmce->rkmce_mtopic.partitions, rkmce->rkmce_mtopic.partition_cnt, + sizeof(*rkmce->rkmce_mtopic.partitions), + rd_kafka_metadata_partition_id_cmp); + + /* partitions (internal) are already sorted. */ + + if (include_racks) { + for (i = 0; i < rkmce->rkmce_mtopic.partition_cnt; i++) { + size_t j; + rd_kafka_metadata_partition_t *mdp = + &rkmce->rkmce_mtopic.partitions[i]; + rd_kafka_metadata_partition_internal_t *mdpi = + &rkmce->rkmce_metadata_internal_topic.partitions[i]; + rd_kafka_metadata_partition_internal_t *mdpi_orig = + &metadata_internal_topic->partitions[i]; + + if (mdp->replica_cnt == 0 || mdpi->racks_cnt == 0) + continue; + + mdpi->racks = rd_tmpabuf_alloc( + &tbuf, sizeof(char *) * mdpi->racks_cnt); + for (j = 0; j < mdpi_orig->racks_cnt; j++) + mdpi->racks[j] = rd_tmpabuf_write_str( + &tbuf, mdpi_orig->racks[j]); + } + } + /* Clear uncached fields. */ for (i = 0; i < mtopic->partition_cnt; i++) { rkmce->rkmce_mtopic.partitions[i].replicas = NULL; @@ -238,11 +360,20 @@ rd_kafka_metadata_cache_insert(rd_kafka_t *rk, rkmce->rkmce_mtopic.partitions[i].isr_cnt = 0; } - /* Sort partitions for future bsearch() lookups. */ - qsort(rkmce->rkmce_mtopic.partitions, rkmce->rkmce_mtopic.partition_cnt, - sizeof(*rkmce->rkmce_mtopic.partitions), - rd_kafka_metadata_partition_id_cmp); + return rkmce; +} +/** + * @brief Add (and replace) cache entry for topic. + * + * @locks_required rd_kafka_wrlock() + */ +static struct rd_kafka_metadata_cache_entry * +rd_kafka_metadata_cache_insert(rd_kafka_t *rk, + struct rd_kafka_metadata_cache_entry *rkmce, + rd_ts_t now, + rd_ts_t ts_expires) { + struct rd_kafka_metadata_cache_entry *old, *old_by_id = NULL; TAILQ_INSERT_TAIL(&rk->rk_metadata_cache.rkmc_expiry, rkmce, rkmce_link); rk->rk_metadata_cache.rkmc_cnt++; @@ -252,14 +383,58 @@ rd_kafka_metadata_cache_insert(rd_kafka_t *rk, /* Insert (and replace existing) entry. */ old = RD_AVL_INSERT(&rk->rk_metadata_cache.rkmc_avl, rkmce, rkmce_avlnode); - if (old) + /* Insert (and replace existing) entry into the AVL tree sorted + * by topic id. 
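The cache entry above is assembled with a two-pass temporary-buffer pattern: pass one registers every needed size with rd_tmpabuf_add_alloc(), rd_tmpabuf_finalize() performs a single allocation, and pass two carves out the same sizes in the same order, so the struct, topic name, partition arrays and rack strings all share one block. A generic sketch of that pattern (the buf_* helpers are illustrative, not the rd_tmpabuf API):

    #include <stdlib.h>

    struct buf { char *mem; size_t size, of; };

    /* Pass 1: accumulate rounded-up sizes. */
    static void buf_reserve(struct buf *b, size_t sz) {
            b->size += (sz + 7) & ~(size_t)7; /* keep 8-byte alignment */
    }
    /* One allocation once every size is known. */
    static int buf_finalize(struct buf *b) {
            b->of = 0;
            return (b->mem = malloc(b->size)) != NULL;
    }
    /* Pass 2: hand out chunks in the same order as pass 1. */
    static void *buf_alloc(struct buf *b, size_t sz) {
            void *p = b->mem + b->of;
            b->of += (sz + 7) & ~(size_t)7;
            return p;
    }

Releasing buf.mem frees every chunk at once, which is why a whole cache entry can be destroyed with a plain rd_free().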
*/ + if (!RD_KAFKA_UUID_IS_ZERO( + rkmce->rkmce_metadata_internal_topic.topic_id)) { + /* If topic id isn't zero, insert cache entry into this tree */ + old_by_id = RD_AVL_INSERT(&rk->rk_metadata_cache.rkmc_avl_by_id, + rkmce, rkmce_avlnode_by_id); + } + if (old && + !RD_KAFKA_UUID_IS_ZERO( + old->rkmce_metadata_internal_topic.topic_id) && + rd_kafka_Uuid_cmp(rkmce->rkmce_metadata_internal_topic.topic_id, + old->rkmce_metadata_internal_topic.topic_id) != + 0) { + /* If it had a different topic id, remove it from the tree */ + RD_AVL_REMOVE_ELM(&rk->rk_metadata_cache.rkmc_avl_by_id, old); + } + if (old) { + /* Delete and free old cache entry */ + rd_kafka_metadata_cache_delete(rk, old, 0); + } + if (old_by_id && old_by_id != old) { + rd_dassert( + !*"Different cache entries for topic name and topic id"); + } /* Explicitly not freeing the tmpabuf since rkmce points to its * memory. */ return rkmce; } +/** + * @brief Add (and replace) cache entry for topic. + * + * This makes a copy of \p mtopic and \p metadata_internal_topic. + * + * @locks_required rd_kafka_wrlock() + */ +static struct rd_kafka_metadata_cache_entry *rd_kafka_metadata_cache_insert_new( + rd_kafka_t *rk, + const rd_kafka_metadata_topic_t *mtopic, + const rd_kafka_metadata_topic_internal_t *metadata_internal_topic, + rd_ts_t now, + rd_ts_t ts_expires, + rd_bool_t include_racks) { + /* Create entry */ + struct rd_kafka_metadata_cache_entry *rkmce = + rd_kafka_metadata_cache_entry_new(mtopic, metadata_internal_topic, + include_racks); + /* Insert/replace entry */ + return rd_kafka_metadata_cache_insert(rk, rkmce, now, ts_expires); +} /** * @brief Purge the metadata cache @@ -300,6 +475,142 @@ void rd_kafka_metadata_cache_expiry_start(rd_kafka_t *rk) { rd_kafka_metadata_cache_evict_tmr_cb, rk); } +#define rd_kafka_metadata_cache_topic_update_replace_partition( \ + current_partition, new_partition, current_partition_cnt, \ + new_partition_cnt, partition) \ + ((partition) < (current_partition_cnt) && \ + (partition) >= (new_partition_cnt) \ + ? rd_false \ + : (partition) < (new_partition_cnt) && \ + (partition) >= (current_partition_cnt) \ + ?
rd_true \ + : (new_partition).leader_epoch == -1 || \ + (new_partition).leader_epoch >= \ + (current_partition.leader_epoch)); + + +static struct rd_kafka_metadata_cache_entry * +rd_kafka_metadata_cache_topic_update_merge_partitions( + rd_kafka_t *rk, + struct rd_kafka_metadata_cache_entry *rkmce_current, + const rd_kafka_metadata_topic_t *mdt, + const rd_kafka_metadata_topic_internal_t *mdti, + rd_bool_t include_racks, + rd_bool_t has_reliable_epochs) { + rd_tmpabuf_t tbuf; + struct rd_kafka_metadata_cache_entry *rkmce; + size_t i, current_partition_cnt, new_partition_cnt, partition_cnt; + + if (!has_reliable_epochs || !rkmce_current || + /* Different topic ids */ + rd_kafka_Uuid_cmp( + mdti->topic_id, + rkmce_current->rkmce_metadata_internal_topic.topic_id) != 0) { + return rd_kafka_metadata_cache_entry_new(mdt, mdti, + include_racks); + } + + current_partition_cnt = rkmce_current->rkmce_mtopic.partition_cnt; + new_partition_cnt = mdt->partition_cnt; + partition_cnt = RD_MAX(current_partition_cnt, new_partition_cnt); + + rd_tmpabuf_new(&tbuf, sizeof(*rkmce), rd_true /*assert on fail*/); + rd_tmpabuf_add_alloc(&tbuf, sizeof(*rkmce)); + rd_tmpabuf_add_alloc(&tbuf, strlen(mdt->topic) + 1); + rd_tmpabuf_add_alloc(&tbuf, partition_cnt * sizeof(*mdt->partitions)); + rd_tmpabuf_add_alloc(&tbuf, partition_cnt * sizeof(*mdti->partitions)); + + for (i = 0; include_racks && i < partition_cnt; i++) { + size_t j; + rd_kafka_metadata_partition_internal_t *partition_internal; + rd_bool_t replace_partition = + rd_kafka_metadata_cache_topic_update_replace_partition( + rkmce_current->rkmce_metadata_internal_topic + .partitions[i], + mdti->partitions[i], current_partition_cnt, + new_partition_cnt, i); + + partition_internal = + replace_partition + ? &mdti->partitions[i] + : &rkmce_current->rkmce_metadata_internal_topic + .partitions[i]; + rd_tmpabuf_add_alloc(&tbuf, partition_internal->racks_cnt * + sizeof(char *)); + for (j = 0; j < partition_internal->racks_cnt; j++) { + rd_tmpabuf_add_alloc( + &tbuf, strlen(partition_internal->racks[j]) + 1); + } + } + + rd_tmpabuf_finalize(&tbuf); + + rkmce = rd_tmpabuf_alloc(&tbuf, sizeof(*rkmce)); + + rkmce->rkmce_mtopic = *mdt; + + rkmce->rkmce_metadata_internal_topic = *mdti; + + /* Copy topic name */ + rkmce->rkmce_mtopic.topic = rd_tmpabuf_write_str(&tbuf, mdt->topic); + + /* Allocate partition array */ + rkmce->rkmce_mtopic.partitions = + rd_tmpabuf_alloc(&tbuf, partition_cnt * sizeof(*mdt->partitions)); + + /* Allocate partition array (internal) */ + rkmce->rkmce_metadata_internal_topic.partitions = + rd_tmpabuf_alloc(&tbuf, partition_cnt * sizeof(*mdti->partitions)); + + for (i = 0; i < partition_cnt; i++) { + struct rd_kafka_metadata_partition *partition; + rd_kafka_metadata_partition_internal_t *partition_internal; + + rd_bool_t replace_partition = + rd_kafka_metadata_cache_topic_update_replace_partition( + rkmce_current->rkmce_metadata_internal_topic + .partitions[i], + mdti->partitions[i], current_partition_cnt, + new_partition_cnt, i); + + if (replace_partition) { + partition = &mdt->partitions[i]; + partition_internal = &mdti->partitions[i]; + } else { + partition = &rkmce_current->rkmce_mtopic.partitions[i]; + partition_internal = + &rkmce_current->rkmce_metadata_internal_topic + .partitions[i]; + } + + rkmce->rkmce_mtopic.partitions[i] = *partition; + rkmce->rkmce_metadata_internal_topic.partitions[i] = + *partition_internal; + + if (include_racks) { + size_t j; + rkmce->rkmce_metadata_internal_topic.partitions[i] + .racks = rd_tmpabuf_alloc( + &tbuf, 
+ partition_internal->racks_cnt * sizeof(char *)); + rkmce->rkmce_metadata_internal_topic.partitions[i] + .racks_cnt = partition_internal->racks_cnt; + for (j = 0; j < partition_internal->racks_cnt; j++) { + rkmce->rkmce_metadata_internal_topic + .partitions[i] + .racks[j] = rd_tmpabuf_write_str( + &tbuf, partition_internal->racks[j]); + } + } else { + rkmce->rkmce_metadata_internal_topic.partitions[i] + .racks = NULL; + rkmce->rkmce_metadata_internal_topic.partitions[i] + .racks_cnt = 0; + } + } + return rkmce; +} + /** * @brief Update the metadata cache for a single topic * with the provided metadata. @@ -316,72 +627,87 @@ void rd_kafka_metadata_cache_expiry_start(rd_kafka_t *rk) { * For permanent errors (authorization failures), we keep * the entry cached for metadata.max.age.ms. * + * @param mdt Topic to insert in the cache entry. + * @param mdti Topic to insert in the cache entry (internal structure). + * @param propagate Propagate metadata cache changes now. + * @param include_racks Include partition racks. + * @param has_reliable_leader_epochs The response comes from a broker with + * reliable leader epochs. + * + * @return 1 on metadata change, 0 when no change was applied + * * @remark The cache expiry timer will not be updated/started, * call rd_kafka_metadata_cache_expiry_start() instead. * * @locks rd_kafka_wrlock() */ -void rd_kafka_metadata_cache_topic_update(rd_kafka_t *rk, - const rd_kafka_metadata_topic_t *mdt, - rd_bool_t propagate) { +int rd_kafka_metadata_cache_topic_update( + rd_kafka_t *rk, + const rd_kafka_metadata_topic_t *mdt, + const rd_kafka_metadata_topic_internal_t *mdti, + rd_bool_t propagate, + rd_bool_t include_racks, + rd_bool_t has_reliable_leader_epochs) { rd_ts_t now = rd_clock(); rd_ts_t ts_expires = now + (rk->rk_conf.metadata_max_age_ms * 1000); - int changed = 1; + int changed = 0; - /* Cache unknown topics for a short while (100ms) to allow the cgrp - * logic to find negative cache hits. */ - if (mdt->err == RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART) - ts_expires = RD_MIN(ts_expires, now + (100 * 1000)); + if (likely(mdt->topic != NULL)) { + struct rd_kafka_metadata_cache_entry *rkmce, + *rkmce_current = NULL; + rd_kafka_metadata_topic_internal_t mdti_copy = *mdti; + switch (mdt->err) { + case RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART: + /* Cache unknown topics for metadata.propagation.max.ms + * to allow the cgrp logic to find negative cache hits, + * and to avoid false reappearances of the topic + * after deletion.
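Unknown topics are kept as short-lived negative cache entries: the expiry computed from metadata.max.age.ms is clamped down (100 ms here) so a deleted topic neither lingers nor causes an immediate re-lookup by the group logic. A sketch of that clamping with illustrative constants, not the client's actual configuration plumbing:

    #include <stdint.h>

    #define MAX_AGE_US      (900000LL * 1000) /* e.g. metadata.max.age.ms */
    #define NEGATIVE_TTL_US (100LL * 1000)    /* short negative-entry TTL */

    static int64_t entry_expiry(int64_t now_us, int topic_is_unknown) {
            int64_t ts = now_us + MAX_AGE_US;
            if (topic_is_unknown && now_us + NEGATIVE_TTL_US < ts)
                    ts = now_us + NEGATIVE_TTL_US;
            return ts;
    }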
*/ + ts_expires = RD_MIN(ts_expires, now + (100 * 1000)); - if (!mdt->err || - mdt->err == RD_KAFKA_RESP_ERR_TOPIC_AUTHORIZATION_FAILED || - mdt->err == RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART) - rd_kafka_metadata_cache_insert(rk, mdt, now, ts_expires); - else - changed = - rd_kafka_metadata_cache_delete_by_name(rk, mdt->topic); + /* Continue */ + case RD_KAFKA_RESP_ERR_NO_ERROR: + rkmce_current = + rd_kafka_metadata_cache_find(rk, mdt->topic, 1); + if (mdt->err == + RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART && + rkmce_current && + RD_KAFKA_UUID_IS_ZERO(mdti->topic_id) && + !RD_KAFKA_UUID_IS_ZERO( + rkmce_current->rkmce_metadata_internal_topic + .topic_id)) { + /* Keep the existing topic id to detect + * if the same id is received again + * as existing */ + mdti_copy.topic_id = + rkmce_current->rkmce_metadata_internal_topic + .topic_id; + } + + /* Continue */ + case RD_KAFKA_RESP_ERR_TOPIC_AUTHORIZATION_FAILED: + rkmce = + rd_kafka_metadata_cache_topic_update_merge_partitions( + rk, rkmce_current, mdt, &mdti_copy, + include_racks, has_reliable_leader_epochs); + /* Insert/replace entry */ + rd_kafka_metadata_cache_insert(rk, rkmce, now, + ts_expires); + changed = 1; + break; + default: + break; + } + } else { + /* Cache entry found but no topic name: + * delete it. */ + changed = rd_kafka_metadata_cache_delete_by_topic_id( + rk, mdti->topic_id); + } if (changed && propagate) rd_kafka_metadata_cache_propagate_changes(rk); -} - -/** - * @brief Update the metadata cache with the provided metadata. - * - * @param abs_update int: absolute update: purge cache before updating. - * - * @locks rd_kafka_wrlock() - */ -void rd_kafka_metadata_cache_update(rd_kafka_t *rk, - const rd_kafka_metadata_t *md, - int abs_update) { - struct rd_kafka_metadata_cache_entry *rkmce; - rd_ts_t now = rd_clock(); - rd_ts_t ts_expires = now + (rk->rk_conf.metadata_max_age_ms * 1000); - int i; - - rd_kafka_dbg(rk, METADATA, "METADATA", - "%s of metadata cache with %d topic(s)", - abs_update ? "Absolute update" : "Update", md->topic_cnt); - - if (abs_update) - rd_kafka_metadata_cache_purge(rk, rd_false /*not observers*/); - - - for (i = 0; i < md->topic_cnt; i++) - rd_kafka_metadata_cache_insert(rk, &md->topics[i], now, - ts_expires); - - /* Update expiry timer */ - if ((rkmce = TAILQ_FIRST(&rk->rk_metadata_cache.rkmc_expiry))) - rd_kafka_timer_start(&rk->rk_timers, - &rk->rk_metadata_cache.rkmc_expiry_tmr, - rkmce->rkmce_ts_expires - now, - rd_kafka_metadata_cache_evict_tmr_cb, rk); - - if (md->topic_cnt > 0 || abs_update) - rd_kafka_metadata_cache_propagate_changes(rk); + return changed; } @@ -419,6 +745,40 @@ void rd_kafka_metadata_cache_purge_hints(rd_kafka_t *rk, } } +/** + * @brief Remove cache hints for topic ids in \p topic_ids + * This is done when the Metadata response has been parsed and + * replaced hints with existing topic information, thus this will + * only remove unmatched topics from the cache. 
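Because the function now reports whether it changed anything, a caller applying a whole Metadata response can pass propagate as rd_false for each topic and notify waiters once at the end. A hypothetical caller-side sketch built on the signatures above (this helper and the mdi->topics iteration are assumptions, not existing librdkafka code):

    static void
    apply_metadata_response(rd_kafka_t *rk,
                            const rd_kafka_metadata_t *md,
                            const rd_kafka_metadata_internal_t *mdi,
                            rd_bool_t include_racks,
                            rd_bool_t has_reliable_leader_epochs) {
            int changed = 0, i;

            rd_kafka_wrlock(rk);
            for (i = 0; i < md->topic_cnt; i++)
                    changed += rd_kafka_metadata_cache_topic_update(
                        rk, &md->topics[i], &mdi->topics[i],
                        rd_false /*propagate once, below*/, include_racks,
                        has_reliable_leader_epochs);
            if (changed)
                    rd_kafka_metadata_cache_propagate_changes(rk);
            rd_kafka_wrunlock(rk);
    }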
+ * + * @locks rd_kafka_wrlock() + */ +void rd_kafka_metadata_cache_purge_hints_by_id(rd_kafka_t *rk, + const rd_list_t *topic_ids) { + const rd_kafka_Uuid_t *topic_id; + int i; + int cnt = 0; + + RD_LIST_FOREACH(topic_id, topic_ids, i) { + struct rd_kafka_metadata_cache_entry *rkmce; + + if (!(rkmce = rd_kafka_metadata_cache_find_by_id(rk, *topic_id, + 0 /*any*/)) || + RD_KAFKA_METADATA_CACHE_VALID(rkmce)) + continue; + + rd_kafka_metadata_cache_delete(rk, rkmce, 1 /*unlink avl*/); + cnt++; + } + + if (cnt > 0) { + rd_kafka_dbg(rk, METADATA, "METADATA", + "Purged %d/%d cached topic hint(s)", cnt, + rd_list_cnt(topic_ids)); + rd_kafka_metadata_cache_propagate_changes(rk); + } +} + /** * @brief Inserts a non-valid entry for topics in \p topics indicating @@ -435,7 +795,6 @@ void rd_kafka_metadata_cache_purge_hints(rd_kafka_t *rk, * @param dst rd_list_t(char *topicname) * @param err is the error to set on hint cache entries, * typically ERR__WAIT_CACHE. - * @param replace replace existing valid entries * * @returns the number of topic hints inserted. * @@ -444,8 +803,7 @@ void rd_kafka_metadata_cache_purge_hints(rd_kafka_t *rk, int rd_kafka_metadata_cache_hint(rd_kafka_t *rk, const rd_list_t *topics, rd_list_t *dst, - rd_kafka_resp_err_t err, - rd_bool_t replace) { + rd_kafka_resp_err_t err) { const char *topic; rd_ts_t now = rd_clock(); rd_ts_t ts_expires = now + (rk->rk_conf.socket_timeout_ms * 1000); @@ -455,11 +813,12 @@ int rd_kafka_metadata_cache_hint(rd_kafka_t *rk, RD_LIST_FOREACH(topic, topics, i) { rd_kafka_metadata_topic_t mtopic = {.topic = (char *)topic, .err = err}; + rd_kafka_metadata_topic_internal_t metadata_internal_topic = + RD_ZERO_INIT; /*const*/ struct rd_kafka_metadata_cache_entry *rkmce; - /* !replace: Dont overwrite valid entries */ - if (!replace && (rkmce = rd_kafka_metadata_cache_find( - rk, topic, 0 /*any*/))) { + if ((rkmce = + rd_kafka_metadata_cache_find(rk, topic, 0 /*any*/))) { if (RD_KAFKA_METADATA_CACHE_VALID(rkmce) || (dst && rkmce->rkmce_mtopic.err != RD_KAFKA_RESP_ERR__NOENT)) @@ -468,7 +827,9 @@ int rd_kafka_metadata_cache_hint(rd_kafka_t *rk, /* FALLTHRU */ } - rd_kafka_metadata_cache_insert(rk, &mtopic, now, ts_expires); + rd_kafka_metadata_cache_insert_new(rk, &mtopic, + &metadata_internal_topic, + now, ts_expires, rd_false); cnt++; if (dst) @@ -493,8 +854,7 @@ int rd_kafka_metadata_cache_hint(rd_kafka_t *rk, int rd_kafka_metadata_cache_hint_rktparlist( rd_kafka_t *rk, const rd_kafka_topic_partition_list_t *rktparlist, - rd_list_t *dst, - int replace) { + rd_list_t *dst) { rd_list_t topics; int r; @@ -502,8 +862,8 @@ int rd_kafka_metadata_cache_hint_rktparlist( rd_kafka_topic_partition_list_get_topic_names(rktparlist, &topics, 0 /*dont include regex*/); rd_kafka_wrlock(rk); - r = rd_kafka_metadata_cache_hint( - rk, &topics, dst, RD_KAFKA_RESP_ERR__WAIT_CACHE, replace); + r = rd_kafka_metadata_cache_hint(rk, &topics, dst, + RD_KAFKA_RESP_ERR__WAIT_CACHE); rd_kafka_wrunlock(rk); rd_list_destroy(&topics); @@ -519,6 +879,16 @@ static int rd_kafka_metadata_cache_entry_cmp(const void *_a, const void *_b) { return strcmp(a->rkmce_mtopic.topic, b->rkmce_mtopic.topic); } +/** + * @brief Cache entry comparator (on topic id) + */ +static int rd_kafka_metadata_cache_entry_by_id_cmp(const void *_a, + const void *_b) { + const struct rd_kafka_metadata_cache_entry *a = _a, *b = _b; + return rd_kafka_Uuid_cmp(a->rkmce_metadata_internal_topic.topic_id, + b->rkmce_metadata_internal_topic.topic_id); +} + /** * @brief Initialize the metadata cache @@ -528,6 +898,8 @@ 
static int rd_kafka_metadata_cache_entry_cmp(const void *_a, const void *_b) { void rd_kafka_metadata_cache_init(rd_kafka_t *rk) { rd_avl_init(&rk->rk_metadata_cache.rkmc_avl, rd_kafka_metadata_cache_entry_cmp, 0); + rd_avl_init(&rk->rk_metadata_cache.rkmc_avl_by_id, + rd_kafka_metadata_cache_entry_by_id_cmp, 0); TAILQ_INIT(&rk->rk_metadata_cache.rkmc_expiry); mtx_init(&rk->rk_metadata_cache.rkmc_full_lock, mtx_plain); mtx_init(&rk->rk_metadata_cache.rkmc_cnd_lock, mtx_plain); @@ -550,6 +922,7 @@ void rd_kafka_metadata_cache_destroy(rd_kafka_t *rk) { mtx_destroy(&rk->rk_metadata_cache.rkmc_cnd_lock); cnd_destroy(&rk->rk_metadata_cache.rkmc_cnd); rd_avl_destroy(&rk->rk_metadata_cache.rkmc_avl); + rd_avl_destroy(&rk->rk_metadata_cache.rkmc_avl_by_id); } @@ -626,20 +999,27 @@ void rd_kafka_metadata_cache_propagate_changes(rd_kafka_t *rk) { } /** + * @param mdtip If non NULL, it's set to a pointer to internal topic metadata, + * or to NULL if not found in cache. * @returns the shared metadata for a topic, or NULL if not found in * cache. * * @locks rd_kafka_*lock() */ -const rd_kafka_metadata_topic_t * -rd_kafka_metadata_cache_topic_get(rd_kafka_t *rk, - const char *topic, - int valid) { +const rd_kafka_metadata_topic_t *rd_kafka_metadata_cache_topic_get( + rd_kafka_t *rk, + const char *topic, + const rd_kafka_metadata_topic_internal_t **mdtip, + int valid) { struct rd_kafka_metadata_cache_entry *rkmce; - if (!(rkmce = rd_kafka_metadata_cache_find(rk, topic, valid))) + if (!(rkmce = rd_kafka_metadata_cache_find(rk, topic, valid))) { + if (mdtip) + *mdtip = NULL; return NULL; - + } + if (mdtip) + *mdtip = &rkmce->rkmce_metadata_internal_topic; return &rkmce->rkmce_mtopic; } @@ -653,6 +1033,7 @@ rd_kafka_metadata_cache_topic_get(rd_kafka_t *rk, * * @param mtopicp: pointer to topic metadata * @param mpartp: pointer to partition metadata + * @param mdpip: pointer to internal partition metadata * @param valid: only return valid entries (no hints) * * @returns -1 if topic was not found in cache, 0 if topic was found @@ -664,18 +1045,22 @@ int rd_kafka_metadata_cache_topic_partition_get( rd_kafka_t *rk, const rd_kafka_metadata_topic_t **mtopicp, const rd_kafka_metadata_partition_t **mpartp, + const rd_kafka_metadata_partition_internal_t **mdpip, const char *topic, int32_t partition, int valid) { const rd_kafka_metadata_topic_t *mtopic; + const rd_kafka_metadata_topic_internal_t *mdti; const rd_kafka_metadata_partition_t *mpart; rd_kafka_metadata_partition_t skel = {.id = partition}; *mtopicp = NULL; *mpartp = NULL; + *mdpip = NULL; - if (!(mtopic = rd_kafka_metadata_cache_topic_get(rk, topic, valid))) + if (!(mtopic = + rd_kafka_metadata_cache_topic_get(rk, topic, &mdti, valid))) return -1; *mtopicp = mtopic; @@ -692,6 +1077,8 @@ int rd_kafka_metadata_cache_topic_partition_get( return 0; *mpartp = mpart; + if (mdpip) + *mdpip = &mdti->partitions[mpart->id]; return 1; } @@ -738,17 +1125,21 @@ int rd_kafka_metadata_cache_topics_count_exists(rd_kafka_t *rk, * * Element type is (char *topic_name). * + * @param exclude_valid Exclude topics that have up to date metadata info. 
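A hypothetical caller of the extended rd_kafka_metadata_cache_topic_get() above; the internal topic metadata (topic id, per-partition rack data) is returned through the new out parameter, and callers that only need the public view may pass NULL. The function name, topic string and printf are purely illustrative:

    static void print_cached_partition_cnt(rd_kafka_t *rk) {
            const rd_kafka_metadata_topic_t *mdt;
            const rd_kafka_metadata_topic_internal_t *mdti = NULL;

            rd_kafka_rdlock(rk);
            mdt = rd_kafka_metadata_cache_topic_get(rk, "mytopic", &mdti,
                                                    1 /*valid only*/);
            if (mdt && mdti)
                    printf("%s: %d partition(s)\n", mdt->topic,
                           mdt->partition_cnt);
            rd_kafka_rdunlock(rk);
    }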
+ * * @returns the number of elements added to \p topics * * @locks_required rd_kafka_*lock() */ -int rd_kafka_metadata_cache_topics_to_list(rd_kafka_t *rk, rd_list_t *topics) { +int rd_kafka_metadata_cache_topics_to_list(rd_kafka_t *rk, + rd_list_t *topics, + rd_bool_t exclude_valid) { const struct rd_kafka_metadata_cache_entry *rkmce; int precnt = rd_list_cnt(topics); TAILQ_FOREACH(rkmce, &rk->rk_metadata_cache.rkmc_expiry, rkmce_link) { /* Ignore topics that have up to date metadata info */ - if (RD_KAFKA_METADATA_CACHE_VALID(rkmce)) + if (exclude_valid && RD_KAFKA_METADATA_CACHE_VALID(rkmce)) continue; if (rd_list_find(topics, rkmce->rkmce_mtopic.topic, diff --git a/src/third_party/librdkafka/dist/src/rdkafka_mock.c b/src/third_party/librdkafka/dist/src/rdkafka_mock.c index e28c66484ef..154d10580ab 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_mock.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_mock.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,11 +38,19 @@ #include "rdkafka_interceptor.h" #include "rdkafka_mock_int.h" #include "rdkafka_transport_int.h" +#include "rdkafka_mock.h" +#include "rdunittest.h" #include -static void rd_kafka_mock_cluster_destroy0(rd_kafka_mock_cluster_t *mcluster); +typedef struct rd_kafka_mock_request_s rd_kafka_mock_request_t; +static void rd_kafka_mock_cluster_destroy0(rd_kafka_mock_cluster_t *mcluster); +static rd_kafka_mock_request_t * +rd_kafka_mock_request_new(int32_t id, int16_t api_key, int64_t timestamp_us); +static void rd_kafka_mock_request_free(void *element); +static void rd_kafka_mock_coord_remove(rd_kafka_mock_cluster_t *mcluster, + int32_t broker_id); static rd_kafka_mock_broker_t * rd_kafka_mock_broker_find(const rd_kafka_mock_cluster_t *mcluster, @@ -93,6 +102,7 @@ rd_kafka_mock_msgset_new(rd_kafka_mock_partition_t *mpart, rd_kafka_mock_msgset_t *mset; size_t totsize = sizeof(*mset) + RD_KAFKAP_BYTES_LEN(bytes); int64_t BaseOffset; + int32_t PartitionLeaderEpoch; int64_t orig_start_offset = mpart->start_offset; rd_assert(!RD_KAFKAP_BYTES_IS_NULL(bytes)); @@ -107,7 +117,8 @@ rd_kafka_mock_msgset_new(rd_kafka_mock_partition_t *mpart, mpart->follower_end_offset = mpart->end_offset; mpart->cnt++; - mset->bytes.len = bytes->len; + mset->bytes.len = bytes->len; + mset->leader_epoch = mpart->leader_epoch; mset->bytes.data = (void *)(mset + 1); @@ -118,7 +129,11 @@ rd_kafka_mock_msgset_new(rd_kafka_mock_partition_t *mpart, * actual absolute log offset. */ BaseOffset = htobe64(mset->first_offset); memcpy((void *)mset->bytes.data, &BaseOffset, sizeof(BaseOffset)); - + /* Update the base PartitionLeaderEpoch in the MessageSet with the + * actual partition leader epoch. */ + PartitionLeaderEpoch = htobe32(mset->leader_epoch); + memcpy(((char *)mset->bytes.data) + 12, &PartitionLeaderEpoch, + sizeof(PartitionLeaderEpoch)); /* Remove old msgsets until within limits */ while (mpart->cnt > 1 && @@ -365,6 +380,52 @@ static void rd_kafka_mock_partition_set_leader0(rd_kafka_mock_partition_t *mpart, rd_kafka_mock_broker_t *mrkb) { mpart->leader = mrkb; + mpart->leader_epoch++; +} + + +/** + * @brief Verifies that the client-provided leader_epoch matches that of the + * partition, else returns the appropriate error. 
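That check distinguishes three cases: an epoch of -1 skips validation entirely, a client epoch newer than the partition's yields UNKNOWN_LEADER_EPOCH (the broker lags behind the client), and an older one yields FENCED_LEADER_EPOCH (the client is stale). A self-contained restatement with illustrative enum names rather than the rd_kafka_resp_err_t values the real function returns:

    #include <assert.h>

    enum { OK, UNKNOWN_LEADER_EPOCH, FENCED_LEADER_EPOCH };

    static int epoch_check(int partition_epoch, int client_epoch) {
            if (client_epoch == -1 || partition_epoch == client_epoch)
                    return OK;
            return partition_epoch < client_epoch ? UNKNOWN_LEADER_EPOCH
                                                  : FENCED_LEADER_EPOCH;
    }

    int main(void) {
            assert(epoch_check(5, -1) == OK); /* validation not requested */
            assert(epoch_check(5, 5) == OK);
            assert(epoch_check(5, 7) == UNKNOWN_LEADER_EPOCH); /* ahead */
            assert(epoch_check(5, 3) == FENCED_LEADER_EPOCH);  /* stale */
            return 0;
    }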
*/ +rd_kafka_resp_err_t rd_kafka_mock_partition_leader_epoch_check( + const rd_kafka_mock_partition_t *mpart, + int32_t leader_epoch) { + if (likely(leader_epoch == -1 || mpart->leader_epoch == leader_epoch)) + return RD_KAFKA_RESP_ERR_NO_ERROR; + else if (mpart->leader_epoch < leader_epoch) + return RD_KAFKA_RESP_ERR_UNKNOWN_LEADER_EPOCH; + else if (mpart->leader_epoch > leader_epoch) + return RD_KAFKA_RESP_ERR_FENCED_LEADER_EPOCH; + + /* NOTREACHED, but avoids warning */ + return RD_KAFKA_RESP_ERR_NO_ERROR; +} + +/** + * @brief Returns the end offset (last offset + 1) + * for the passed leader epoch in the mock partition. + * + * @param mpart The mock partition + * @param leader_epoch The leader epoch + * + * @return The end offset for the passed \p leader_epoch in \p mpart + */ +int64_t rd_kafka_mock_partition_offset_for_leader_epoch( + const rd_kafka_mock_partition_t *mpart, + int32_t leader_epoch) { + const rd_kafka_mock_msgset_t *mset = NULL; + + if (leader_epoch < 0) + return -1; + + TAILQ_FOREACH_REVERSE(mset, &mpart->msgsets, + rd_kafka_mock_msgset_tailq_s, link) { + if (mset->leader_epoch == leader_epoch) + return mset->last_offset + 1; + } + + return -1; } @@ -372,20 +433,41 @@ * @brief Automatically assign replicas for partition */ static void -rd_kafka_mock_partition_assign_replicas(rd_kafka_mock_partition_t *mpart) { +rd_kafka_mock_partition_assign_replicas(rd_kafka_mock_partition_t *mpart, + int replication_factor) { rd_kafka_mock_cluster_t *mcluster = mpart->topic->cluster; - int replica_cnt = - RD_MIN(mcluster->defaults.replication_factor, mcluster->broker_cnt); + int replica_cnt = RD_MIN(replication_factor, mcluster->broker_cnt); rd_kafka_mock_broker_t *mrkb; int i = 0; + int first_replica; + int skipped = 0; if (mpart->replicas) rd_free(mpart->replicas); - mpart->replicas = rd_calloc(replica_cnt, sizeof(*mpart->replicas)); + mpart->replicas = replica_cnt + ? rd_calloc(replica_cnt, sizeof(*mpart->replicas)) + : NULL; mpart->replica_cnt = replica_cnt; + if (replica_cnt == 0) { + rd_kafka_mock_partition_set_leader0(mpart, NULL); + return; + } - /* FIXME: randomize this using perhaps reservoir sampling */ + first_replica = (mpart->id * replication_factor) % mcluster->broker_cnt; + + /* Use a predictable, deterministic order on a per-topic basis. + * + * Two loops are needed for wraparound. */ + TAILQ_FOREACH(mrkb, &mcluster->brokers, link) { + if (skipped < first_replica) { + skipped++; + continue; + } + if (i == mpart->replica_cnt) + break; + mpart->replicas[i++] = mrkb; + } TAILQ_FOREACH(mrkb, &mcluster->brokers, link) { if (i == mpart->replica_cnt) break; @@ -397,7 +479,39 @@ rd_kafka_mock_partition_assign_replicas(rd_kafka_mock_partition_t *mpart) { mpart, mpart->replicas[rd_jitter(0, replica_cnt - 1)]); } +/** + * @brief Push a partition leader response to the passed \p mpart . + */ +static void +rd_kafka_mock_partition_push_leader_response0(rd_kafka_mock_partition_t *mpart, + int32_t leader_id, + int32_t leader_epoch) { + rd_kafka_mock_partition_leader_t *leader_response; + leader_response = rd_calloc(1, sizeof(*leader_response)); + leader_response->leader_id = leader_id; + leader_response->leader_epoch = leader_epoch; + TAILQ_INSERT_TAIL(&mpart->leader_responses, leader_response, link); +} + +/** + * @brief Return the first mocked partition leader response in \p mpart , + * if available.
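The replica assignment above replaces the old FIXME randomization with a deterministic start index, (mpart->id * replication_factor) % broker_cnt, so placements rotate across brokers while remaining reproducible between runs. A worked example with 3 brokers and replication factor 2; the modulo walk is equivalent to the two wraparound loops:

    #include <stdio.h>

    int main(void) {
            int broker_cnt = 3, rf = 2, p, r;

            for (p = 0; p < 4; p++) {
                    int first = (p * rf) % broker_cnt;
                    printf("partition %d -> broker indexes", p);
                    for (r = 0; r < rf; r++)
                            printf(" %d", (first + r) % broker_cnt);
                    printf("\n"); /* p0: 0 1, p1: 2 0, p2: 1 2, p3: 0 1 */
            }
            return 0;
    }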
+ */ +rd_kafka_mock_partition_leader_t * +rd_kafka_mock_partition_next_leader_response(rd_kafka_mock_partition_t *mpart) { + return TAILQ_FIRST(&mpart->leader_responses); +} + +/** + * @brief Unlink and destroy a partition leader response + */ +void rd_kafka_mock_partition_leader_destroy( + rd_kafka_mock_partition_t *mpart, + rd_kafka_mock_partition_leader_t *mpart_leader) { + TAILQ_REMOVE(&mpart->leader_responses, mpart_leader, link); + rd_free(mpart_leader); +} /** * @brief Unlink and destroy committed offset @@ -474,6 +588,7 @@ rd_kafka_mock_commit_offset(rd_kafka_mock_partition_t *mpart, static void rd_kafka_mock_partition_destroy(rd_kafka_mock_partition_t *mpart) { rd_kafka_mock_msgset_t *mset, *tmp; rd_kafka_mock_committed_offset_t *coff, *tmpcoff; + rd_kafka_mock_partition_leader_t *mpart_leader, *tmp_mpart_leader; TAILQ_FOREACH_SAFE(mset, &mpart->msgsets, link, tmp) rd_kafka_mock_msgset_destroy(mpart, mset); @@ -481,6 +596,10 @@ static void rd_kafka_mock_partition_destroy(rd_kafka_mock_partition_t *mpart) { TAILQ_FOREACH_SAFE(coff, &mpart->committed_offsets, link, tmpcoff) rd_kafka_mock_committed_offset_destroy(mpart, coff); + TAILQ_FOREACH_SAFE(mpart_leader, &mpart->leader_responses, link, + tmp_mpart_leader) + rd_kafka_mock_partition_leader_destroy(mpart, mpart_leader); + rd_list_destroy(&mpart->pidstates); rd_free(mpart->replicas); @@ -494,7 +613,9 @@ static void rd_kafka_mock_partition_init(rd_kafka_mock_topic_t *mtopic, mpart->topic = mtopic; mpart->id = id; - mpart->follower_id = -1; + mpart->follower_id = -1; + mpart->leader_epoch = -1; /* Start at -1 since assign_replicas() will + * bump it right away to 0. */ TAILQ_INIT(&mpart->msgsets); @@ -505,16 +626,17 @@ static void rd_kafka_mock_partition_init(rd_kafka_mock_topic_t *mtopic, mpart->update_follower_end_offset = rd_true; TAILQ_INIT(&mpart->committed_offsets); + TAILQ_INIT(&mpart->leader_responses); rd_list_init(&mpart->pidstates, 0, rd_free); - rd_kafka_mock_partition_assign_replicas(mpart); + rd_kafka_mock_partition_assign_replicas(mpart, replication_factor); } rd_kafka_mock_partition_t * rd_kafka_mock_partition_find(const rd_kafka_mock_topic_t *mtopic, int32_t partition) { - if (partition < 0 || partition >= mtopic->partition_cnt) + if (!mtopic || partition < 0 || partition >= mtopic->partition_cnt) return NULL; return (rd_kafka_mock_partition_t *)&mtopic->partitions[partition]; @@ -544,7 +666,9 @@ rd_kafka_mock_topic_new(rd_kafka_mock_cluster_t *mcluster, rd_kafka_mock_topic_t *mtopic; int i; - mtopic = rd_calloc(1, sizeof(*mtopic)); + mtopic = rd_calloc(1, sizeof(*mtopic)); + /* Assign random topic id */ + mtopic->id = rd_kafka_Uuid_random(); mtopic->name = rd_strdup(topic); mtopic->cluster = mcluster; @@ -597,6 +721,28 @@ rd_kafka_mock_topic_find_by_kstr(const rd_kafka_mock_cluster_t *mcluster, return NULL; } +/** + * @brief Find a mock topic by id. + * + * @param mcluster Cluster to search in. + * @param id Topic id to find. + * @return Found topic or NULL. + * + * @locks mcluster->lock MUST be held. + */ +rd_kafka_mock_topic_t * +rd_kafka_mock_topic_find_by_id(const rd_kafka_mock_cluster_t *mcluster, + rd_kafka_Uuid_t id) { + const rd_kafka_mock_topic_t *mtopic; + + TAILQ_FOREACH(mtopic, &mcluster->topics, link) { + if (!rd_kafka_Uuid_cmp(mtopic->id, id)) + return (rd_kafka_mock_topic_t *)mtopic; + } + + return NULL; +} + /** * @brief Create a topic using default settings. 
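One detail from the message-set hunk earlier is worth recording: the byte offset 12 follows from the Kafka RecordBatch header layout, which begins with BaseOffset (int64) at byte 0, then BatchLength (int32) at byte 8, then PartitionLeaderEpoch (int32) at byte 12, which is why BaseOffset is rewritten at the start of the buffer and the epoch 12 bytes in. A sketch of the same in-place patch, assuming a POSIX htonl for the big-endian conversion:

    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    /* RecordBatch header prefix:
     *   BaseOffset           int64  @ byte 0
     *   BatchLength          int32  @ byte 8
     *   PartitionLeaderEpoch int32  @ byte 12 */
    static void patch_partition_leader_epoch(char *batch, int32_t epoch) {
            uint32_t be = htonl((uint32_t)epoch);
            memcpy(batch + 12, &be, sizeof(be));
    }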
@@ -782,6 +928,23 @@ static void rd_kafka_mock_cluster_io_add(rd_kafka_mock_cluster_t *mcluster, mcluster->fd_cnt++; } +/** + * @brief Reassign partition replicas to broker, after deleting or + * adding a new one. + */ +static void +rd_kafka_mock_cluster_reassign_partitions(rd_kafka_mock_cluster_t *mcluster) { + rd_kafka_mock_topic_t *mtopic; + TAILQ_FOREACH(mtopic, &mcluster->topics, link) { + int i; + for (i = 0; i < mtopic->partition_cnt; i++) { + rd_kafka_mock_partition_t *mpart = + &mtopic->partitions[i]; + rd_kafka_mock_partition_assign_replicas( + mpart, mpart->replica_cnt); + } + } +} static void rd_kafka_mock_connection_close(rd_kafka_mock_connection_t *mconn, const char *reason) { @@ -813,11 +976,11 @@ static void rd_kafka_mock_connection_close(rd_kafka_mock_connection_t *mconn, rd_free(mconn); } +void rd_kafka_mock_connection_send_response0(rd_kafka_mock_connection_t *mconn, + rd_kafka_buf_t *resp, + rd_bool_t tags_written) { -void rd_kafka_mock_connection_send_response(rd_kafka_mock_connection_t *mconn, - rd_kafka_buf_t *resp) { - - if (resp->rkbuf_flags & RD_KAFKA_OP_F_FLEXVER) { + if (!tags_written && (resp->rkbuf_flags & RD_KAFKA_OP_F_FLEXVER)) { /* Empty struct tags */ rd_kafka_buf_write_i8(resp, 0); } @@ -970,7 +1133,7 @@ rd_kafka_mock_connection_read_request(rd_kafka_mock_connection_t *mconn, RD_KAFKAP_REQHDR_SIZE); /* For convenience, shave off the ClientId */ - rd_kafka_buf_skip_str(rkbuf); + rd_kafka_buf_skip_str_no_flexver(rkbuf); /* And the flexible versions header tags, if any */ rd_kafka_buf_skip_tags(rkbuf); @@ -1057,6 +1220,15 @@ rd_kafka_mock_connection_parse_request(rd_kafka_mock_connection_t *mconn, return -1; } + mtx_lock(&mcluster->lock); + if (mcluster->track_requests) { + rd_list_add(&mcluster->request_list, + rd_kafka_mock_request_new( + mconn->broker->id, rkbuf->rkbuf_reqhdr.ApiKey, + rd_clock())); + } + mtx_unlock(&mcluster->lock); + rd_kafka_dbg(rk, MOCK, "MOCK", "Broker %" PRId32 ": Received %sRequestV%hd from %s", mconn->broker->id, @@ -1416,6 +1588,11 @@ static void rd_kafka_mock_broker_destroy(rd_kafka_mock_broker_t *mrkb) { rd_kafka_mock_error_stack_destroy(errstack); } + if (mrkb->rack) + rd_free(mrkb->rack); + + rd_kafka_mock_coord_remove(mrkb->cluster, mrkb->id); + TAILQ_REMOVE(&mrkb->cluster->brokers, mrkb, link); mrkb->cluster->broker_cnt--; @@ -1423,6 +1600,30 @@ static void rd_kafka_mock_broker_destroy(rd_kafka_mock_broker_t *mrkb) { } +rd_kafka_resp_err_t +rd_kafka_mock_broker_decommission(rd_kafka_mock_cluster_t *mcluster, + int32_t broker_id) { + rd_kafka_op_t *rko = rd_kafka_op_new(RD_KAFKA_OP_MOCK); + + rko->rko_u.mock.broker_id = broker_id; + rko->rko_u.mock.cmd = RD_KAFKA_MOCK_CMD_BROKER_DECOMMISSION; + + return rd_kafka_op_err_destroy( + rd_kafka_op_req(mcluster->ops, rko, RD_POLL_INFINITE)); +} + +rd_kafka_resp_err_t rd_kafka_mock_broker_add(rd_kafka_mock_cluster_t *mcluster, + int32_t broker_id) { + rd_kafka_op_t *rko = rd_kafka_op_new(RD_KAFKA_OP_MOCK); + + rko->rko_u.mock.broker_id = broker_id; + rko->rko_u.mock.cmd = RD_KAFKA_MOCK_CMD_BROKER_ADD; + + return rd_kafka_op_err_destroy( + rd_kafka_op_req(mcluster->ops, rko, RD_POLL_INFINITE)); +} + + /** * @brief Starts listening on the mock broker socket. 
* @@ -1516,16 +1717,28 @@ static int rd_kafka_mock_broker_new_listener(rd_kafka_mock_cluster_t *mcluster, static rd_kafka_mock_broker_t * -rd_kafka_mock_broker_new(rd_kafka_mock_cluster_t *mcluster, int32_t broker_id) { +rd_kafka_mock_broker_new(rd_kafka_mock_cluster_t *mcluster, + int32_t broker_id, + rd_kafka_resp_err_t *err) { rd_kafka_mock_broker_t *mrkb; rd_socket_t listen_s; struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr = {.s_addr = htonl(INADDR_LOOPBACK)}}; - listen_s = rd_kafka_mock_broker_new_listener(mcluster, &sin); - if (listen_s == -1) + if (rd_kafka_mock_broker_find(mcluster, broker_id)) { + if (err) + *err = RD_KAFKA_RESP_ERR__INVALID_ARG; + /* A broker with this id already exists. */ return NULL; + } + + listen_s = rd_kafka_mock_broker_new_listener(mcluster, &sin); + if (listen_s == -1) { + if (err) + *err = RD_KAFKA_RESP_ERR__TRANSPORT; + return NULL; + } /* * Create mock broker object @@ -1550,6 +1763,8 @@ rd_kafka_mock_broker_new(rd_kafka_mock_cluster_t *mcluster, int32_t broker_id) { if (rd_kafka_mock_broker_start_listener(mrkb) == -1) { rd_kafka_mock_broker_destroy(mrkb); + if (err) + *err = RD_KAFKA_RESP_ERR__TRANSPORT; return NULL; } @@ -1658,6 +1873,20 @@ rd_kafka_mock_coord_set(rd_kafka_mock_cluster_t *mcluster, return mcoord; } +/** + * @brief Remove coordinator by broker id. + */ +void rd_kafka_mock_coord_remove(rd_kafka_mock_cluster_t *mcluster, + int32_t broker_id) { + rd_kafka_mock_coord_t *mcoord, *tmp; + + TAILQ_FOREACH_SAFE(mcoord, &mcluster->coords, link, tmp) { + if (mcoord->broker_id == broker_id) { + rd_kafka_mock_coord_destroy(mcluster, mcoord); + } + } +} + /** * @brief Remove and return the next error, or RD_KAFKA_RESP_ERR_NO_ERROR @@ -1986,6 +2215,23 @@ rd_kafka_mock_partition_set_follower_wmarks(rd_kafka_mock_cluster_t *mcluster, rd_kafka_op_req(mcluster->ops, rko, RD_POLL_INFINITE)); } +rd_kafka_resp_err_t +rd_kafka_mock_partition_push_leader_response(rd_kafka_mock_cluster_t *mcluster, + const char *topic, + int partition, + int32_t leader_id, + int32_t leader_epoch) { + rd_kafka_op_t *rko = rd_kafka_op_new(RD_KAFKA_OP_MOCK); + rko->rko_u.mock.name = rd_strdup(topic); + rko->rko_u.mock.cmd = RD_KAFKA_MOCK_CMD_PART_PUSH_LEADER_RESPONSE; + rko->rko_u.mock.partition = partition; + rko->rko_u.mock.leader_id = leader_id; + rko->rko_u.mock.leader_epoch = leader_epoch; + + return rd_kafka_op_err_destroy( + rd_kafka_op_req(mcluster->ops, rko, RD_POLL_INFINITE)); +} + rd_kafka_resp_err_t rd_kafka_mock_broker_set_down(rd_kafka_mock_cluster_t *mcluster, int32_t broker_id) { @@ -2040,6 +2286,31 @@ rd_kafka_mock_broker_set_rack(rd_kafka_mock_cluster_t *mcluster, rd_kafka_op_req(mcluster->ops, rko, RD_POLL_INFINITE)); } +void rd_kafka_mock_broker_set_host_port(rd_kafka_mock_cluster_t *cluster, + int32_t broker_id, + const char *host, + int port) { + rd_kafka_mock_broker_t *mrkb; + + mtx_lock(&cluster->lock); + TAILQ_FOREACH(mrkb, &cluster->brokers, link) { + if (mrkb->id == broker_id) { + rd_kafka_dbg( + cluster->rk, MOCK, "MOCK", + "Broker %" PRId32 + ": Setting advertised listener from %s:%d to %s:%d", + broker_id, mrkb->advertised_listener, mrkb->port, + host, port); + rd_snprintf(mrkb->advertised_listener, + sizeof(mrkb->advertised_listener), "%s", + host); + mrkb->port = port; + break; + } + } + mtx_unlock(&cluster->lock); +} + rd_kafka_resp_err_t rd_kafka_mock_coordinator_set(rd_kafka_mock_cluster_t *mcluster, const char *key_type, @@ -2072,6 +2343,39 @@ rd_kafka_mock_set_apiversion(rd_kafka_mock_cluster_t *mcluster, 
rd_kafka_op_req(mcluster->ops, rko, RD_POLL_INFINITE)); } +rd_kafka_resp_err_t +rd_kafka_mock_telemetry_set_requested_metrics(rd_kafka_mock_cluster_t *mcluster, + char **metrics, + size_t metrics_cnt) { + rd_kafka_op_t *rko = rd_kafka_op_new(RD_KAFKA_OP_MOCK); + + rko->rko_u.mock.hi = metrics_cnt; + rko->rko_u.mock.metrics = NULL; + if (metrics_cnt) { + size_t i; + rko->rko_u.mock.metrics = + rd_calloc(metrics_cnt, sizeof(char *)); + for (i = 0; i < metrics_cnt; i++) + rko->rko_u.mock.metrics[i] = rd_strdup(metrics[i]); + } + rko->rko_u.mock.cmd = RD_KAFKA_MOCK_CMD_REQUESTED_METRICS_SET; + + return rd_kafka_op_err_destroy( + rd_kafka_op_req(mcluster->ops, rko, RD_POLL_INFINITE)); +} + +rd_kafka_resp_err_t +rd_kafka_mock_telemetry_set_push_interval(rd_kafka_mock_cluster_t *mcluster, + int64_t push_interval_ms) { + rd_kafka_op_t *rko = rd_kafka_op_new(RD_KAFKA_OP_MOCK); + + rko->rko_u.mock.hi = push_interval_ms; + rko->rko_u.mock.cmd = RD_KAFKA_MOCK_CMD_TELEMETRY_PUSH_INTERVAL_SET; + + return rd_kafka_op_err_destroy( + rd_kafka_op_req(mcluster->ops, rko, RD_POLL_INFINITE)); +} + /** * @brief Apply command to specific broker. @@ -2128,6 +2432,11 @@ rd_kafka_mock_broker_cmd(rd_kafka_mock_cluster_t *mcluster, mrkb->rack = NULL; break; + case RD_KAFKA_MOCK_CMD_BROKER_DECOMMISSION: + rd_kafka_mock_broker_destroy(mrkb); + rd_kafka_mock_cluster_reassign_partitions(mcluster); + break; + default: RD_BUG("Unhandled mock cmd %d", rko->rko_u.mock.cmd); break; @@ -2181,6 +2490,8 @@ rd_kafka_mock_cluster_cmd(rd_kafka_mock_cluster_t *mcluster, rd_kafka_mock_topic_t *mtopic; rd_kafka_mock_partition_t *mpart; rd_kafka_mock_broker_t *mrkb; + size_t i; + rd_kafka_resp_err_t err; switch (rko->rko_u.mock.cmd) { case RD_KAFKA_MOCK_CMD_TOPIC_CREATE: @@ -2269,13 +2580,38 @@ rd_kafka_mock_cluster_cmd(rd_kafka_mock_cluster_t *mcluster, mpart->update_follower_end_offset = rd_false; } break; + case RD_KAFKA_MOCK_CMD_PART_PUSH_LEADER_RESPONSE: + mpart = rd_kafka_mock_partition_get( + mcluster, rko->rko_u.mock.name, rko->rko_u.mock.partition); + if (!mpart) + return RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART; + + rd_kafka_dbg(mcluster->rk, MOCK, "MOCK", + "Push %s [%" PRId32 "] leader response: (%" PRId32 + ", %" PRId32 ")", + rko->rko_u.mock.name, rko->rko_u.mock.partition, + rko->rko_u.mock.leader_id, + rko->rko_u.mock.leader_epoch); + + rd_kafka_mock_partition_push_leader_response0( + mpart, rko->rko_u.mock.leader_id, + rko->rko_u.mock.leader_epoch); + break; /* Broker commands */ case RD_KAFKA_MOCK_CMD_BROKER_SET_UPDOWN: case RD_KAFKA_MOCK_CMD_BROKER_SET_RTT: case RD_KAFKA_MOCK_CMD_BROKER_SET_RACK: + case RD_KAFKA_MOCK_CMD_BROKER_DECOMMISSION: return rd_kafka_mock_brokers_cmd(mcluster, rko); + case RD_KAFKA_MOCK_CMD_BROKER_ADD: + if (!rd_kafka_mock_broker_new(mcluster, + rko->rko_u.mock.broker_id, &err)) + return err; + + rd_kafka_mock_cluster_reassign_partitions(mcluster); + break; case RD_KAFKA_MOCK_CMD_COORD_SET: if (!rd_kafka_mock_coord_set(mcluster, rko->rko_u.mock.name, rko->rko_u.mock.str, @@ -2294,6 +2630,22 @@ rd_kafka_mock_cluster_cmd(rd_kafka_mock_cluster_t *mcluster, .MaxVersion = (int16_t)rko->rko_u.mock.hi; break; + case RD_KAFKA_MOCK_CMD_REQUESTED_METRICS_SET: + mcluster->metrics_cnt = rko->rko_u.mock.hi; + if (!mcluster->metrics_cnt) + break; + + mcluster->metrics = + rd_calloc(mcluster->metrics_cnt, sizeof(char *)); + for (i = 0; i < mcluster->metrics_cnt; i++) + mcluster->metrics[i] = + rd_strdup(rko->rko_u.mock.metrics[i]); + break; + + case RD_KAFKA_MOCK_CMD_TELEMETRY_PUSH_INTERVAL_SET: + 
mcluster->telemetry_push_interval_ms = rko->rko_u.mock.hi; + break; + default: rd_assert(!*"unknown mock cmd"); break; @@ -2302,6 +2654,14 @@ rd_kafka_mock_cluster_cmd(rd_kafka_mock_cluster_t *mcluster, return RD_KAFKA_RESP_ERR_NO_ERROR; } +void rd_kafka_mock_group_initial_rebalance_delay_ms( + rd_kafka_mock_cluster_t *mcluster, + int32_t delay_ms) { + mtx_lock(&mcluster->lock); + mcluster->defaults.group_initial_rebalance_delay_ms = delay_ms; + mtx_unlock(&mcluster->lock); +} + static rd_kafka_op_res_t rd_kafka_mock_cluster_op_serve(rd_kafka_t *rk, @@ -2338,11 +2698,13 @@ rd_kafka_mock_cluster_op_serve(rd_kafka_t *rk, static void rd_kafka_mock_cluster_destroy0(rd_kafka_mock_cluster_t *mcluster) { rd_kafka_mock_topic_t *mtopic; rd_kafka_mock_broker_t *mrkb; - rd_kafka_mock_cgrp_t *mcgrp; + rd_kafka_mock_cgrp_classic_t *mcgrp_classic; + rd_kafka_mock_cgrp_consumer_t *mcgrp_consumer; rd_kafka_mock_coord_t *mcoord; rd_kafka_mock_error_stack_t *errstack; thrd_t dummy_rkb_thread; int ret; + size_t i; while ((mtopic = TAILQ_FIRST(&mcluster->topics))) rd_kafka_mock_topic_destroy(mtopic); @@ -2350,8 +2712,11 @@ static void rd_kafka_mock_cluster_destroy0(rd_kafka_mock_cluster_t *mcluster) { while ((mrkb = TAILQ_FIRST(&mcluster->brokers))) rd_kafka_mock_broker_destroy(mrkb); - while ((mcgrp = TAILQ_FIRST(&mcluster->cgrps))) - rd_kafka_mock_cgrp_destroy(mcgrp); + while ((mcgrp_classic = TAILQ_FIRST(&mcluster->cgrps_classic))) + rd_kafka_mock_cgrp_classic_destroy(mcgrp_classic); + + while ((mcgrp_consumer = TAILQ_FIRST(&mcluster->cgrps_consumer))) + rd_kafka_mock_cgrp_consumer_destroy(mcgrp_consumer); while ((mcoord = TAILQ_FIRST(&mcluster->coords))) rd_kafka_mock_coord_destroy(mcluster, mcoord); @@ -2363,15 +2728,17 @@ static void rd_kafka_mock_cluster_destroy0(rd_kafka_mock_cluster_t *mcluster) { rd_kafka_mock_error_stack_destroy(errstack); } - /* - * Destroy dummy broker - */ - rd_kafka_q_enq(mcluster->dummy_rkb->rkb_ops, - rd_kafka_op_new(RD_KAFKA_OP_TERMINATE)); + rd_list_destroy(&mcluster->request_list); dummy_rkb_thread = mcluster->dummy_rkb->rkb_thread; - rd_kafka_broker_destroy(mcluster->dummy_rkb); + /* + * Destroy dummy broker. + * WARNING: This is the last time we can read + * from dummy_rkb in this thread!
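The reordered teardown above is deliberate: once the TERMINATE op is enqueued, the broker thread may drop the last reference and free the object, so the thread handle has to be copied out while the pointer is still safe to read, and joined afterwards. A generic sketch of that shutdown ordering (the worker_* names are illustrative):

    #include <threads.h>

    struct worker { thrd_t thread; /* ops queue, refcount, ... */ };

    static void worker_signal_terminate(struct worker *w) {
            (void)w; /* hypothetical: after this call the worker thread
                      * may release the last reference and free *w */
    }

    static void worker_shutdown(struct worker *w) {
            thrd_t t = w->thread; /* last safe read of *w in this thread */
            worker_signal_terminate(w);
            thrd_join(t, NULL);
    }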
+ */ + rd_kafka_q_enq(mcluster->dummy_rkb->rkb_ops, + rd_kafka_op_new(RD_KAFKA_OP_TERMINATE)); if (thrd_join(dummy_rkb_thread, &ret) != thrd_success) rd_assert(!*"failed to join mock dummy broker thread"); @@ -2392,6 +2759,13 @@ static void rd_kafka_mock_cluster_destroy0(rd_kafka_mock_cluster_t *mcluster) { rd_socket_close(mcluster->wakeup_fds[0]); rd_socket_close(mcluster->wakeup_fds[1]); + + if (mcluster->metrics) { + for (i = 0; i < mcluster->metrics_cnt; i++) { + rd_free(mcluster->metrics[i]); + } + rd_free(mcluster->metrics); + } } @@ -2438,7 +2812,7 @@ rd_kafka_mock_cluster_t *rd_kafka_mock_cluster_new(rd_kafka_t *rk, TAILQ_INIT(&mcluster->brokers); for (i = 1; i <= broker_cnt; i++) { - if (!(mrkb = rd_kafka_mock_broker_new(mcluster, i))) { + if (!(mrkb = rd_kafka_mock_broker_new(mcluster, i, NULL))) { rd_kafka_mock_cluster_destroy(mcluster); return NULL; } @@ -2452,8 +2826,14 @@ rd_kafka_mock_cluster_t *rd_kafka_mock_cluster_new(rd_kafka_t *rk, TAILQ_INIT(&mcluster->topics); mcluster->defaults.partition_cnt = 4; mcluster->defaults.replication_factor = RD_MIN(3, broker_cnt); + mcluster->defaults.group_initial_rebalance_delay_ms = 3000; + mcluster->track_requests = rd_false; + mcluster->defaults.group_consumer_session_timeout_ms = 30000; + mcluster->defaults.group_consumer_heartbeat_interval_ms = 3000; - TAILQ_INIT(&mcluster->cgrps); + TAILQ_INIT(&mcluster->cgrps_classic); + + TAILQ_INIT(&mcluster->cgrps_consumer); TAILQ_INIT(&mcluster->coords); @@ -2464,6 +2844,8 @@ rd_kafka_mock_cluster_t *rd_kafka_mock_cluster_new(rd_kafka_t *rk, memcpy(mcluster->api_handlers, rd_kafka_mock_api_handlers, sizeof(mcluster->api_handlers)); + rd_list_init(&mcluster->request_list, 0, rd_kafka_mock_request_free); + /* Use an op queue for controlling the cluster in * a thread-safe manner without locking. */ mcluster->ops = rd_kafka_q_new(rk); @@ -2529,3 +2911,745 @@ const char * rd_kafka_mock_cluster_bootstraps(const rd_kafka_mock_cluster_t *mcluster) { return mcluster->bootstraps; } + +/** + * @struct Represents a request to the mock cluster along with a timestamp. 
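The request-tracking helpers declared earlier and defined below are meant to be driven from tests: enable tracking, generate the client traffic of interest, snapshot the requests, then release the snapshot. A hypothetical usage fragment (an existing mcluster and prior client activity are assumed; includes and error handling omitted):

    size_t cnt, i;
    rd_kafka_mock_request_t **reqs;

    rd_kafka_mock_start_request_tracking(mcluster);
    /* ... drive producer/consumer traffic against the cluster ... */
    reqs = rd_kafka_mock_get_requests(mcluster, &cnt);
    for (i = 0; i < cnt; i++)
            printf("broker %d got ApiKey %d\n",
                   (int)rd_kafka_mock_request_id(reqs[i]),
                   (int)rd_kafka_mock_request_api_key(reqs[i]));
    if (reqs)
            rd_kafka_mock_request_destroy_array(reqs, cnt);
    rd_kafka_mock_stop_request_tracking(mcluster);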
+ */ +struct rd_kafka_mock_request_s { + int32_t id; /**< Broker id */ + int16_t api_key; /**< API Key of request */ + rd_ts_t timestamp /**< Timestamp at which request was received */; +}; + +/** + * @brief Allocate and initialize a rd_kafka_mock_request_t * + */ +static rd_kafka_mock_request_t * +rd_kafka_mock_request_new(int32_t id, int16_t api_key, int64_t timestamp_us) { + rd_kafka_mock_request_t *request; + request = rd_malloc(sizeof(*request)); + request->id = id; + request->api_key = api_key; + request->timestamp = timestamp_us; + return request; +} + +static rd_kafka_mock_request_t * +rd_kafka_mock_request_copy(rd_kafka_mock_request_t *mrequest) { + rd_kafka_mock_request_t *request; + request = rd_malloc(sizeof(*request)); + request->id = mrequest->id; + request->api_key = mrequest->api_key; + request->timestamp = mrequest->timestamp; + return request; +} + +void rd_kafka_mock_request_destroy(rd_kafka_mock_request_t *mrequest) { + rd_free(mrequest); +} + +void rd_kafka_mock_request_destroy_array(rd_kafka_mock_request_t **mrequests, + size_t mrequest_cnt) { + size_t i; + for (i = 0; i < mrequest_cnt; i++) + rd_kafka_mock_request_destroy(mrequests[i]); + rd_free(mrequests); +} + +static void rd_kafka_mock_request_free(void *element) { + rd_kafka_mock_request_destroy(element); +} + +void rd_kafka_mock_start_request_tracking(rd_kafka_mock_cluster_t *mcluster) { + mtx_lock(&mcluster->lock); + mcluster->track_requests = rd_true; + rd_list_clear(&mcluster->request_list); + mtx_unlock(&mcluster->lock); +} + +void rd_kafka_mock_stop_request_tracking(rd_kafka_mock_cluster_t *mcluster) { + mtx_lock(&mcluster->lock); + mcluster->track_requests = rd_false; + rd_list_clear(&mcluster->request_list); + mtx_unlock(&mcluster->lock); +} + +rd_kafka_mock_request_t ** +rd_kafka_mock_get_requests(rd_kafka_mock_cluster_t *mcluster, size_t *cntp) { + size_t i; + rd_kafka_mock_request_t **ret = NULL; + + mtx_lock(&mcluster->lock); + *cntp = rd_list_cnt(&mcluster->request_list); + if (*cntp > 0) { + ret = rd_calloc(*cntp, sizeof(rd_kafka_mock_request_t *)); + for (i = 0; i < *cntp; i++) { + rd_kafka_mock_request_t *mreq = + rd_list_elem(&mcluster->request_list, i); + ret[i] = rd_kafka_mock_request_copy(mreq); + } + } + + mtx_unlock(&mcluster->lock); + return ret; +} + +void rd_kafka_mock_clear_requests(rd_kafka_mock_cluster_t *mcluster) { + mtx_lock(&mcluster->lock); + rd_list_clear(&mcluster->request_list); + mtx_unlock(&mcluster->lock); +} + +int32_t rd_kafka_mock_request_id(rd_kafka_mock_request_t *mreq) { + return mreq->id; +} + +int16_t rd_kafka_mock_request_api_key(rd_kafka_mock_request_t *mreq) { + return mreq->api_key; +} + +rd_ts_t rd_kafka_mock_request_timestamp(rd_kafka_mock_request_t *mreq) { + return mreq->timestamp; +} + +/* Unit tests */ + +/** + * @brief Create a topic-partition list with vararg arguments. + * + * @param cnt Number of topic-partitions. + * @param ...vararg is a tuple of: + * const char *topic_name + * int32_t partition + * + * @remark The returned pointer ownership is transferred to the caller. + */ +static rd_kafka_topic_partition_list_t *ut_topic_partitions(int cnt, ...) 
{ + va_list ap; + const char *topic_name; + int i = 0; + + rd_kafka_topic_partition_list_t *rktparlist = + rd_kafka_topic_partition_list_new(cnt); + va_start(ap, cnt); + while (i < cnt) { + topic_name = va_arg(ap, const char *); + int32_t partition = va_arg(ap, int32_t); + + rd_kafka_topic_partition_list_add(rktparlist, topic_name, + partition); + i++; + } + va_end(ap); + + return rktparlist; +} + +/** + * @brief Assert \p expected partition list is equal to \p actual. + * + * @param expected Expected partition list. + * @param actual Actual partition list. + * @return Comparison result. + */ +static int ut_assert_topic_partitions(rd_kafka_topic_partition_list_t *expected, + rd_kafka_topic_partition_list_t *actual) { + rd_bool_t equal; + char expected_str[256] = ""; + char actual_str[256] = ""; + + if (expected) + RD_UT_ASSERT(actual, "list should be not-NULL, but it's NULL"); + else + RD_UT_ASSERT(!actual, "list should be NULL, but it's not-NULL"); + + + if (!expected) + return 0; + + equal = !rd_kafka_topic_partition_list_cmp( + actual, expected, rd_kafka_topic_partition_cmp); + + if (!equal) { + rd_kafka_topic_partition_list_str(expected, expected_str, + sizeof(expected_str), + RD_KAFKA_FMT_F_NO_ERR); + rd_kafka_topic_partition_list_str(actual, actual_str, + sizeof(actual_str), + RD_KAFKA_FMT_F_NO_ERR); + } + + RD_UT_ASSERT(equal, "list should be equal. Expected: %s, got: %s", + expected_str, actual_str); + return 0; +} + +/** + * @struct Fixture used for testing next assignment calculation. + */ +struct cgrp_consumer_member_next_assignment_fixture { + /** Current member epoch (after calling next assignment). */ + int32_t current_member_epoch; + /** Current consumer assignment, if changed. */ + rd_kafka_topic_partition_list_t *current_assignment; + /** Returned assignment, if expected. */ + rd_kafka_topic_partition_list_t *returned_assignment; + /** Target assignment, if changed. */ + rd_kafka_topic_partition_list_t *target_assignment; + /** Should simulate a disconnection and reconnection. */ + rd_bool_t reconnected; + /** Should simulate a session timeout. */ + rd_bool_t session_timed_out; + /** Comment to log. */ + const char *comment; +}; + +/** + * @brief Test next assignment calculation using passed \p fixtures, + * on a new cluster with a topic named \p topic and + * \p partitions partitions. + * + * @param topic Topic name to create. + * @param partitions Number of partitions in the topic. + * @param fixtures Array of fixtures for this test. + * @param fixtures_cnt Number of elements in \p fixtures. + * @return Number of occurred errors.
+ */ +static int ut_cgrp_consumer_member_next_assignment0( + const char *topic, + int partitions, + struct cgrp_consumer_member_next_assignment_fixture *fixtures, + size_t fixtures_cnt) { + int failures = 0; + int32_t current_member_epoch = 0; + size_t i; + rd_kafka_t *rk; + rd_kafka_mock_cluster_t *mcluster; + static rd_kafka_mock_topic_t *mtopic; + rd_kafka_mock_cgrp_consumer_t *mcgrp; + rd_kafka_mock_cgrp_consumer_member_t *member; + char errstr[512]; + rd_kafkap_str_t GroupId = {.str = "group", .len = 5}; + rd_kafkap_str_t MemberId = {.str = "A", .len = 1}; + rd_kafkap_str_t InstanceId = {.len = -1}; + rd_kafkap_str_t SubscribedTopic = {.str = topic, .len = strlen(topic)}; + rd_kafkap_str_t SubscribedTopicRegex = RD_KAFKAP_STR_INITIALIZER_EMPTY; + struct rd_kafka_mock_connection_s *conn = + (struct rd_kafka_mock_connection_s + *)1; /* fake connection instance */ + + rk = rd_kafka_new(RD_KAFKA_CONSUMER, NULL, errstr, sizeof(errstr)); + mcluster = rd_kafka_mock_cluster_new(rk, 1); + mcgrp = rd_kafka_mock_cgrp_consumer_get(mcluster, &GroupId); + member = rd_kafka_mock_cgrp_consumer_member_add( + mcgrp, conn, &MemberId, &InstanceId, &SubscribedTopic, 1, + &SubscribedTopicRegex); + mtopic = rd_kafka_mock_topic_new(mcluster, topic, partitions, 1); + + for (i = 0; i < fixtures_cnt; i++) { + int j; + rd_kafka_topic_partition_list_t *current_assignment, + *member_target_assignment, *next_assignment, + *returned_assignment; + + RD_UT_SAY("test fixture %" PRIusz ": %s", i, + fixtures[i].comment); + + if (fixtures[i].session_timed_out) { + rd_kafka_mock_cgrp_consumer_member_leave(mcgrp, member, + rd_false); + member = rd_kafka_mock_cgrp_consumer_member_add( + mcgrp, conn, &MemberId, &InstanceId, + &SubscribedTopic, 1, &SubscribedTopicRegex); + } + + if (fixtures[i].reconnected) { + rd_kafka_mock_cgrps_connection_closed(mcluster, conn); + conn++; + member = rd_kafka_mock_cgrp_consumer_member_add( + mcgrp, conn, &MemberId, &InstanceId, + &SubscribedTopic, 1, &SubscribedTopicRegex); + } + + member_target_assignment = fixtures[i].target_assignment; + if (member_target_assignment) { + rd_kafka_mock_cgrp_consumer_target_assignment_t + *target_assignment; + + target_assignment = + rd_kafka_mock_cgrp_consumer_target_assignment_new( + (char **)&MemberId.str, 1, + &member_target_assignment); + + rd_kafka_mock_cgrp_consumer_target_assignment( + mcluster, GroupId.str, target_assignment); + rd_kafka_mock_cgrp_consumer_target_assignment_destroy( + target_assignment); + rd_kafka_topic_partition_list_destroy( + member_target_assignment); + } + + current_assignment = fixtures[i].current_assignment; + if (current_assignment) { + /* Set topic id */ + for (j = 0; j < current_assignment->cnt; j++) { + rd_kafka_topic_partition_set_topic_id( + ¤t_assignment->elems[j], mtopic->id); + } + } + + next_assignment = + rd_kafka_mock_cgrp_consumer_member_next_assignment( + member, current_assignment, ¤t_member_epoch); + RD_IF_FREE(current_assignment, + rd_kafka_topic_partition_list_destroy); + RD_UT_ASSERT( + current_member_epoch == fixtures[i].current_member_epoch, + "current member epoch after call. 
Expected: %" PRId32 + ", got: %" PRId32, + fixtures[i].current_member_epoch, current_member_epoch); + + returned_assignment = fixtures[i].returned_assignment; + failures += ut_assert_topic_partitions(returned_assignment, + next_assignment); + + RD_IF_FREE(next_assignment, + rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(returned_assignment, + rd_kafka_topic_partition_list_destroy); + } + + rd_kafka_mock_cluster_destroy(mcluster); + rd_kafka_destroy(rk); + return failures; +} + +/** + * @brief Test case where multiple revocations are acked. + * Only when they're acked member epoch is bumped + * and a new partition is returned to the member. + * + * @return Number of occurred errors. + */ +static int ut_cgrp_consumer_member_next_assignment1(void) { + RD_UT_SAY("Case 1: multiple revocations acked"); + + const char *topic = "topic"; + struct cgrp_consumer_member_next_assignment_fixture fixtures[] = { + { + .comment = "Target+Returned assignment 0,1,2. Epoch 0 -> 3", + .current_member_epoch = 3, + .current_assignment = NULL, + .target_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + .returned_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + }, + { + .comment = "Current assignment empty", + .current_member_epoch = 3, + .current_assignment = ut_topic_partitions(0), + .returned_assignment = NULL, + }, + { + .comment = "Empty heartbeat", + .current_member_epoch = 3, + .current_assignment = NULL, + .returned_assignment = NULL, + }, + { + .comment = "Current assignment 0", + .current_member_epoch = 3, + .current_assignment = ut_topic_partitions(1, topic, 0), + .returned_assignment = NULL, + }, + { + .comment = "Empty heartbeat", + .current_member_epoch = 3, + .current_assignment = NULL, + .returned_assignment = NULL, + }, + { + .comment = "Current assignment 0,1", + .current_member_epoch = 3, + .current_assignment = + ut_topic_partitions(2, topic, 0, topic, 1), + .returned_assignment = NULL, + }, + { + .comment = "Empty heartbeat", + .current_member_epoch = 3, + .current_assignment = NULL, + .returned_assignment = NULL, + }, + { + .comment = "Current assignment 0,1,2", + .current_member_epoch = 3, + .current_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + .returned_assignment = NULL, + }, + { + .comment = "Target assignment 0,1,3. Returned assignment 0,1", + .current_member_epoch = 3, + .target_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 3), + .current_assignment = NULL, + .returned_assignment = + ut_topic_partitions(2, topic, 0, topic, 1), + }, + { + .comment = "Target assignment 0,3. Returned assignment 0", + .current_member_epoch = 3, + .target_assignment = ut_topic_partitions(2, topic, 0, topic, 3), + .current_assignment = NULL, + .returned_assignment = ut_topic_partitions(1, topic, 0), + }, + { + .comment = "Empty heartbeat", + .current_member_epoch = 3, + .current_assignment = NULL, + .returned_assignment = NULL, + }, + { + .comment = "Current assignment 0,1", + .current_member_epoch = 3, + .current_assignment = + ut_topic_partitions(2, topic, 0, topic, 1), + .returned_assignment = NULL, + }, + { + .comment = "Empty heartbeat", + .current_member_epoch = 3, + .current_assignment = NULL, + .returned_assignment = NULL, + }, + { + .comment = "Current assignment 0. Returned assignment 0,3. 
" + "Epoch 3 -> 5", + .current_member_epoch = 5, + .current_assignment = ut_topic_partitions(1, topic, 0), + .returned_assignment = + ut_topic_partitions(2, topic, 0, topic, 3), + }, + { + .comment = "Empty heartbeat", + .current_member_epoch = 5, + .current_assignment = NULL, + .returned_assignment = NULL, + }, + { + .comment = "Current assignment 0,3", + .current_member_epoch = 5, + .current_assignment = + ut_topic_partitions(2, topic, 0, topic, 3), + .returned_assignment = NULL, + }, + }; + return ut_cgrp_consumer_member_next_assignment0( + topic, 4, fixtures, RD_ARRAY_SIZE(fixtures)); +} + +/** + * @brief Test case where multiple revocations happen. + * Only the first revocation is acked and after that + * there's a reassignment and epoch bump. + * + * @return Number of occurred errors. + */ +static int ut_cgrp_consumer_member_next_assignment2(void) { + RD_UT_SAY( + "Case 2: reassignment of revoked partition, partial revocation " + "acknowledge"); + + const char *topic = "topic"; + struct cgrp_consumer_member_next_assignment_fixture fixtures[] = { + { + .comment = "Target+Returned assignment 0,1,2. Epoch 0 -> 3", + .current_member_epoch = 3, + .current_assignment = NULL, + .target_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + .returned_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + }, + { + .comment = "Current assignment 0,1,2", + .current_member_epoch = 3, + .current_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + .returned_assignment = NULL, + }, + { + .comment = "Target assignment 0,1,3. Returned assignment 0,1", + .current_member_epoch = 3, + .target_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 3), + .current_assignment = NULL, + .returned_assignment = + ut_topic_partitions(2, topic, 0, topic, 1), + }, + { + .comment = "Target assignment 0,3. Returned assignment 0", + .current_member_epoch = 3, + .target_assignment = ut_topic_partitions(2, topic, 0, topic, 3), + .current_assignment = NULL, + .returned_assignment = ut_topic_partitions(1, topic, 0), + }, + { + .comment = "Empty heartbeat", + .current_member_epoch = 3, + .current_assignment = NULL, + .returned_assignment = NULL, + }, + { + .comment = "Current assignment 0,1", + .current_member_epoch = 3, + .current_assignment = + ut_topic_partitions(2, topic, 0, topic, 1), + .returned_assignment = NULL, + }, + { + .comment = "Empty heartbeat", + .current_member_epoch = 3, + .current_assignment = NULL, + .returned_assignment = NULL, + }, + { + .comment = "Target+Returned assignment 0,1,3. Epoch 3 -> 6", + .current_member_epoch = 6, + .target_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 3), + .current_assignment = NULL, + .returned_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 3), + }, + { + .comment = "Empty heartbeat", + .current_member_epoch = 6, + .current_assignment = NULL, + .returned_assignment = NULL, + }, + { + .comment = "Current assignment 0,1,3", + .current_member_epoch = 6, + .current_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 3), + .returned_assignment = NULL, + }, + }; + return ut_cgrp_consumer_member_next_assignment0( + topic, 4, fixtures, RD_ARRAY_SIZE(fixtures)); +} + +/** + * @brief Test case where multiple revocations happen. + * They aren't acked but then a + * reassignment of all the revoked partition happens, bumping the epoch. + * + * @return Number of occurred errors. 
+ */
+static int ut_cgrp_consumer_member_next_assignment3(void) {
+        RD_UT_SAY(
+            "Case 3: reassignment of revoked partition and new partition, no "
+            "revocation acknowledge");
+
+        const char *topic = "topic";
+        struct cgrp_consumer_member_next_assignment_fixture fixtures[] = {
+            {
+                .comment = "Target+Returned assignment 0,1,2. Epoch 0 -> 3",
+                .current_member_epoch = 3,
+                .current_assignment   = NULL,
+                .target_assignment =
+                    ut_topic_partitions(3, topic, 0, topic, 1, topic, 2),
+                .returned_assignment =
+                    ut_topic_partitions(3, topic, 0, topic, 1, topic, 2),
+            },
+            {
+                .comment              = "Current assignment 0,1,2",
+                .current_member_epoch = 3,
+                .current_assignment =
+                    ut_topic_partitions(3, topic, 0, topic, 1, topic, 2),
+                .returned_assignment = NULL,
+            },
+            {
+                .comment = "Target assignment 0,1,3. Returned assignment 0,1",
+                .current_member_epoch = 3,
+                .target_assignment =
+                    ut_topic_partitions(3, topic, 0, topic, 1, topic, 3),
+                .current_assignment = NULL,
+                .returned_assignment =
+                    ut_topic_partitions(2, topic, 0, topic, 1),
+            },
+            {
+                .comment = "Target assignment 0,3. Returned assignment 0",
+                .current_member_epoch = 3,
+                .target_assignment = ut_topic_partitions(2, topic, 0, topic, 3),
+                .current_assignment  = NULL,
+                .returned_assignment = ut_topic_partitions(1, topic, 0),
+            },
+            {
+                .comment              = "Empty heartbeat",
+                .current_member_epoch = 3,
+                .current_assignment   = NULL,
+                .returned_assignment  = NULL,
+            },
+            {
+                .comment = "Target+Returned assignment 0,1,2,3. Epoch 3 -> 6",
+                .current_member_epoch = 6,
+                .target_assignment    = ut_topic_partitions(
+                    4, topic, 0, topic, 1, topic, 2, topic, 3),
+                .returned_assignment = ut_topic_partitions(
+                    4, topic, 0, topic, 1, topic, 2, topic, 3),
+            },
+            {
+                .comment              = "Empty heartbeat",
+                .current_member_epoch = 6,
+                .current_assignment   = NULL,
+                .returned_assignment  = NULL,
+            },
+            {
+                .comment              = "Current assignment 0,1,2,3",
+                .current_member_epoch = 6,
+                .current_assignment   = ut_topic_partitions(
+                    4, topic, 0, topic, 1, topic, 2, topic, 3),
+                .returned_assignment = NULL,
+            },
+        };
+        return ut_cgrp_consumer_member_next_assignment0(
+            topic, 4, fixtures, RD_ARRAY_SIZE(fixtures));
+}
+
+/**
+ * @brief Test case where a disconnection happens and after that
+ *        the client sends its assignment again, with the same member epoch,
+ *        and receives back the returned assignment, even if it is unchanged.
+ *
+ * @return Number of occurred errors.
+ */
+static int ut_cgrp_consumer_member_next_assignment4(void) {
+        RD_UT_SAY("Case 4: reconciliation after disconnection");
+
+        const char *topic = "topic";
+        struct cgrp_consumer_member_next_assignment_fixture fixtures[] = {
+            {
+                .comment = "Target+Returned assignment 0,1,2. Epoch 0 -> 3",
+                .current_member_epoch = 3,
+                .current_assignment   = NULL,
+                .target_assignment =
+                    ut_topic_partitions(3, topic, 0, topic, 1, topic, 2),
+                .returned_assignment =
+                    ut_topic_partitions(3, topic, 0, topic, 1, topic, 2),
+            },
+            {
+                .comment              = "Current assignment empty",
+                .current_member_epoch = 3,
+                .current_assignment   = ut_topic_partitions(0),
+                .returned_assignment  = NULL,
+            },
+            {
+                .comment = "Disconnected, resends current assignment. 
Returns " + "assignment again", + .reconnected = rd_true, + .current_member_epoch = 3, + .current_assignment = ut_topic_partitions(0), + .returned_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + }, + { + .comment = "Empty heartbeat", + .current_member_epoch = 3, + .current_assignment = NULL, + .returned_assignment = NULL, + }, + { + .comment = "Current assignment 0,1,2", + .current_member_epoch = 3, + .current_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + .returned_assignment = NULL, + }, + }; + return ut_cgrp_consumer_member_next_assignment0( + topic, 3, fixtures, RD_ARRAY_SIZE(fixtures)); +} + +/** + * @brief Test case where a session timeout happens and then + * the client receives a FENCED_MEMBER_EPOCH error, + * revokes all of its partitions and rejoins with epoch 0. + * + * @return Number of occurred errors. + */ +static int ut_cgrp_consumer_member_next_assignment5(void) { + RD_UT_SAY("Case 5: fenced consumer"); + + const char *topic = "topic"; + struct cgrp_consumer_member_next_assignment_fixture fixtures[] = { + { + .comment = "Target+Returned assignment 0,1,2. Epoch 0 -> 3", + .current_member_epoch = 3, + .current_assignment = NULL, + .target_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + .returned_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + }, + { + .comment = "Session times out, receives FENCED_MEMBER_EPOCH. " + "Epoch 3 -> 0", + .session_timed_out = rd_true, + .current_member_epoch = -1, + .current_assignment = NULL, + .returned_assignment = NULL, + }, + { + .comment = "Target+Returned assignment 0,1,2. Epoch 0 -> 6", + .target_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + .current_member_epoch = 4, + .current_assignment = NULL, + .returned_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + }, + { + .comment = "Current assignment 0,1,2", + .current_member_epoch = 4, + .current_assignment = + ut_topic_partitions(3, topic, 0, topic, 1, topic, 2), + .returned_assignment = NULL, + }, + }; + return ut_cgrp_consumer_member_next_assignment0( + topic, 3, fixtures, RD_ARRAY_SIZE(fixtures)); +} + +/** + * @brief Test all next assignment calculation cases, + * for KIP-848 consumer group type and collect + * number of errors. + * + * @return Number of occurred errors. + */ +static int ut_cgrp_consumer_member_next_assignment(void) { + RD_UT_BEGIN(); + int failures = 0; + + failures += ut_cgrp_consumer_member_next_assignment1(); + failures += ut_cgrp_consumer_member_next_assignment2(); + failures += ut_cgrp_consumer_member_next_assignment3(); + failures += ut_cgrp_consumer_member_next_assignment4(); + failures += ut_cgrp_consumer_member_next_assignment5(); + + RD_UT_ASSERT(!failures, "some tests failed"); + RD_UT_PASS(); +} + +/** + * @brief Mock cluster unit tests + */ +int unittest_mock_cluster(void) { + int fails = 0; + fails += ut_cgrp_consumer_member_next_assignment(); + return fails; +} diff --git a/src/third_party/librdkafka/dist/src/rdkafka_mock.h b/src/third_party/librdkafka/dist/src/rdkafka_mock.h index f06efe8fd5e..0b81b312ef8 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_mock.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_mock.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2019-2022 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without
@@ -67,6 +68,7 @@ extern "C" {
 *  - Low-level consumer
 *  - High-level balanced consumer groups with offset commits
 *  - Topic Metadata and auto creation
+ *  - Telemetry (KIP-714)
 *
 * @remark This is an experimental public API that is NOT covered by the
 *         librdkafka API or ABI stability guarantees.
@@ -166,6 +168,15 @@ rd_kafka_mock_push_request_errors_array(rd_kafka_mock_cluster_t *mcluster,
                                         const rd_kafka_resp_err_t *errors);


+/**
+ * @brief Apply broker configuration group.initial.rebalance.delay.ms
+ *        to the whole \p mcluster.
+ */
+RD_EXPORT void rd_kafka_mock_group_initial_rebalance_delay_ms(
+    rd_kafka_mock_cluster_t *mcluster,
+    int32_t delay_ms);
+
+
 /**
 * @brief Push \p cnt errors and RTT tuples in the \p ... va-arg list onto
 *        the broker's error stack for the given \p ApiKey.
@@ -281,6 +292,24 @@ rd_kafka_mock_partition_set_follower_wmarks(rd_kafka_mock_cluster_t *mcluster,
                                             int64_t lo,
                                             int64_t hi);

+/**
+ * @brief Push a Metadata leader response
+ *        onto the cluster's stack for the given \p topic and \p partition.
+ *
+ * @param topic Topic to change
+ * @param partition Partition to change in \p topic
+ * @param leader_id Broker id of the leader node
+ * @param leader_epoch Leader epoch corresponding to the given \p leader_id
+ *
+ * @return Push operation error code
+ */
+RD_EXPORT
+rd_kafka_resp_err_t
+rd_kafka_mock_partition_push_leader_response(rd_kafka_mock_cluster_t *mcluster,
+                                             const char *topic,
+                                             int partition,
+                                             int32_t leader_id,
+                                             int32_t leader_epoch);

 /**
 * @brief Disconnects the broker and disallows any new connections.
@@ -293,6 +322,22 @@ RD_EXPORT rd_kafka_resp_err_t
 rd_kafka_mock_broker_set_down(rd_kafka_mock_cluster_t *mcluster,
                               int32_t broker_id);

+/**
+ * @brief Sets a new \p host and \p port for a given broker identified by
+ *        \p broker_id.
+ *
+ * @param mcluster Mock cluster instance.
+ * @param broker_id The id of the broker to modify.
+ * @param host The new hostname.
+ * @param port The new port.
+ */
+RD_EXPORT void
+rd_kafka_mock_broker_set_host_port(rd_kafka_mock_cluster_t *mcluster,
+                                   int32_t broker_id,
+                                   const char *host,
+                                   int port);
+
+
 /**
 * @brief Makes the broker accept connections again.
 *        This does NOT trigger leader change.
@@ -329,6 +374,33 @@ rd_kafka_mock_broker_set_rack(rd_kafka_mock_cluster_t *mcluster,



+/**
+ * @brief Remove and delete a mock broker from a cluster.
+ *        All partitions assigned to that broker will be
+ *        reassigned to other brokers.
+ *
+ * @param cluster The mock cluster containing the broker
+ * @param broker_id The broker to delete
+ * @returns 0 on success or -1 on error
+ */
+RD_EXPORT rd_kafka_resp_err_t
+rd_kafka_mock_broker_decommission(rd_kafka_mock_cluster_t *cluster,
+                                  int32_t broker_id);
+
+/**
+ * @brief Add a new broker to the cluster.
+ *        Cluster partitions will be reassigned to use the new broker
+ *        as well.
+ *
+ * @param mcluster The mock cluster
+ * @param broker_id The id of the broker to add
+ *
+ * @returns Error value or 0 if no error occurred
+ */
+RD_EXPORT rd_kafka_resp_err_t
+rd_kafka_mock_broker_add(rd_kafka_mock_cluster_t *mcluster, int32_t broker_id);
+
+
 /**
 * @brief Explicitly sets the coordinator. If this API is not used, a standard
 *        hashing scheme will be used.
@@ -364,6 +436,171 @@ rd_kafka_mock_set_apiversion(rd_kafka_mock_cluster_t *mcluster,
                              int16_t MinVersion,
                              int16_t MaxVersion);

+/**
+ * @brief Start tracking RPC requests for this mock cluster.
+ * @sa rd_kafka_mock_get_requests to get the requests.
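+ *
+ * A minimal usage sketch (not part of this header; assumes a test that
+ * already has an \c mcluster and that some client traffic is triggered
+ * between start and get):
+ * @code
+ *   size_t cnt, i;
+ *   rd_kafka_mock_request_t **reqs;
+ *   rd_kafka_mock_start_request_tracking(mcluster);
+ *   /* ... client requests happen here ... */
+ *   reqs = rd_kafka_mock_get_requests(mcluster, &cnt);
+ *   for (i = 0; i < cnt; i++)
+ *           printf("broker %d received ApiKey %d\n",
+ *                  (int)rd_kafka_mock_request_id(reqs[i]),
+ *                  (int)rd_kafka_mock_request_api_key(reqs[i]));
+ *   rd_kafka_mock_request_destroy_array(reqs, cnt);
+ *   rd_kafka_mock_stop_request_tracking(mcluster);
+ * @endcode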
+ */
+RD_EXPORT
+void rd_kafka_mock_start_request_tracking(rd_kafka_mock_cluster_t *mcluster);
+
+/**
+ * @brief Stop tracking RPC requests for this mock cluster.
+ *        Does not clear already tracked requests.
+ */
+RD_EXPORT
+void rd_kafka_mock_stop_request_tracking(rd_kafka_mock_cluster_t *mcluster);
+
+/**
+ * @name Represents a request to the mock cluster along with a timestamp.
+ */
+typedef struct rd_kafka_mock_request_s rd_kafka_mock_request_t;
+
+/**
+ * @brief Destroy a rd_kafka_mock_request_t * and deallocate memory.
+ */
+RD_EXPORT void rd_kafka_mock_request_destroy(rd_kafka_mock_request_t *mreq);
+
+/**
+ * @brief Destroy a rd_kafka_mock_request_t * array and deallocate it.
+ */
+RD_EXPORT void
+rd_kafka_mock_request_destroy_array(rd_kafka_mock_request_t **mreqs,
+                                    size_t mreq_cnt);
+
+/**
+ * @brief Get the broker id to which \p mreq was sent.
+ */
+RD_EXPORT int32_t rd_kafka_mock_request_id(rd_kafka_mock_request_t *mreq);
+
+/**
+ * @brief Get the ApiKey with which \p mreq was sent.
+ */
+RD_EXPORT int16_t rd_kafka_mock_request_api_key(rd_kafka_mock_request_t *mreq);
+
+/**
+ * @brief Get the timestamp in micros at which \p mreq was sent.
+ */
+RD_EXPORT int64_t
+rd_kafka_mock_request_timestamp(rd_kafka_mock_request_t *mreq);
+
+/**
+ * @brief Get the list of requests sent to this mock cluster.
+ *
+ * @param cntp is set to the count of requests.
+ * @return List of rd_kafka_mock_request_t *.
+ * @remark Each element of the returned array must be freed with
+ *         rd_kafka_mock_request_destroy, and the list itself must be freed
+ *         too.
+ */
+RD_EXPORT rd_kafka_mock_request_t **
+rd_kafka_mock_get_requests(rd_kafka_mock_cluster_t *mcluster, size_t *cntp);
+
+/**
+ * @brief Clear the list of requests sent to this mock cluster, in case request
+ *        tracking is/was turned on.
+ */
+RD_EXPORT void rd_kafka_mock_clear_requests(rd_kafka_mock_cluster_t *mcluster);
+
+/**
+ * @brief Set the metrics that are expected by the broker for telemetry
+ *        collection.
+ *
+ * @param metrics List of prefixes of metric names or NULL.
+ * @param metrics_cnt Number of metrics in \p metrics.
+ *
+ * @note If \p metrics is NULL, no metrics will be expected by the broker. If
+ * the first element of \p metrics is an empty string, that indicates the
+ * broker expects all metrics.
+ */
+RD_EXPORT rd_kafka_resp_err_t
+rd_kafka_mock_telemetry_set_requested_metrics(rd_kafka_mock_cluster_t *mcluster,
+                                              char **metrics,
+                                              size_t metrics_cnt);
+
+
+/**
+ * @brief Set the push interval to be sent to the client for telemetry
+ *        collection when the broker receives GetTelemetrySubscription
+ *        requests.
+ *
+ * @param push_interval_ms Time between pushes in milliseconds. Must be
+ *        greater than 0.
+ */
+RD_EXPORT rd_kafka_resp_err_t
+rd_kafka_mock_telemetry_set_push_interval(rd_kafka_mock_cluster_t *mcluster,
+                                          int64_t push_interval_ms);
+
+typedef struct rd_kafka_mock_cgrp_consumer_target_assignment_s
+    rd_kafka_mock_cgrp_consumer_target_assignment_t;
+
+/**
+ * @brief Create a new target assignment for \p member_cnt members,
+ *        given a member id and a member assignment for each member `i`,
+ *        specified in \p member_ids[i] and \p assignment[i].
+ *
+ * @remark Used for mocking the target assignment
+ *         in the KIP-848 consumer group protocol.
+ *
+ * @param member_ids Array of member ids of size \p member_cnt.
+ * @param member_cnt Number of members.
+ * @param assignment Array of (rd_kafka_topic_partition_list_t *) of size \p
+ *        member_cnt.
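+ *
+ * A minimal usage sketch (illustrative only; \c mcluster, the group name,
+ * the member id and the partition list are assumptions of the example):
+ * @code
+ *   char *member_ids[] = {"member-A"};
+ *   rd_kafka_topic_partition_list_t *parts[1];
+ *   rd_kafka_mock_cgrp_consumer_target_assignment_t *ta;
+ *   parts[0] = rd_kafka_topic_partition_list_new(1);
+ *   rd_kafka_topic_partition_list_add(parts[0], "topic", 0);
+ *   ta = rd_kafka_mock_cgrp_consumer_target_assignment_new(member_ids, 1,
+ *                                                          parts);
+ *   rd_kafka_mock_cgrp_consumer_target_assignment(mcluster, "group", ta);
+ *   rd_kafka_mock_cgrp_consumer_target_assignment_destroy(ta);
+ *   rd_kafka_topic_partition_list_destroy(parts[0]);
+ * @endcode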
+ */ +RD_EXPORT rd_kafka_mock_cgrp_consumer_target_assignment_t * +rd_kafka_mock_cgrp_consumer_target_assignment_new( + char **member_ids, + int member_cnt, + rd_kafka_topic_partition_list_t **assignment); + +/** + * @brief Destroy target assignment \p target_assignment . + */ +RD_EXPORT void rd_kafka_mock_cgrp_consumer_target_assignment_destroy( + rd_kafka_mock_cgrp_consumer_target_assignment_t *target_assignment); + +/** + * @brief Sets next target assignment for the group + * identified by \p group_id to the + * target assignment contained in \p target_assignment, + * in the cluster \p mcluster. + * + * @remark used for mocking target assignment + * in KIP-848 consumer group protocol. + * + * @param mcluster Mock cluster instance. + * @param group_id Group id. + * @param target_assignment Target assignment for all the members. + */ +RD_EXPORT void rd_kafka_mock_cgrp_consumer_target_assignment( + rd_kafka_mock_cluster_t *mcluster, + const char *group_id, + rd_kafka_mock_cgrp_consumer_target_assignment_t *target_assignment); + +/** + * @brief Sets group.consumer.session.timeout.ms + * for the cluster \p mcluster to \p group_consumer_session_timeout_ms. + * + * @remark used in KIP-848 consumer group protocol. + * + * @param mcluster Mock cluster instance. + * @param group_consumer_session_timeout_ms Session timeout in milliseconds. + */ +RD_EXPORT void rd_kafka_mock_set_group_consumer_session_timeout_ms( + rd_kafka_mock_cluster_t *mcluster, + int group_consumer_session_timeout_ms); + +/** + * @brief Sets group.consumer.heartbeat.interval.ms + * for the cluster \p mcluster to \p + * group_consumer_heartbeat_interval_ms. + * + * @remark used in KIP-848 consumer group protocol. + * + * @param mcluster Mock cluster instance. + * @param group_consumer_heartbeat_interval_ms Heartbeat interval in + * milliseconds. + */ +RD_EXPORT void rd_kafka_mock_set_group_consumer_heartbeat_interval_ms( + rd_kafka_mock_cluster_t *mcluster, + int group_consumer_heartbeat_interval_ms); + /**@}*/ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_mock_cgrp.c b/src/third_party/librdkafka/dist/src/rdkafka_mock_cgrp.c index 8f71fb48c9d..0c75e003e57 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_mock_cgrp.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_mock_cgrp.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2020 Magnus Edenhill + * Copyright (c) 2020-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -36,19 +37,21 @@ #include "rdkafka_mock_int.h" -static const char *rd_kafka_mock_cgrp_state_names[] = { +static const char *rd_kafka_mock_cgrp_classic_state_names[] = { "Empty", "Joining", "Syncing", "Rebalancing", "Up"}; -static void rd_kafka_mock_cgrp_rebalance(rd_kafka_mock_cgrp_t *mcgrp, - const char *reason); static void -rd_kafka_mock_cgrp_member_destroy(rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_mock_cgrp_member_t *member); +rd_kafka_mock_cgrp_classic_rebalance(rd_kafka_mock_cgrp_classic_t *mcgrp, + const char *reason); +static void rd_kafka_mock_cgrp_classic_member_destroy( + rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_mock_cgrp_classic_member_t *member); -static void rd_kafka_mock_cgrp_set_state(rd_kafka_mock_cgrp_t *mcgrp, - unsigned int new_state, - const char *reason) { +static void +rd_kafka_mock_cgrp_classic_set_state(rd_kafka_mock_cgrp_classic_t *mcgrp, + unsigned int new_state, + const char *reason) { if (mcgrp->state == new_state) return; @@ -56,8 +59,8 @@ static void rd_kafka_mock_cgrp_set_state(rd_kafka_mock_cgrp_t *mcgrp, "Mock consumer group %s with %d member(s) " "changing state %s -> %s: %s", mcgrp->id, mcgrp->member_cnt, - rd_kafka_mock_cgrp_state_names[mcgrp->state], - rd_kafka_mock_cgrp_state_names[new_state], reason); + rd_kafka_mock_cgrp_classic_state_names[mcgrp->state], + rd_kafka_mock_cgrp_classic_state_names[new_state], reason); mcgrp->state = new_state; } @@ -66,8 +69,9 @@ static void rd_kafka_mock_cgrp_set_state(rd_kafka_mock_cgrp_t *mcgrp, /** * @brief Mark member as active (restart session timer) */ -void rd_kafka_mock_cgrp_member_active(rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_mock_cgrp_member_t *member) { +void rd_kafka_mock_cgrp_classic_member_active( + rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_mock_cgrp_classic_member_t *member) { rd_kafka_dbg(mcgrp->cluster->rk, MOCK, "MOCK", "Marking mock consumer group member %s as active", member->id); @@ -80,11 +84,11 @@ void rd_kafka_mock_cgrp_member_active(rd_kafka_mock_cgrp_t *mcgrp, * * @param member may be NULL. */ -rd_kafka_resp_err_t -rd_kafka_mock_cgrp_check_state(rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_mock_cgrp_member_t *member, - const rd_kafka_buf_t *request, - int32_t generation_id) { +rd_kafka_resp_err_t rd_kafka_mock_cgrp_classic_check_state( + rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_mock_cgrp_classic_member_t *member, + const rd_kafka_buf_t *request, + int32_t generation_id) { int16_t ApiKey = request->rkbuf_reqhdr.ApiKey; rd_bool_t has_generation_id = ApiKey == RD_KAFKAP_SyncGroup || ApiKey == RD_KAFKAP_Heartbeat || @@ -141,9 +145,9 @@ rd_kafka_mock_cgrp_check_state(rd_kafka_mock_cgrp_t *mcgrp, /** * @brief Set a member's assignment (from leader's SyncGroupRequest) */ -void rd_kafka_mock_cgrp_member_assignment_set( - rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_mock_cgrp_member_t *member, +void rd_kafka_mock_cgrp_classic_member_assignment_set( + rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_mock_cgrp_classic_member_t *member, const rd_kafkap_bytes_t *Metadata) { if (member->assignment) { rd_assert(mcgrp->assignment_cnt > 0); @@ -162,9 +166,10 @@ void rd_kafka_mock_cgrp_member_assignment_set( /** * @brief Sync done (successfully) or failed, send responses back to members. 
*/ -static void rd_kafka_mock_cgrp_sync_done(rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_resp_err_t err) { - rd_kafka_mock_cgrp_member_t *member; +static void +rd_kafka_mock_cgrp_classic_sync_done(rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_resp_err_t err) { + rd_kafka_mock_cgrp_classic_member_t *member; TAILQ_FOREACH(member, &mcgrp->members, link) { rd_kafka_buf_t *resp; @@ -180,7 +185,8 @@ static void rd_kafka_mock_cgrp_sync_done(rd_kafka_mock_cgrp_t *mcgrp, resp, !err ? member->assignment : NULL); } - rd_kafka_mock_cgrp_member_assignment_set(mcgrp, member, NULL); + rd_kafka_mock_cgrp_classic_member_assignment_set(mcgrp, member, + NULL); if (member->conn) { rd_kafka_mock_connection_set_blocking(member->conn, @@ -200,20 +206,21 @@ static void rd_kafka_mock_cgrp_sync_done(rd_kafka_mock_cgrp_t *mcgrp, * @brief Check if all members have sent SyncGroupRequests, if so, propagate * assignment to members. */ -static void rd_kafka_mock_cgrp_sync_check(rd_kafka_mock_cgrp_t *mcgrp) { +static void +rd_kafka_mock_cgrp_classic_sync_check(rd_kafka_mock_cgrp_classic_t *mcgrp) { rd_kafka_dbg(mcgrp->cluster->rk, MOCK, "MOCK", "Mock consumer group %s: awaiting %d/%d syncing members " "in state %s", mcgrp->id, mcgrp->assignment_cnt, mcgrp->member_cnt, - rd_kafka_mock_cgrp_state_names[mcgrp->state]); + rd_kafka_mock_cgrp_classic_state_names[mcgrp->state]); if (mcgrp->assignment_cnt < mcgrp->member_cnt) return; - rd_kafka_mock_cgrp_sync_done(mcgrp, RD_KAFKA_RESP_ERR_NO_ERROR); - rd_kafka_mock_cgrp_set_state(mcgrp, RD_KAFKA_MOCK_CGRP_STATE_UP, - "all members synced"); + rd_kafka_mock_cgrp_classic_sync_done(mcgrp, RD_KAFKA_RESP_ERR_NO_ERROR); + rd_kafka_mock_cgrp_classic_set_state(mcgrp, RD_KAFKA_MOCK_CGRP_STATE_UP, + "all members synced"); } @@ -222,16 +229,16 @@ static void rd_kafka_mock_cgrp_sync_check(rd_kafka_mock_cgrp_t *mcgrp) { * which will be sent when the all group member SyncGroupRequest are * received. 
 */
-rd_kafka_resp_err_t
-rd_kafka_mock_cgrp_member_sync_set(rd_kafka_mock_cgrp_t *mcgrp,
-                                   rd_kafka_mock_cgrp_member_t *member,
-                                   rd_kafka_mock_connection_t *mconn,
-                                   rd_kafka_buf_t *resp) {
+rd_kafka_resp_err_t rd_kafka_mock_cgrp_classic_member_sync_set(
+    rd_kafka_mock_cgrp_classic_t *mcgrp,
+    rd_kafka_mock_cgrp_classic_member_t *member,
+    rd_kafka_mock_connection_t *mconn,
+    rd_kafka_buf_t *resp) {

         if (mcgrp->state != RD_KAFKA_MOCK_CGRP_STATE_SYNCING)
                 return RD_KAFKA_RESP_ERR_REBALANCE_IN_PROGRESS; /* FIXME */

-        rd_kafka_mock_cgrp_member_active(mcgrp, member);
+        rd_kafka_mock_cgrp_classic_member_active(mcgrp, member);

         rd_assert(!member->resp);

@@ -240,7 +247,7 @@ rd_kafka_mock_cgrp_member_sync_set(rd_kafka_mock_cgrp_t *mcgrp,
         rd_kafka_mock_connection_set_blocking(member->conn, rd_true);

         /* Check if all members now have an assignment, if so, send responses */
-        rd_kafka_mock_cgrp_sync_check(mcgrp);
+        rd_kafka_mock_cgrp_classic_sync_check(mcgrp);

         return RD_KAFKA_RESP_ERR_NO_ERROR;
 }
@@ -249,16 +256,16 @@
 /**
 * @brief Member is explicitly leaving the group (through LeaveGroupRequest)
 */
-rd_kafka_resp_err_t
-rd_kafka_mock_cgrp_member_leave(rd_kafka_mock_cgrp_t *mcgrp,
-                                rd_kafka_mock_cgrp_member_t *member) {
+rd_kafka_resp_err_t rd_kafka_mock_cgrp_classic_member_leave(
+    rd_kafka_mock_cgrp_classic_t *mcgrp,
+    rd_kafka_mock_cgrp_classic_member_t *member) {

         rd_kafka_dbg(mcgrp->cluster->rk, MOCK, "MOCK",
                      "Member %s is leaving group %s", member->id, mcgrp->id);

-        rd_kafka_mock_cgrp_member_destroy(mcgrp, member);
+        rd_kafka_mock_cgrp_classic_member_destroy(mcgrp, member);

-        rd_kafka_mock_cgrp_rebalance(mcgrp, "explicit member leave");
+        rd_kafka_mock_cgrp_classic_rebalance(mcgrp, "explicit member leave");

         return RD_KAFKA_RESP_ERR_NO_ERROR;
 }
@@ -266,8 +273,9 @@ rd_kafka_mock_cgrp_member_leave(rd_kafka_mock_cgrp_t *mcgrp,
 /**
 * @brief Destroys/frees an array of protocols, including the array itself.
 */
-void rd_kafka_mock_cgrp_protos_destroy(rd_kafka_mock_cgrp_proto_t *protos,
-                                       int proto_cnt) {
+void rd_kafka_mock_cgrp_classic_protos_destroy(
+    rd_kafka_mock_cgrp_classic_proto_t *protos,
+    int proto_cnt) {
         int i;

         for (i = 0; i < proto_cnt; i++) {
@@ -279,30 +287,46 @@ void rd_kafka_mock_cgrp_protos_destroy(rd_kafka_mock_cgrp_proto_t *protos,
         rd_free(protos);
 }

-static void
-rd_kafka_mock_cgrp_rebalance_timer_restart(rd_kafka_mock_cgrp_t *mcgrp,
-                                           int timeout_ms);
+static void rd_kafka_mock_cgrp_classic_rebalance_timer_restart(
+    rd_kafka_mock_cgrp_classic_t *mcgrp,
+    int timeout_ms);

 /**
 * @brief Elect consumer group leader and send JoinGroup responses
 */
-static void rd_kafka_mock_cgrp_elect_leader(rd_kafka_mock_cgrp_t *mcgrp) {
-        rd_kafka_mock_cgrp_member_t *member;
+static void
+rd_kafka_mock_cgrp_classic_elect_leader(rd_kafka_mock_cgrp_classic_t *mcgrp) {
+        rd_kafka_mock_cgrp_classic_member_t *member;

         rd_assert(mcgrp->state == RD_KAFKA_MOCK_CGRP_STATE_JOINING);
         rd_assert(!TAILQ_EMPTY(&mcgrp->members));

         mcgrp->generation_id++;

-        /* Elect a leader.
-         * FIXME: For now we'll use the first member */
-        mcgrp->leader = TAILQ_FIRST(&mcgrp->members);
+        /* Elect a leader deterministically if the group.instance.id is
+         * available, using the lexicographic order of group.instance.ids.
+         * This is not how it's done on a real broker, which uses the first
+         * member joined. But we use a deterministic method for better
+         * testing (in case we want to enforce a specific consumer to be
+         * the group leader).
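+         * E.g. with members whose group.instance.ids are "b" and "a",
+         * the member with "a" is elected regardless of join order.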
+ * If group.instance.id is not specified for any consumer, we use the + * first one joined, similar to the real broker. */ + mcgrp->leader = NULL; + TAILQ_FOREACH(member, &mcgrp->members, link) { + if (!mcgrp->leader) + mcgrp->leader = member; + else if (mcgrp->leader->group_instance_id && + member->group_instance_id && + (rd_strcmp(mcgrp->leader->group_instance_id, + member->group_instance_id) > 0)) + mcgrp->leader = member; + } - rd_kafka_dbg(mcgrp->cluster->rk, MOCK, "MOCK", - "Consumer group %s with %d member(s) is rebalancing: " - "elected leader is %s, generation id %d", - mcgrp->id, mcgrp->member_cnt, mcgrp->leader->id, - mcgrp->generation_id); + rd_kafka_dbg( + mcgrp->cluster->rk, MOCK, "MOCK", + "Consumer group %s with %d member(s) is rebalancing: " + "elected leader is %s (group.instance.id = %s), generation id %d", + mcgrp->id, mcgrp->member_cnt, mcgrp->leader->id, + mcgrp->leader->group_instance_id, mcgrp->generation_id); /* Find the most commonly supported protocol name among the members. * FIXME: For now we'll blindly use the first protocol of the leader. */ @@ -315,7 +339,7 @@ static void rd_kafka_mock_cgrp_elect_leader(rd_kafka_mock_cgrp_t *mcgrp) { rd_bool_t is_leader = member == mcgrp->leader; int member_cnt = is_leader ? mcgrp->member_cnt : 0; rd_kafka_buf_t *resp; - rd_kafka_mock_cgrp_member_t *member2; + rd_kafka_mock_cgrp_classic_member_t *member2; rd_kafka_mock_connection_t *mconn; /* Member connection has been closed, it will eventually @@ -357,7 +381,7 @@ static void rd_kafka_mock_cgrp_elect_leader(rd_kafka_mock_cgrp_t *mcgrp) { /* Mark each member as active to avoid them timing out * at the same time as a JoinGroup handler that blocks * session.timeout.ms to elect a leader. */ - rd_kafka_mock_cgrp_member_active(mcgrp, member); + rd_kafka_mock_cgrp_classic_member_active(mcgrp, member); rd_kafka_mock_connection_set_blocking(mconn, rd_false); rd_kafka_mock_connection_send_response(mconn, resp); @@ -365,28 +389,32 @@ static void rd_kafka_mock_cgrp_elect_leader(rd_kafka_mock_cgrp_t *mcgrp) { mcgrp->last_member_cnt = mcgrp->member_cnt; - rd_kafka_mock_cgrp_set_state(mcgrp, RD_KAFKA_MOCK_CGRP_STATE_SYNCING, - "leader elected, waiting for all " - "members to sync"); + rd_kafka_mock_cgrp_classic_set_state(mcgrp, + RD_KAFKA_MOCK_CGRP_STATE_SYNCING, + "leader elected, waiting for all " + "members to sync"); - rd_kafka_mock_cgrp_rebalance_timer_restart(mcgrp, - mcgrp->session_timeout_ms); + rd_kafka_mock_cgrp_classic_rebalance_timer_restart( + mcgrp, mcgrp->session_timeout_ms); } /** * @brief Trigger group rebalance. */ -static void rd_kafka_mock_cgrp_rebalance(rd_kafka_mock_cgrp_t *mcgrp, - const char *reason) { +static void +rd_kafka_mock_cgrp_classic_rebalance(rd_kafka_mock_cgrp_classic_t *mcgrp, + const char *reason) { int timeout_ms; if (mcgrp->state == RD_KAFKA_MOCK_CGRP_STATE_JOINING) return; /* Do nothing, group is already rebalancing. */ else if (mcgrp->state == RD_KAFKA_MOCK_CGRP_STATE_EMPTY) - timeout_ms = 3000; /* First join, low timeout. - * Same as group.initial.rebalance.delay.ms - * on the broker. */ + /* First join, low timeout. + * Same as group.initial.rebalance.delay.ms + * on the broker. 
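+                 * Configurable per cluster through
+                 * rd_kafka_mock_group_initial_rebalance_delay_ms().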
*/ + timeout_ms = + mcgrp->cluster->defaults.group_initial_rebalance_delay_ms; else if (mcgrp->state == RD_KAFKA_MOCK_CGRP_STATE_REBALANCING && mcgrp->member_cnt == mcgrp->last_member_cnt) timeout_ms = 100; /* All members rejoined, quickly transition @@ -400,21 +428,23 @@ static void rd_kafka_mock_cgrp_rebalance(rd_kafka_mock_cgrp_t *mcgrp, if (mcgrp->state == RD_KAFKA_MOCK_CGRP_STATE_SYNCING) /* Abort current Syncing state */ - rd_kafka_mock_cgrp_sync_done( + rd_kafka_mock_cgrp_classic_sync_done( mcgrp, RD_KAFKA_RESP_ERR_REBALANCE_IN_PROGRESS); - rd_kafka_mock_cgrp_set_state(mcgrp, RD_KAFKA_MOCK_CGRP_STATE_JOINING, - reason); - rd_kafka_mock_cgrp_rebalance_timer_restart(mcgrp, timeout_ms); + rd_kafka_mock_cgrp_classic_set_state( + mcgrp, RD_KAFKA_MOCK_CGRP_STATE_JOINING, reason); + rd_kafka_mock_cgrp_classic_rebalance_timer_restart(mcgrp, timeout_ms); } /** * @brief Consumer group state machine triggered by timer events. */ -static void rd_kafka_mock_cgrp_fsm_timeout(rd_kafka_mock_cgrp_t *mcgrp) { +static void +rd_kafka_mock_cgrp_classic_fsm_timeout(rd_kafka_mock_cgrp_classic_t *mcgrp) { rd_kafka_dbg(mcgrp->cluster->rk, MOCK, "MOCK", "Mock consumer group %s FSM timeout in state %s", - mcgrp->id, rd_kafka_mock_cgrp_state_names[mcgrp->state]); + mcgrp->id, + rd_kafka_mock_cgrp_classic_state_names[mcgrp->state]); switch (mcgrp->state) { case RD_KAFKA_MOCK_CGRP_STATE_EMPTY: @@ -423,9 +453,9 @@ static void rd_kafka_mock_cgrp_fsm_timeout(rd_kafka_mock_cgrp_t *mcgrp) { case RD_KAFKA_MOCK_CGRP_STATE_JOINING: /* Timed out waiting for more members, elect a leader */ if (mcgrp->member_cnt > 0) - rd_kafka_mock_cgrp_elect_leader(mcgrp); + rd_kafka_mock_cgrp_classic_elect_leader(mcgrp); else - rd_kafka_mock_cgrp_set_state( + rd_kafka_mock_cgrp_classic_set_state( mcgrp, RD_KAFKA_MOCK_CGRP_STATE_EMPTY, "no members joined"); break; @@ -434,20 +464,20 @@ static void rd_kafka_mock_cgrp_fsm_timeout(rd_kafka_mock_cgrp_t *mcgrp) { /* Timed out waiting for all members to sync */ /* Send error response to all waiting members */ - rd_kafka_mock_cgrp_sync_done( + rd_kafka_mock_cgrp_classic_sync_done( mcgrp, RD_KAFKA_RESP_ERR_REBALANCE_IN_PROGRESS /* FIXME */); - rd_kafka_mock_cgrp_set_state( + rd_kafka_mock_cgrp_classic_set_state( mcgrp, RD_KAFKA_MOCK_CGRP_STATE_REBALANCING, "timed out waiting for all members to synchronize"); break; case RD_KAFKA_MOCK_CGRP_STATE_REBALANCING: /* Timed out waiting for all members to Leave or re-Join */ - rd_kafka_mock_cgrp_set_state(mcgrp, - RD_KAFKA_MOCK_CGRP_STATE_JOINING, - "timed out waiting for all " - "members to re-Join or Leave"); + rd_kafka_mock_cgrp_classic_set_state( + mcgrp, RD_KAFKA_MOCK_CGRP_STATE_JOINING, + "timed out waiting for all " + "members to re-Join or Leave"); break; case RD_KAFKA_MOCK_CGRP_STATE_UP: @@ -459,27 +489,27 @@ static void rd_kafka_mock_cgrp_fsm_timeout(rd_kafka_mock_cgrp_t *mcgrp) { static void rd_kafka_mcgrp_rebalance_timer_cb(rd_kafka_timers_t *rkts, void *arg) { - rd_kafka_mock_cgrp_t *mcgrp = arg; + rd_kafka_mock_cgrp_classic_t *mcgrp = arg; - rd_kafka_mock_cgrp_fsm_timeout(mcgrp); + rd_kafka_mock_cgrp_classic_fsm_timeout(mcgrp); } /** * @brief Restart the rebalance timer, postponing leader election. 
*/ -static void -rd_kafka_mock_cgrp_rebalance_timer_restart(rd_kafka_mock_cgrp_t *mcgrp, - int timeout_ms) { +static void rd_kafka_mock_cgrp_classic_rebalance_timer_restart( + rd_kafka_mock_cgrp_classic_t *mcgrp, + int timeout_ms) { rd_kafka_timer_start_oneshot( &mcgrp->cluster->timers, &mcgrp->rebalance_tmr, rd_true, timeout_ms * 1000, rd_kafka_mcgrp_rebalance_timer_cb, mcgrp); } -static void -rd_kafka_mock_cgrp_member_destroy(rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_mock_cgrp_member_t *member) { +static void rd_kafka_mock_cgrp_classic_member_destroy( + rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_mock_cgrp_classic_member_t *member) { rd_assert(mcgrp->member_cnt > 0); TAILQ_REMOVE(&mcgrp->members, member, link); mcgrp->member_cnt--; @@ -492,9 +522,10 @@ rd_kafka_mock_cgrp_member_destroy(rd_kafka_mock_cgrp_t *mcgrp, if (member->group_instance_id) rd_free(member->group_instance_id); - rd_kafka_mock_cgrp_member_assignment_set(mcgrp, member, NULL); + rd_kafka_mock_cgrp_classic_member_assignment_set(mcgrp, member, NULL); - rd_kafka_mock_cgrp_protos_destroy(member->protos, member->proto_cnt); + rd_kafka_mock_cgrp_classic_protos_destroy(member->protos, + member->proto_cnt); rd_free(member); } @@ -503,13 +534,13 @@ rd_kafka_mock_cgrp_member_destroy(rd_kafka_mock_cgrp_t *mcgrp, /** * @brief Find member in group. */ -rd_kafka_mock_cgrp_member_t * -rd_kafka_mock_cgrp_member_find(const rd_kafka_mock_cgrp_t *mcgrp, - const rd_kafkap_str_t *MemberId) { - const rd_kafka_mock_cgrp_member_t *member; +rd_kafka_mock_cgrp_classic_member_t *rd_kafka_mock_cgrp_classic_member_find( + const rd_kafka_mock_cgrp_classic_t *mcgrp, + const rd_kafkap_str_t *MemberId) { + const rd_kafka_mock_cgrp_classic_member_t *member; TAILQ_FOREACH(member, &mcgrp->members, link) { if (!rd_kafkap_str_cmp_str(MemberId, member->id)) - return (rd_kafka_mock_cgrp_member_t *)member; + return (rd_kafka_mock_cgrp_classic_member_t *)member; } return NULL; @@ -519,24 +550,25 @@ rd_kafka_mock_cgrp_member_find(const rd_kafka_mock_cgrp_t *mcgrp, /** * @brief Update or add member to consumer group */ -rd_kafka_resp_err_t -rd_kafka_mock_cgrp_member_add(rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_mock_connection_t *mconn, - rd_kafka_buf_t *resp, - const rd_kafkap_str_t *MemberId, - const rd_kafkap_str_t *ProtocolType, - rd_kafka_mock_cgrp_proto_t *protos, - int proto_cnt, - int session_timeout_ms) { - rd_kafka_mock_cgrp_member_t *member; +rd_kafka_resp_err_t rd_kafka_mock_cgrp_classic_member_add( + rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_mock_connection_t *mconn, + rd_kafka_buf_t *resp, + const rd_kafkap_str_t *MemberId, + const rd_kafkap_str_t *ProtocolType, + const rd_kafkap_str_t *GroupInstanceId, + rd_kafka_mock_cgrp_classic_proto_t *protos, + int proto_cnt, + int session_timeout_ms) { + rd_kafka_mock_cgrp_classic_member_t *member; rd_kafka_resp_err_t err; - err = rd_kafka_mock_cgrp_check_state(mcgrp, NULL, resp, -1); + err = rd_kafka_mock_cgrp_classic_check_state(mcgrp, NULL, resp, -1); if (err) return err; /* Find member */ - member = rd_kafka_mock_cgrp_member_find(mcgrp, MemberId); + member = rd_kafka_mock_cgrp_classic_member_find(mcgrp, MemberId); if (!member) { /* Not found, add member */ member = rd_calloc(1, sizeof(*member)); @@ -549,25 +581,29 @@ rd_kafka_mock_cgrp_member_add(rd_kafka_mock_cgrp_t *mcgrp, } else member->id = RD_KAFKAP_STR_DUP(MemberId); + if (RD_KAFKAP_STR_LEN(GroupInstanceId)) + member->group_instance_id = + RD_KAFKAP_STR_DUP(GroupInstanceId); + TAILQ_INSERT_TAIL(&mcgrp->members, member, link); 
mcgrp->member_cnt++; } if (mcgrp->state != RD_KAFKA_MOCK_CGRP_STATE_JOINING) - rd_kafka_mock_cgrp_rebalance(mcgrp, "member join"); + rd_kafka_mock_cgrp_classic_rebalance(mcgrp, "member join"); mcgrp->session_timeout_ms = session_timeout_ms; if (member->protos) - rd_kafka_mock_cgrp_protos_destroy(member->protos, - member->proto_cnt); + rd_kafka_mock_cgrp_classic_protos_destroy(member->protos, + member->proto_cnt); member->protos = protos; member->proto_cnt = proto_cnt; rd_assert(!member->resp); member->resp = resp; member->conn = mconn; - rd_kafka_mock_cgrp_member_active(mcgrp, member); + rd_kafka_mock_cgrp_classic_member_active(mcgrp, member); return RD_KAFKA_RESP_ERR_NO_ERROR; } @@ -575,10 +611,10 @@ rd_kafka_mock_cgrp_member_add(rd_kafka_mock_cgrp_t *mcgrp, /** * @brief Check if any members have exceeded the session timeout. */ -static void rd_kafka_mock_cgrp_session_tmr_cb(rd_kafka_timers_t *rkts, - void *arg) { - rd_kafka_mock_cgrp_t *mcgrp = arg; - rd_kafka_mock_cgrp_member_t *member, *tmp; +static void rd_kafka_mock_cgrp_classic_session_tmr_cb(rd_kafka_timers_t *rkts, + void *arg) { + rd_kafka_mock_cgrp_classic_t *mcgrp = arg; + rd_kafka_mock_cgrp_classic_member_t *member, *tmp; rd_ts_t now = rd_clock(); int timeout_cnt = 0; @@ -592,19 +628,19 @@ static void rd_kafka_mock_cgrp_session_tmr_cb(rd_kafka_timers_t *rkts, "Member %s session timed out for group %s", member->id, mcgrp->id); - rd_kafka_mock_cgrp_member_destroy(mcgrp, member); + rd_kafka_mock_cgrp_classic_member_destroy(mcgrp, member); timeout_cnt++; } if (timeout_cnt) - rd_kafka_mock_cgrp_rebalance(mcgrp, "member timeout"); + rd_kafka_mock_cgrp_classic_rebalance(mcgrp, "member timeout"); } -void rd_kafka_mock_cgrp_destroy(rd_kafka_mock_cgrp_t *mcgrp) { - rd_kafka_mock_cgrp_member_t *member; +void rd_kafka_mock_cgrp_classic_destroy(rd_kafka_mock_cgrp_classic_t *mcgrp) { + rd_kafka_mock_cgrp_classic_member_t *member; - TAILQ_REMOVE(&mcgrp->cluster->cgrps, mcgrp, link); + TAILQ_REMOVE(&mcgrp->cluster->cgrps_classic, mcgrp, link); rd_kafka_timer_stop(&mcgrp->cluster->timers, &mcgrp->rebalance_tmr, rd_true); @@ -615,15 +651,16 @@ void rd_kafka_mock_cgrp_destroy(rd_kafka_mock_cgrp_t *mcgrp) { if (mcgrp->protocol_name) rd_free(mcgrp->protocol_name); while ((member = TAILQ_FIRST(&mcgrp->members))) - rd_kafka_mock_cgrp_member_destroy(mcgrp, member); + rd_kafka_mock_cgrp_classic_member_destroy(mcgrp, member); rd_free(mcgrp); } -rd_kafka_mock_cgrp_t *rd_kafka_mock_cgrp_find(rd_kafka_mock_cluster_t *mcluster, - const rd_kafkap_str_t *GroupId) { - rd_kafka_mock_cgrp_t *mcgrp; - TAILQ_FOREACH(mcgrp, &mcluster->cgrps, link) { +rd_kafka_mock_cgrp_classic_t * +rd_kafka_mock_cgrp_classic_find(rd_kafka_mock_cluster_t *mcluster, + const rd_kafkap_str_t *GroupId) { + rd_kafka_mock_cgrp_classic_t *mcgrp; + TAILQ_FOREACH(mcgrp, &mcluster->cgrps_classic, link) { if (!rd_kafkap_str_cmp_str(GroupId, mcgrp->id)) return mcgrp; } @@ -633,15 +670,15 @@ rd_kafka_mock_cgrp_t *rd_kafka_mock_cgrp_find(rd_kafka_mock_cluster_t *mcluster, /** - * @brief Find or create a consumer group + * @brief Find or create a classic consumer group */ -rd_kafka_mock_cgrp_t * -rd_kafka_mock_cgrp_get(rd_kafka_mock_cluster_t *mcluster, - const rd_kafkap_str_t *GroupId, - const rd_kafkap_str_t *ProtocolType) { - rd_kafka_mock_cgrp_t *mcgrp; +rd_kafka_mock_cgrp_classic_t * +rd_kafka_mock_cgrp_classic_get(rd_kafka_mock_cluster_t *mcluster, + const rd_kafkap_str_t *GroupId, + const rd_kafkap_str_t *ProtocolType) { + rd_kafka_mock_cgrp_classic_t *mcgrp; - mcgrp = 
rd_kafka_mock_cgrp_find(mcluster, GroupId);
+        mcgrp = rd_kafka_mock_cgrp_classic_find(mcluster, GroupId);
         if (mcgrp)
                 return mcgrp;

@@ -656,24 +693,25 @@ rd_kafka_mock_cgrp_get(rd_kafka_mock_cluster_t *mcluster,

         TAILQ_INIT(&mcgrp->members);
         rd_kafka_timer_start(&mcluster->timers, &mcgrp->session_tmr,
                              1000 * 1000 /*1s*/,
-                             rd_kafka_mock_cgrp_session_tmr_cb, mcgrp);
+                             rd_kafka_mock_cgrp_classic_session_tmr_cb, mcgrp);

-        TAILQ_INSERT_TAIL(&mcluster->cgrps, mcgrp, link);
+        TAILQ_INSERT_TAIL(&mcluster->cgrps_classic, mcgrp, link);

         return mcgrp;
 }


 /**
- * @brief A client connection closed, check if any cgrp has any state
+ * @brief A client connection closed, check if any classic cgrp has any state
 *        for this connection that needs to be cleared.
 */
-void rd_kafka_mock_cgrps_connection_closed(rd_kafka_mock_cluster_t *mcluster,
-                                           rd_kafka_mock_connection_t *mconn) {
-        rd_kafka_mock_cgrp_t *mcgrp;
+void rd_kafka_mock_cgrps_classic_connection_closed(
+    rd_kafka_mock_cluster_t *mcluster,
+    rd_kafka_mock_connection_t *mconn) {
+        rd_kafka_mock_cgrp_classic_t *mcgrp;

-        TAILQ_FOREACH(mcgrp, &mcluster->cgrps, link) {
-                rd_kafka_mock_cgrp_member_t *member, *tmp;
+        TAILQ_FOREACH(mcgrp, &mcluster->cgrps_classic, link) {
+                rd_kafka_mock_cgrp_classic_member_t *member, *tmp;
                 TAILQ_FOREACH_SAFE(member, &mcgrp->members, link, tmp) {
                         if (member->conn == mconn) {
                                 member->conn = NULL;
@@ -685,3 +723,1154 @@ void rd_kafka_mock_cgrps_connection_closed(rd_kafka_mock_cluster_t *mcluster,
                 }
         }
 }
+
+/**
+ * @struct Target assignment for a consumer group.
+ *         `member_ids` and `assignment` are in the same order
+ *         and have the same count.
+ */
+typedef struct rd_kafka_mock_cgrp_consumer_target_assignment_s {
+        rd_list_t *member_ids; /**< Member id list (char *). */
+        rd_list_t *assignment; /**< Assignment list
+                                    (rd_kafka_topic_partition_list_t *).
*/ +} rd_kafka_mock_cgrp_consumer_target_assignment_t; + +static rd_kafka_mock_cgrp_consumer_target_assignment_t * +rd_kafka_mock_cgrp_consumer_target_assignment_new0(rd_list_t *member_ids, + rd_list_t *assignment) { + rd_assert(member_ids->rl_cnt == assignment->rl_cnt); + rd_kafka_mock_cgrp_consumer_target_assignment_t *target_assignment = + rd_calloc(1, sizeof(*target_assignment)); + target_assignment->member_ids = + rd_list_copy(member_ids, rd_list_string_copy, NULL); + target_assignment->assignment = rd_list_copy( + assignment, rd_kafka_topic_partition_list_copy_opaque, NULL); + return target_assignment; +} + +rd_kafka_mock_cgrp_consumer_target_assignment_t * +rd_kafka_mock_cgrp_consumer_target_assignment_new( + char **member_ids, + int member_cnt, + rd_kafka_topic_partition_list_t **assignment) { + int i; + rd_list_t *member_id_list, *assignment_list; + rd_kafka_mock_cgrp_consumer_target_assignment_t *ret; + + member_id_list = rd_list_new(member_cnt, rd_free); + assignment_list = + rd_list_new(member_cnt, rd_kafka_topic_partition_list_destroy_free); + for (i = 0; i < member_cnt; i++) { + rd_list_add(member_id_list, rd_strdup(member_ids[i])); + rd_list_add(assignment_list, + rd_kafka_topic_partition_list_copy(assignment[i])); + } + + ret = rd_kafka_mock_cgrp_consumer_target_assignment_new0( + member_id_list, assignment_list); + rd_list_destroy(member_id_list); + rd_list_destroy(assignment_list); + return ret; +} + +void rd_kafka_mock_cgrp_consumer_target_assignment_destroy( + rd_kafka_mock_cgrp_consumer_target_assignment_t *target_assignment) { + rd_list_destroy(target_assignment->member_ids); + rd_list_destroy(target_assignment->assignment); + rd_free(target_assignment); +} + +/** + * @brief Sets next target assignment and member epoch for \p member + * to a copy of partition list \p rktparlist, + * filling its topic ids if not provided, using \p cgrp cluster topics. + * + * @param mcgrp The consumer group containing the member. + * @param member A consumer group member. + * @param target_member_epoch New member epoch. + * @param rktparlist Next target assignment. + * + * @locks mcluster->lock MUST be held. + */ +static void rd_kafka_mock_cgrp_consumer_member_target_assignment_set( + rd_kafka_mock_cgrp_consumer_t *mcgrp, + rd_kafka_mock_cgrp_consumer_member_t *member, + int target_member_epoch, + const rd_kafka_topic_partition_list_t *rktparlist) { + rd_kafka_topic_partition_t *rktpar; + if (member->target_assignment) { + rd_kafka_topic_partition_list_destroy( + member->target_assignment); + } + member->target_member_epoch = target_member_epoch; + member->target_assignment = + rd_kafka_topic_partition_list_copy(rktparlist); + + /* If not present, fill topic ids using names */ + RD_KAFKA_TPLIST_FOREACH(rktpar, member->target_assignment) { + rd_kafka_Uuid_t topic_id = + rd_kafka_topic_partition_get_topic_id(rktpar); + if (!rd_kafka_Uuid_cmp(topic_id, RD_KAFKA_UUID_ZERO)) { + rd_kafka_mock_topic_t *mtopic = + rd_kafka_mock_topic_find(mcgrp->cluster, + rktpar->topic); + if (mtopic) + rd_kafka_topic_partition_set_topic_id( + rktpar, mtopic->id); + } + } +} + +/** + * @brief Sets next target assignment for group \p mcgrp + * to a copy of \p target_assignment partition lists. + * + * @param mcgrp The consumer group. + * @param target_assignment Target assignment for all members. + * + * @locks mcluster->lock MUST be held. 
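+ *
+ * @remark Bumps the group epoch and uses it as the new target member epoch
+ *         for every listed member; member ids that are not currently in the
+ *         group are skipped.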
+ */
+static void rd_kafka_mock_cgrp_consumer_target_assignment_set(
+    rd_kafka_mock_cgrp_consumer_t *mcgrp,
+    rd_kafka_mock_cgrp_consumer_target_assignment_t *target_assignment) {
+        int i = 0;
+        int32_t new_target_member_epoch;
+        const char *member_id;
+        rd_kafka_mock_cgrp_consumer_member_t *member;
+
+        mcgrp->group_epoch++;
+        new_target_member_epoch = mcgrp->group_epoch;
+        RD_LIST_FOREACH(member_id, target_assignment->member_ids, i) {
+                rd_kafkap_str_t *member_id_str =
+                    rd_kafkap_str_new(member_id, strlen(member_id));
+                rd_kafka_topic_partition_list_t *member_assignment =
+                    rd_list_elem(target_assignment->assignment, i);
+                member = rd_kafka_mock_cgrp_consumer_member_find(mcgrp,
+                                                                 member_id_str);
+                rd_kafkap_str_destroy(member_id_str);
+
+                if (!member)
+                        continue;
+
+                rd_kafka_mock_cgrp_consumer_member_target_assignment_set(
+                    mcgrp, member, new_target_member_epoch, member_assignment);
+        }
+}
+
+typedef RD_MAP_TYPE(const char *, rd_list_t *) map_str_list;
+typedef RD_MAP_TYPE(const char *, int *) map_str_int;
+
+/**
+ * @brief Calculate a simple range target assignment for the consumer group \p
+ *        mcgrp. This isn't replicating any given broker assignor but is used
+ *        when the test doesn't need a specific type of assignment.
+ *
+ * If the test needs it, instead of replicating the same conditions with all
+ * the members, one can mock the assignment directly with
+ * `rd_kafka_mock_cgrp_consumer_target_assignment`.
+ */
+static rd_kafka_mock_cgrp_consumer_target_assignment_t *
+rd_kafka_mock_cgrp_consumer_target_assignment_calculate_range(
+    const rd_kafka_mock_cgrp_consumer_t *mcgrp) {
+        int i, *i_pointer;
+        const char *topic;
+        rd_list_t *members;
+        rd_kafka_mock_cgrp_consumer_member_t *member;
+        rd_kafka_mock_cluster_t *mcluster = mcgrp->cluster;
+        /* List of member ids (char *) */
+        rd_list_t *member_ids = rd_list_new(mcgrp->member_cnt, rd_free);
+        /* List of member assignments (rd_kafka_topic_partition_list_t *) */
+        rd_list_t *assignment = rd_list_new(
+            mcgrp->member_cnt, rd_kafka_topic_partition_list_destroy_free);
+        /* Map from topic name to list of members */
+        map_str_list topic_members =
+            RD_MAP_INITIALIZER(mcgrp->member_cnt, rd_map_str_cmp,
+                               rd_map_str_hash, NULL, rd_list_destroy_free);
+        /* Map from member id to index in the members and assignment lists. */
+        map_str_int member_idx = RD_MAP_INITIALIZER(
+            mcgrp->member_cnt, rd_map_str_cmp, rd_map_str_hash, NULL, rd_free);
+
+        i = 0;
+
+        /* First create a map with topics associated with the list of members
+         * subscribed to them, and save each member's index in the
+         * `member_idx` map. */
+        TAILQ_FOREACH(member, &mcgrp->members, link) {
+                int j;
+                rd_list_add(member_ids, rd_strdup(member->id));
+                rd_list_add(assignment, rd_kafka_topic_partition_list_new(0));
+
+                RD_LIST_FOREACH(topic, member->subscribed_topics, j) {
+                        if (!RD_MAP_GET(&topic_members, topic)) {
+                                members = rd_list_new(0, NULL);
+                                RD_MAP_SET(&topic_members, topic, members);
+                        } else
+                                members = RD_MAP_GET(&topic_members, topic);
+                        rd_list_add(members, member);
+                }
+                i_pointer  = rd_calloc(1, sizeof(*i_pointer));
+                *i_pointer = i;
+                RD_MAP_SET(&member_idx, member->id, i_pointer);
+                i++;
+        }
+
+        /* For each topic, do a range assignment and add the
+         * corresponding partitions to the assignment of each subscribed
+         * member, finding the list index using the `member_idx` map.
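+         * E.g. a topic with 5 partitions and 2 subscribed members gives
+         * common = 2 and one_more = 1, so the member at index 0 gets
+         * partitions 0,1,2 and the member at index 1 gets partitions 3,4.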
*/ + RD_MAP_FOREACH(topic, members, &topic_members) { + rd_kafka_Uuid_t topic_id; + rd_kafka_topic_partition_list_t *member_assignment; + int members_cnt = rd_list_cnt(members); + int common, one_more, assigned = 0; + rd_kafkap_str_t Topic = {.str = topic, .len = strlen(topic)}; + rd_kafka_mock_topic_t *mock_topic = + rd_kafka_mock_topic_find_by_kstr(mcluster, &Topic); + if (!mock_topic) + continue; + + topic_id = mock_topic->id; + + /* Assign one partition more + * to the first mock_topic->partition_cnt % members_cnt + * members. */ + common = mock_topic->partition_cnt / members_cnt; + one_more = mock_topic->partition_cnt % members_cnt; + + RD_LIST_FOREACH(member, members, i) { + int j, num_partitions = common; + int idx = *RD_MAP_GET(&member_idx, member->id); + member_assignment = rd_list_elem(assignment, idx); + if (idx < one_more) + num_partitions++; + for (j = 0; j < num_partitions; j++) { + rd_kafka_topic_partition_t *rktpar = + rd_kafka_topic_partition_list_add( + member_assignment, topic, assigned + j); + rd_kafka_topic_partition_set_topic_id(rktpar, + topic_id); + } + assigned += num_partitions; + } + } + + rd_kafka_mock_cgrp_consumer_target_assignment_t *ret = + rd_kafka_mock_cgrp_consumer_target_assignment_new0(member_ids, + assignment); + + RD_MAP_DESTROY(&topic_members); + RD_MAP_DESTROY(&member_idx); + + rd_list_destroy(member_ids); + rd_list_destroy(assignment); + + return ret; +} + +/** + * @brief Recalculate and set a target assignment for \p mcgrp + * only if `mcgrp->manual_assignment` isn't set. + * + * @locks mcluster->lock MUST be held. + */ +static void rd_kafka_mock_cgrp_consumer_target_assignment_recalculate( + rd_kafka_mock_cgrp_consumer_t *mcgrp) { + if (mcgrp->manual_assignment) + return; + + rd_kafka_mock_cgrp_consumer_target_assignment_t *target_assignment = + rd_kafka_mock_cgrp_consumer_target_assignment_calculate_range( + mcgrp); + rd_kafka_mock_cgrp_consumer_target_assignment_set(mcgrp, + target_assignment); + rd_kafka_mock_cgrp_consumer_target_assignment_destroy( + target_assignment); +} + +/** + * @brief Set manual target assignment \p target_assignment + * to the consumer group \p mcgrp . + * + * @param mcgrp Consumer group + * @param target_assignment Target assignment to set. + * Pass NULL to return to automatic assignment. + * + * @locks mcluster->lock MUST be held. + */ +static void rd_kafka_mock_cgrp_consumer_target_assignment_set_manual( + rd_kafka_mock_cgrp_consumer_t *mcgrp, + rd_kafka_mock_cgrp_consumer_target_assignment_t *target_assignment) { + if (!target_assignment) { + mcgrp->manual_assignment = rd_false; + rd_kafka_mock_cgrp_consumer_target_assignment_recalculate( + mcgrp); + return; + } + + mcgrp->manual_assignment = rd_true; + + rd_kafka_mock_cgrp_consumer_target_assignment_set(mcgrp, + target_assignment); +} + +/** + * @brief Sets \p member current assignment to a copy of + * \p current_assignment. + * + * @param member A consumer group member. + * @param current_assignment Current assignment to set. + * + * @locks mcluster->lock MUST be held. + */ +static void rd_kafka_mock_cgrp_consumer_member_current_assignment_set( + rd_kafka_mock_cgrp_consumer_member_t *member, + const rd_kafka_topic_partition_list_t *current_assignment) { + if (member->current_assignment) { + rd_kafka_topic_partition_list_destroy( + member->current_assignment); + } + + member->current_assignment = + current_assignment + ? 
rd_kafka_topic_partition_list_copy(current_assignment)
+                : NULL;
+}
+
+/**
+ * @brief Sets \p member returned assignment to a
+ *        copy of \p returned_assignment.
+ *
+ * @param member A consumer group member.
+ * @param returned_assignment Returned assignment to set.
+ *
+ * @locks mcluster->lock MUST be held.
+ */
+static void rd_kafka_mock_cgrp_consumer_member_returned_assignment_set(
+    rd_kafka_mock_cgrp_consumer_member_t *member,
+    const rd_kafka_topic_partition_list_t *returned_assignment) {
+        if (member->returned_assignment) {
+                rd_kafka_topic_partition_list_destroy(
+                    member->returned_assignment);
+        }
+        member->returned_assignment =
+            returned_assignment
+                ? rd_kafka_topic_partition_list_copy(returned_assignment)
+                : NULL;
+}
+
+/**
+ * @brief Returns a copy of \p member target assignment containing only
+ *        partitions that can be assigned, i.e. those whose topic id is
+ *        non-zero.
+ *
+ * @param member The group member.
+ *
+ * @remark The returned pointer ownership is transferred to the caller.
+ *
+ * @locks mcluster->lock MUST be held.
+ */
+static rd_kafka_topic_partition_list_t *
+rd_kafka_mock_cgrp_consumer_member_target_assignment_assignable(
+    rd_kafka_mock_cgrp_consumer_member_t *member) {
+        rd_kafka_topic_partition_list_t *assignment =
+            member->target_assignment;
+        rd_kafka_topic_partition_t *rktpar;
+        rd_kafka_topic_partition_list_t *ret =
+            rd_kafka_topic_partition_list_new(assignment->cnt);
+
+        RD_KAFKA_TPLIST_FOREACH(rktpar, assignment) {
+                rd_kafka_Uuid_t topic_id =
+                    rd_kafka_topic_partition_get_topic_id(rktpar);
+                if (rd_kafka_Uuid_cmp(topic_id, RD_KAFKA_UUID_ZERO)) {
+                        rd_kafka_topic_partition_list_add_copy(ret, rktpar);
+                }
+        }
+
+        return ret;
+}
+
+/**
+ * Returns true iff \p new_assignment doesn't have any intersection with any
+ * other member's current assignment.
+ *
+ * If there's an intersection, it means we cannot bump the epoch at the moment,
+ * because some of these partitions are held by a different member. They have
+ * to be revoked from that member before it's possible to increase the epoch
+ * and assign additional partitions to this member.
+ */
+rd_bool_t rd_kafka_mock_cgrp_consumer_member_next_assignment_can_bump_epoch(
+    rd_kafka_mock_cgrp_consumer_member_t *member,
+    rd_kafka_topic_partition_list_t *new_assignment) {
+        rd_kafka_topic_partition_list_t *double_assignment,
+            *assigned_partitions = rd_kafka_topic_partition_list_new(0);
+        rd_kafka_mock_cgrp_consumer_member_t *other_member;
+        rd_kafka_mock_cgrp_consumer_t *mcgrp = member->mcgrp;
+        rd_bool_t ret;
+
+        TAILQ_FOREACH(other_member, &mcgrp->members, link) {
+                int other_current_assignment_cnt  = 0,
+                    other_returned_assignment_cnt = 0;
+                if (member == other_member)
+                        continue;
+                if (other_member->current_assignment)
+                        other_current_assignment_cnt =
+                            other_member->current_assignment->cnt;
+                if (other_member->returned_assignment)
+                        other_returned_assignment_cnt =
+                            other_member->returned_assignment->cnt;
+
+                if (other_current_assignment_cnt > 0 &&
+                    other_current_assignment_cnt >
+                        other_returned_assignment_cnt) {
+                        /* This is the case where we're revoking
+                         * some partitions.
+                         * returned_assignment < current_assignment. */
+                        rd_kafka_topic_partition_list_add_list(
+                            assigned_partitions,
+                            other_member->current_assignment);
+                } else if (other_returned_assignment_cnt > 0) {
+                        /* This is the case where we're assigning
+                         * some partitions.
+                         * returned_assignment >= current_assignment.
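+                         * E.g. current = {0} and returned = {0,3}:
+                         * partition 3 is already promised to that member,
+                         * so it counts as assigned here too.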
*/ + rd_kafka_topic_partition_list_add_list( + assigned_partitions, + other_member->returned_assignment); + } + } + double_assignment = rd_kafka_topic_partition_list_intersection_by_id( + new_assignment, assigned_partitions); + ret = double_assignment->cnt == 0; + + rd_kafka_topic_partition_list_destroy(assigned_partitions); + rd_kafka_topic_partition_list_destroy(double_assignment); + return ret; +} + +/** + * @brief Calculates if \p member + * needs a revocation, that is if its current assignment + * isn't a subset of its target assignment. + * In case it needs a revocation, it returns + * the intersection between the two assignments, + * that is the remaining partitions after revocation + * of those not included in target assignment. + * + * @param member The group member. + * + * @return The remaining set of partitions, or NULL in case no revocation + * is needed. + * + * @remark The returned pointer ownership is transferred to the caller. + * + * @locks mcluster->lock MUST be held. + */ +static rd_kafka_topic_partition_list_t * +rd_kafka_mock_cgrp_consumer_member_needs_revocation( + rd_kafka_mock_cgrp_consumer_member_t *member) { + rd_kafka_topic_partition_list_t *intersection; + rd_bool_t needs_revocation; + + if (member->current_assignment) + /* If we have a current assignment we + * calculate the intersection with + * target assignment. */ + intersection = rd_kafka_topic_partition_list_intersection_by_id( + member->current_assignment, member->target_assignment); + else + /* Otherwise intersection is empty. */ + intersection = rd_kafka_topic_partition_list_new(0); + + needs_revocation = member->current_assignment && + intersection->cnt < member->current_assignment->cnt; + if (needs_revocation) { + return intersection; + } + + rd_kafka_topic_partition_list_destroy(intersection); + return NULL; +} + +/** + * @brief Calculates if \p member + * can receive new partitions, once revocation is completed. + * In case new partitions aren't held by other members, it + * returns the assignable target assignment and bumps current + * member epoch, otherwise it returns NULL and + * doesn't change current member epoch. + * + * @param member The group member. + * + * @return The assignable set of partitions, or NULL in case new partitions + * cannot be assigned yet. + * + * @remark The returned pointer ownership is transferred to the caller. + * + * @locks mcluster->lock MUST be held. + */ +static rd_kafka_topic_partition_list_t * +rd_kafka_mock_cgrp_consumer_member_needs_assignment( + rd_kafka_mock_cgrp_consumer_member_t *member) { + rd_kafka_topic_partition_list_t *returned_assignment = + rd_kafka_mock_cgrp_consumer_member_target_assignment_assignable( + member); + + if (!rd_kafka_mock_cgrp_consumer_member_next_assignment_can_bump_epoch( + member, returned_assignment)) { + /* We still can't bump the epoch: + * some partitions are held by other members. + * We have to return NULL. */ + rd_kafka_topic_partition_list_destroy(returned_assignment); + return NULL; + } + + /* No partitions to remove: return the + * target assignment and reconcile the + * epochs. */ + member->current_member_epoch = member->target_member_epoch; + return returned_assignment; +} + +/** + * @brief Calculates next assignment and member epoch for a \p member, + * given \p current_assignment. + * + * @param member The group member. + * @param current_assignment The assignment sent by the member, or NULL if it + * didn't change. Must be NULL if *member_epoch is 0.
+ * @param member_epoch Pointer to client reported member epoch. Can be updated. + * + * @return The new assignment to return to the member. + * + * @remark The returned pointer ownership is transferred to the caller. + * + * @locks mcluster->lock MUST be held. + */ +rd_kafka_topic_partition_list_t * +rd_kafka_mock_cgrp_consumer_member_next_assignment( + rd_kafka_mock_cgrp_consumer_member_t *member, + rd_kafka_topic_partition_list_t *current_assignment, + int *member_epoch) { + rd_kafka_topic_partition_list_t *assignment_to_return = NULL; + + if (current_assignment) { + /* Update current assignment to reflect what is provided + * by the client. */ + rd_kafka_mock_cgrp_consumer_member_current_assignment_set( + member, current_assignment); + } + + if (*member_epoch > 0 && + member->current_member_epoch != *member_epoch) { + /* Member epoch is different from the one we expect, + * that means we have to fence the member. */ + *member_epoch = -1; /* FENCED_MEMBER_EPOCH */ + return NULL; + } + + if (member->target_assignment) { + /* We have a target assignment, + * let's check if we can assign it. */ + + if (*member_epoch != member->current_member_epoch || + member->current_member_epoch != + member->target_member_epoch) { + /* Epochs are different, that means we have to bump the + * epoch immediately or do some revocations + * before that. */ + + assignment_to_return = + rd_kafka_mock_cgrp_consumer_member_needs_revocation( + member); + if (!assignment_to_return) { + /* After revocation we only have to + * add new partitions. + * In case these new partitions are held + * by other members we still cannot do it. */ + assignment_to_return = + rd_kafka_mock_cgrp_consumer_member_needs_assignment( + member); + } + } else if (!member->returned_assignment) { + /* If all the epochs are the same, the only case + * where we have to return the assignment is + * after a disconnection, when returned_assignment has + * been reset to NULL. */ + assignment_to_return = + rd_kafka_mock_cgrp_consumer_member_target_assignment_assignable( + member); + } + } + + *member_epoch = member->current_member_epoch; + if (assignment_to_return) { + /* Compare assignment_to_return with last returned_assignment. + * If equal, return NULL, otherwise return assignment_to_return + * and update last returned_assignment. */ + rd_bool_t same_returned_assignment = + member->returned_assignment && + !rd_kafka_topic_partition_list_cmp( + member->returned_assignment, assignment_to_return, + rd_kafka_topic_partition_by_id_cmp); + + if (same_returned_assignment) { + /* Returned assignment is the same as previous + * one, we return NULL instead to show no change. */ + rd_kafka_topic_partition_list_destroy( + assignment_to_return); + assignment_to_return = NULL; + } else { + /* We store returned assignment + * for later comparison. */ + rd_kafka_mock_cgrp_consumer_member_returned_assignment_set( + member, assignment_to_return); + } + } + return assignment_to_return; +} + +/** + * @brief Mark member as active (restart session timer). + * + * @param mcgrp Member's consumer group. + * @param member Member to set as active. + * + * @locks mcluster->lock MUST be held. + */ +void rd_kafka_mock_cgrp_consumer_member_active( + rd_kafka_mock_cgrp_consumer_t *mcgrp, + rd_kafka_mock_cgrp_consumer_member_t *member) { + rd_kafka_dbg(mcgrp->cluster->rk, MOCK, "MOCK", + "Marking mock consumer group member %s as active", + member->id); + member->ts_last_activity = rd_clock(); +} + +/** + * @brief Finds a member in consumer group \p mcgrp by \p MemberId. 
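+ * Lookup is a linear scan of the group's member list.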
+ * + * @param mcgrp Consumer group to search. + * @param MemberId Member id to look for. + * @return Found member or NULL. + * + * @locks mcluster->lock MUST be held. + */ +rd_kafka_mock_cgrp_consumer_member_t *rd_kafka_mock_cgrp_consumer_member_find( + const rd_kafka_mock_cgrp_consumer_t *mcgrp, + const rd_kafkap_str_t *MemberId) { + const rd_kafka_mock_cgrp_consumer_member_t *member; + TAILQ_FOREACH(member, &mcgrp->members, link) { + if (!rd_kafkap_str_cmp_str(MemberId, member->id)) + return (rd_kafka_mock_cgrp_consumer_member_t *)member; + } + + return NULL; +} + +/** + * @brief Finds a member in consumer group \p mcgrp by \p InstanceId. + * + * @param mcgrp Consumer group to search. + * @param InstanceId Instance id to look for. + * @return Found member or NULL. + * + * @locks mcluster->lock MUST be held. + */ +rd_kafka_mock_cgrp_consumer_member_t * +rd_kafka_mock_cgrp_consumer_member_find_by_instance_id( + const rd_kafka_mock_cgrp_consumer_t *mcgrp, + const rd_kafkap_str_t *InstanceId) { + if (RD_KAFKAP_STR_IS_NULL(InstanceId)) + return NULL; + + const rd_kafka_mock_cgrp_consumer_member_t *member; + TAILQ_FOREACH(member, &mcgrp->members, link) { + if (!member->instance_id) + continue; + + if (!rd_kafkap_str_cmp_str(InstanceId, member->instance_id)) + return (rd_kafka_mock_cgrp_consumer_member_t *)member; + } + + return NULL; +} + +static void validate_subscription(const rd_kafkap_str_t *SubscribedTopicNames, + int32_t SubscribedTopicNamesCnt, + const rd_kafkap_str_t *SubscribedTopicRegex) { + /* Either they are both NULL + * or both non-NULL. */ + rd_assert((SubscribedTopicNames == NULL) == + RD_KAFKAP_STR_IS_NULL(SubscribedTopicRegex)); + /* If they're not NULL at least one should be non-empty */ + rd_assert(SubscribedTopicNames == NULL || SubscribedTopicNamesCnt > 0 || + RD_KAFKAP_STR_LEN(SubscribedTopicRegex) > 0); +} + +/** + * @brief Set the subscribed topics for the member \p member based on \p + * SubscribedTopicNames and \p SubscribedTopicRegex. Deduplicates the list after + * sorting it. + * @return `rd_true` if the subscription was changed, that happens + * if it's set and different from previous one. + * + * @locks mcluster->lock MUST be held. + */ +static rd_bool_t rd_kafka_mock_cgrp_consumer_member_subscribed_topic_names_set( + rd_kafka_mock_cgrp_consumer_member_t *member, + rd_kafkap_str_t *SubscribedTopicNames, + int32_t SubscribedTopicNamesCnt, + const rd_kafkap_str_t *SubscribedTopicRegex) { + rd_bool_t changed = rd_false; + rd_list_t *new_subscription; + int32_t i; + + validate_subscription(SubscribedTopicNames, SubscribedTopicNamesCnt, + SubscribedTopicRegex); + + if (!SubscribedTopicNames && + RD_KAFKAP_STR_IS_NULL(SubscribedTopicRegex) && + !member->subscribed_topic_regex) { + /* When client is sending NULL for SubscribedTopicNames and + * SubscribedTopicRegex, its subscription didn't change. If we + * already had a regex, we need to compute the regex again. 
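+ * (A regex subscription must be re-evaluated because matching + * topics may have been created since it was last evaluated.)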
*/ + return changed; + } + + if (SubscribedTopicNames) { + RD_IF_FREE(member->subscribed_topic_names, rd_list_destroy); + member->subscribed_topic_names = + rd_list_new(SubscribedTopicNamesCnt, rd_free); + for (i = 0; i < SubscribedTopicNamesCnt; i++) { + rd_list_add( + member->subscribed_topic_names, + RD_KAFKAP_STR_DUP(&SubscribedTopicNames[i])); + } + } + + if (!RD_KAFKAP_STR_IS_NULL(SubscribedTopicRegex)) { + RD_IF_FREE(member->subscribed_topic_regex, rd_free); + member->subscribed_topic_regex = + RD_KAFKAP_STR_DUP(SubscribedTopicRegex); + } + + new_subscription = + rd_list_new(rd_list_cnt(member->subscribed_topic_names), rd_free); + + rd_list_copy_to(new_subscription, member->subscribed_topic_names, + rd_list_string_copy, NULL); + + if (member->subscribed_topic_regex[0]) { + rd_kafka_mock_cluster_t *mcluster = member->mcgrp->cluster; + rd_kafka_mock_topic_t *mtopic; + char errstr[1]; + rd_regex_t *re = rd_regex_comp(member->subscribed_topic_regex, + errstr, sizeof(errstr)); + + TAILQ_FOREACH(mtopic, &mcluster->topics, link) { + if (rd_regex_exec(re, mtopic->name)) + rd_list_add(new_subscription, + rd_strdup(mtopic->name)); + } + + rd_regex_destroy(re); + } + + rd_list_deduplicate(&new_subscription, rd_strcmp2); + + if (!member->subscribed_topics || + rd_list_cmp(new_subscription, member->subscribed_topics, + rd_list_cmp_str)) { + if (member->subscribed_topics) + rd_list_destroy(member->subscribed_topics); + member->subscribed_topics = + rd_list_copy(new_subscription, rd_list_string_copy, NULL); + changed = rd_true; + } + rd_list_destroy(new_subscription); + return changed; +} + +static void rd_kafka_mock_cgrp_consumer_member_topic_id_set( + rd_kafka_mock_cgrp_consumer_member_t *member, + const rd_kafkap_str_t *MemberId) { + /* KIP 1082: MemberId is generated by the client */ + rd_assert(RD_KAFKAP_STR_LEN(MemberId) > 0); + RD_IF_FREE(member->id, rd_free); + member->id = RD_KAFKAP_STR_DUP(MemberId); +} + +/** + * @brief Adds a member to consumer group \p mcgrp. If member with same + * \p MemberId is already present, only updates the connection and + * sets it as active. + * + * @param mcgrp Consumer group to add the member to. + * @param conn Member connection. + * @param MemberId Member id. + * @param InstanceId Group instance id (optional). + * @param session_timeout_ms Session timeout to use. + * @param SubscribedTopicNames Array of subscribed topics. + * Mandatory if the member is a new one. + * @param SubscribedTopicNamesCnt Number of elements in \p SubscribedTopicNames. + * @param SubscribedTopicRegex Subscribed topic regex. + * + * @return New or existing member, NULL if the member cannot be added. + * + * @locks mcluster->lock MUST be held. 
+ */ +rd_kafka_mock_cgrp_consumer_member_t *rd_kafka_mock_cgrp_consumer_member_add( + rd_kafka_mock_cgrp_consumer_t *mcgrp, + struct rd_kafka_mock_connection_s *conn, + const rd_kafkap_str_t *MemberId, + const rd_kafkap_str_t *InstanceId, + rd_kafkap_str_t *SubscribedTopicNames, + int32_t SubscribedTopicNamesCnt, + const rd_kafkap_str_t *SubscribedTopicRegex) { + rd_kafka_mock_cgrp_consumer_member_t *member = NULL; + rd_bool_t changed = rd_false; + + /* Find member */ + member = rd_kafka_mock_cgrp_consumer_member_find(mcgrp, MemberId); + if (!member) { + member = rd_kafka_mock_cgrp_consumer_member_find_by_instance_id( + mcgrp, InstanceId); + + if (member) { + if (!member->left_static_membership) { + /* Old member still active, + * fence this one */ + return NULL; + } + + if (rd_kafkap_str_cmp_str(MemberId, member->id) != 0) { + /* Member is a new instance and is rejoining + * with a new MemberId. */ + rd_kafka_mock_cgrp_consumer_member_topic_id_set( + member, MemberId); + } + member->left_static_membership = rd_false; + } + } else { + member->left_static_membership = rd_false; + } + + if (!member) { + validate_subscription(SubscribedTopicNames, + SubscribedTopicNamesCnt, + SubscribedTopicRegex); + + /* In case of session timeout + * where the member isn't aware it's been fenced. */ + if (SubscribedTopicNames == NULL) + return NULL; + + /* Not found, add member */ + member = rd_calloc(1, sizeof(*member)); + member->mcgrp = mcgrp; + + rd_kafka_mock_cgrp_consumer_member_topic_id_set(member, + MemberId); + + if (!RD_KAFKAP_STR_IS_NULL(InstanceId)) + member->instance_id = RD_KAFKAP_STR_DUP(InstanceId); + + TAILQ_INSERT_TAIL(&mcgrp->members, member, link); + mcgrp->member_cnt++; + changed = rd_true; + member->target_member_epoch = mcgrp->group_epoch; + } + + changed |= + rd_kafka_mock_cgrp_consumer_member_subscribed_topic_names_set( + member, SubscribedTopicNames, SubscribedTopicNamesCnt, + SubscribedTopicRegex); + + mcgrp->session_timeout_ms = + mcgrp->cluster->defaults.group_consumer_session_timeout_ms; + mcgrp->heartbeat_interval_ms = + mcgrp->cluster->defaults.group_consumer_heartbeat_interval_ms; + + member->conn = conn; + + rd_kafka_mock_cgrp_consumer_member_active(mcgrp, member); + + if (changed) + rd_kafka_mock_cgrp_consumer_target_assignment_recalculate( + mcgrp); + + return member; +} + +/** + * @brief Destroys a consumer group member, removing from its consumer group. + * + * @param mcgrp Member consumer group. + * @param member Member to destroy. + * + * @locks mcluster->lock MUST be held. 
+ */ +static void rd_kafka_mock_cgrp_consumer_member_destroy( + rd_kafka_mock_cgrp_consumer_t *mcgrp, + rd_kafka_mock_cgrp_consumer_member_t *member) { + rd_assert(mcgrp->member_cnt > 0); + TAILQ_REMOVE(&mcgrp->members, member, link); + mcgrp->member_cnt--; + + rd_kafka_mock_cgrp_consumer_target_assignment_recalculate(mcgrp); + + rd_free(member->id); + + if (member->instance_id) + rd_free(member->instance_id); + + RD_IF_FREE(member->target_assignment, + rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(member->current_assignment, + rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(member->returned_assignment, + rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(member->subscribed_topics, rd_list_destroy_free); + + RD_IF_FREE(member->subscribed_topic_names, rd_list_destroy_free); + + RD_IF_FREE(member->subscribed_topic_regex, rd_free); + + rd_free(member); +} + +static void rd_kafka_mock_cgrp_consumer_member_leave_static( + rd_kafka_mock_cgrp_consumer_member_t *member) { + member->left_static_membership = rd_true; + rd_kafka_mock_cgrp_consumer_member_returned_assignment_set(member, + NULL); +} + + +/** + * @brief Called when a member must leave a consumer group. + * + * @param mcgrp Consumer group to leave. + * @param member Member that leaves. + * @param leave_static If true, the member is leaving with static group + * membership. + * + * @locks mcluster->lock MUST be held. + */ +void rd_kafka_mock_cgrp_consumer_member_leave( + rd_kafka_mock_cgrp_consumer_t *mcgrp, + rd_kafka_mock_cgrp_consumer_member_t *member, + rd_bool_t leave_static) { + rd_bool_t is_static = member->instance_id != NULL; + + rd_kafka_dbg(mcgrp->cluster->rk, MOCK, "MOCK", + "Member %s is leaving group %s, is static: %s, " + "static leave: %s", + member->id, mcgrp->id, RD_STR_ToF(is_static), + RD_STR_ToF(leave_static)); + if (!is_static || !leave_static) + rd_kafka_mock_cgrp_consumer_member_destroy(mcgrp, member); + else + rd_kafka_mock_cgrp_consumer_member_leave_static(member); +} + +/** + * @brief Called when a member is fenced from a consumer group. + * + * @param mcgrp Consumer group. + * @param member Member to fence. + * + * @locks mcluster->lock MUST be held. + */ +void rd_kafka_mock_cgrp_consumer_member_fenced( + rd_kafka_mock_cgrp_consumer_t *mcgrp, + rd_kafka_mock_cgrp_consumer_member_t *member) { + + rd_kafka_dbg(mcgrp->cluster->rk, MOCK, "MOCK", + "Member %s is fenced from group %s", member->id, + mcgrp->id); + + rd_kafka_mock_cgrp_consumer_member_destroy(mcgrp, member); +} + +/** + * @brief Find a consumer group in cluster \p mcluster by \p GroupId. + * + * @param mcluster Cluster to search in. + * @param GroupId Group id to search. + * @return Found group or NULL. + * + * @locks mcluster->lock MUST be held. + */ +rd_kafka_mock_cgrp_consumer_t * +rd_kafka_mock_cgrp_consumer_find(const rd_kafka_mock_cluster_t *mcluster, + const rd_kafkap_str_t *GroupId) { + rd_kafka_mock_cgrp_consumer_t *mcgrp; + TAILQ_FOREACH(mcgrp, &mcluster->cgrps_consumer, link) { + if (!rd_kafkap_str_cmp_str(GroupId, mcgrp->id)) + return mcgrp; + } + + return NULL; +} + +/** + * @brief Check if any members have exceeded the session timeout. + * + * @param rkts Timers. + * @param arg Consumer group. + * + * @locks mcluster->lock is acquired and released. 
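+ * + * Members whose last activity is older than the session timeout + * are fenced and removed from the group.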
*/ +static void rd_kafka_mock_cgrp_consumer_session_tmr_cb(rd_kafka_timers_t *rkts, + void *arg) { + rd_kafka_mock_cgrp_consumer_t *mcgrp = arg; + rd_kafka_mock_cgrp_consumer_member_t *member, *tmp; + rd_ts_t now = rd_clock(); + rd_kafka_mock_cluster_t *mcluster = mcgrp->cluster; + + mtx_lock(&mcluster->lock); + TAILQ_FOREACH_SAFE(member, &mcgrp->members, link, tmp) { + if (member->ts_last_activity + + (mcgrp->session_timeout_ms * 1000) > + now) + continue; + + rd_kafka_dbg(mcgrp->cluster->rk, MOCK, "MOCK", + "Member %s session timed out for group %s", + member->id, mcgrp->id); + + rd_kafka_mock_cgrp_consumer_member_fenced(mcgrp, member); + } + mtx_unlock(&mcluster->lock); +} + + +/** + * @brief Find or create a "consumer" consumer group. + * + * @param mcluster Cluster to search in. + * @param GroupId Group id to look for. + * @return Found or new consumer group. + * + * @locks mcluster->lock MUST be held. + */ +rd_kafka_mock_cgrp_consumer_t * +rd_kafka_mock_cgrp_consumer_get(rd_kafka_mock_cluster_t *mcluster, + const rd_kafkap_str_t *GroupId) { + rd_kafka_mock_cgrp_consumer_t *mcgrp; + + mcgrp = rd_kafka_mock_cgrp_consumer_find(mcluster, GroupId); + if (mcgrp) + return mcgrp; + + mcgrp = rd_calloc(1, sizeof(*mcgrp)); + mcgrp->cluster = mcluster; + mcgrp->id = RD_KAFKAP_STR_DUP(GroupId); + mcgrp->group_epoch = 1; + TAILQ_INIT(&mcgrp->members); + rd_kafka_timer_start(&mcluster->timers, &mcgrp->session_tmr, + 1000 * 1000 /*1s*/, + rd_kafka_mock_cgrp_consumer_session_tmr_cb, mcgrp); + + TAILQ_INSERT_TAIL(&mcluster->cgrps_consumer, mcgrp, link); + + return mcgrp; +} + + +void rd_kafka_mock_cgrp_consumer_target_assignment( + rd_kafka_mock_cluster_t *mcluster, + const char *group_id, + rd_kafka_mock_cgrp_consumer_target_assignment_t *target_assignment) { + rd_kafka_mock_cgrp_consumer_t *mcgrp; + rd_kafkap_str_t *group_id_str = + rd_kafkap_str_new(group_id, strlen(group_id)); + + mtx_lock(&mcluster->lock); + + mcgrp = rd_kafka_mock_cgrp_consumer_find(mcluster, group_id_str); + if (!mcgrp) + goto destroy; + + rd_kafka_mock_cgrp_consumer_target_assignment_set_manual( + mcgrp, target_assignment); + +destroy: + rd_kafkap_str_destroy(group_id_str); + mtx_unlock(&mcluster->lock); +} + +void rd_kafka_mock_set_group_consumer_session_timeout_ms( + rd_kafka_mock_cluster_t *mcluster, + int group_consumer_session_timeout_ms) { + mtx_lock(&mcluster->lock); + mcluster->defaults.group_consumer_session_timeout_ms = + group_consumer_session_timeout_ms; + mtx_unlock(&mcluster->lock); +} + +void rd_kafka_mock_set_group_consumer_heartbeat_interval_ms( + rd_kafka_mock_cluster_t *mcluster, + int group_consumer_heartbeat_interval_ms) { + mtx_lock(&mcluster->lock); + mcluster->defaults.group_consumer_heartbeat_interval_ms = + group_consumer_heartbeat_interval_ms; + mtx_unlock(&mcluster->lock); +} + +/** + * @brief A client connection closed, check if any consumer cgrp has any state + * for this connection that needs to be cleared. + * + * @param mcluster Cluster to search in. + * @param mconn Connection that was closed. + * + * @locks mcluster->lock MUST be held.
+ */ +void rd_kafka_mock_cgrps_consumer_connection_closed( + rd_kafka_mock_cluster_t *mcluster, + rd_kafka_mock_connection_t *mconn) { + rd_kafka_mock_cgrp_consumer_t *mcgrp; + + TAILQ_FOREACH(mcgrp, &mcluster->cgrps_consumer, link) { + rd_kafka_mock_cgrp_consumer_member_t *member, *tmp; + TAILQ_FOREACH_SAFE(member, &mcgrp->members, link, tmp) { + if (member->conn == mconn) { + member->conn = NULL; + rd_kafka_mock_cgrp_consumer_member_returned_assignment_set( + member, NULL); + rd_kafka_mock_cgrp_consumer_member_current_assignment_set( + member, NULL); + } + } + } +} + +/** + * @brief Destroys consumer group \p mcgrp and all of its members. + * + * @param mcgrp Consumer group to destroy. + * + * @locks mcluster->lock MUST be held. + */ +void rd_kafka_mock_cgrp_consumer_destroy(rd_kafka_mock_cgrp_consumer_t *mcgrp) { + rd_kafka_mock_cgrp_consumer_member_t *member; + + TAILQ_REMOVE(&mcgrp->cluster->cgrps_consumer, mcgrp, link); + + rd_kafka_timer_stop(&mcgrp->cluster->timers, &mcgrp->session_tmr, + rd_true); + rd_free(mcgrp->id); + while ((member = TAILQ_FIRST(&mcgrp->members))) + rd_kafka_mock_cgrp_consumer_member_destroy(mcgrp, member); + rd_free(mcgrp); +} + +/** + * @brief A client connection closed, check if any cgrp has any state + * for this connection that needs to be cleared. + * + * @param mcluster Mock cluster. + * @param mconn Connection that was closed. + */ +void rd_kafka_mock_cgrps_connection_closed(rd_kafka_mock_cluster_t *mcluster, + rd_kafka_mock_connection_t *mconn) { + rd_kafka_mock_cgrps_classic_connection_closed(mcluster, mconn); + rd_kafka_mock_cgrps_consumer_connection_closed(mcluster, mconn); +} diff --git a/src/third_party/librdkafka/dist/src/rdkafka_mock_handlers.c b/src/third_party/librdkafka/dist/src/rdkafka_mock_handlers.c index 835f4a9731a..17c21dc2faa 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_mock_handlers.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_mock_handlers.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill, + * 2023, Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -38,9 +39,62 @@ #include "rdkafka_mock_int.h" #include "rdkafka_transport_int.h" #include "rdkafka_offset.h" +#include "rdkafka_telemetry_decode.h" +void rd_kafka_mock_Produce_reply_tags_partition_write( + rd_kafka_buf_t *rkbuf, + int tagtype, + rd_kafka_mock_partition_t *mpart) { + switch (tagtype) { + case 0: /* CurrentLeader */ + /* Leader id */ + rd_kafka_buf_write_i32(rkbuf, mpart->leader->id); + /* Leader epoch */ + rd_kafka_buf_write_i32(rkbuf, mpart->leader_epoch); + /* Field tags */ + rd_kafka_buf_write_tags_empty(rkbuf); + break; + default: + break; + } +} + +void rd_kafka_mock_Produce_reply_tags_write( + rd_kafka_buf_t *rkbuf, + int tagtype, + rd_kafka_mock_broker_t **changed_leaders, + int changed_leader_cnt) { + int i; + switch (tagtype) { + case 0: /* NodeEndpoints */ + /* #NodeEndpoints */ + rd_kafka_buf_write_arraycnt(rkbuf, changed_leader_cnt); + for (i = 0; i < changed_leader_cnt; i++) { + rd_kafka_mock_broker_t *changed_leader = + changed_leaders[i]; + /* Leader id */ + rd_kafka_buf_write_i32(rkbuf, changed_leader->id); + /* Leader Hostname */ + rd_kafka_buf_write_str( + rkbuf, changed_leader->advertised_listener, -1); + + /* Leader Port number */ + rd_kafka_buf_write_i32(rkbuf, + (int32_t)changed_leader->port); + + /* Leader Rack */ + rd_kafka_buf_write_str(rkbuf, changed_leader->rack, -1); + + /* Field tags */ + rd_kafka_buf_write_tags_empty(rkbuf); + } + default: + break; + } +} + /** * @brief Handle ProduceRequest */ @@ -54,16 +108,23 @@ static int rd_kafka_mock_handle_Produce(rd_kafka_mock_connection_t *mconn, int16_t Acks; int32_t TimeoutMs; rd_kafka_resp_err_t all_err; + int32_t tags_to_write[1] = {0}; + size_t tags_to_write_cnt = 0; + int changed_leaders_cnt = 0; + rd_kafka_mock_broker_t **changed_leaders = + rd_calloc(mcluster->broker_cnt, sizeof(*changed_leaders)); + if (rkbuf->rkbuf_reqhdr.ApiVersion >= 3) rd_kafka_buf_read_str(rkbuf, &TransactionalId); rd_kafka_buf_read_i16(rkbuf, &Acks); rd_kafka_buf_read_i32(rkbuf, &TimeoutMs); - rd_kafka_buf_read_i32(rkbuf, &TopicsCnt); + /* #Topics */ + rd_kafka_buf_read_arraycnt(rkbuf, &TopicsCnt, RD_KAFKAP_TOPICS_MAX); /* Response: #Topics */ - rd_kafka_buf_write_i32(resp, TopicsCnt); + rd_kafka_buf_write_arraycnt(resp, TopicsCnt); /* Inject error, if any */ all_err = rd_kafka_mock_next_request_error(mconn, resp); @@ -74,14 +135,14 @@ static int rd_kafka_mock_handle_Produce(rd_kafka_mock_connection_t *mconn, rd_kafka_mock_topic_t *mtopic; rd_kafka_buf_read_str(rkbuf, &Topic); - rd_kafka_buf_read_i32(rkbuf, &PartitionCnt); - + rd_kafka_buf_read_arraycnt(rkbuf, &PartitionCnt, + RD_KAFKAP_PARTITIONS_MAX); mtopic = rd_kafka_mock_topic_find_by_kstr(mcluster, &Topic); /* Response: Topic */ rd_kafka_buf_write_kstr(resp, &Topic); /* Response: #Partitions */ - rd_kafka_buf_write_i32(resp, PartitionCnt); + rd_kafka_buf_write_arraycnt(resp, PartitionCnt); while (PartitionCnt-- > 0) { int32_t Partition; @@ -89,6 +150,8 @@ static int rd_kafka_mock_handle_Produce(rd_kafka_mock_connection_t *mconn, rd_kafkap_bytes_t records; rd_kafka_resp_err_t err = RD_KAFKA_RESP_ERR_NO_ERROR; int64_t BaseOffset = -1; + int32_t partition_tags_to_write[1] = {0}; + size_t partition_tags_to_write_cnt = 0; rd_kafka_buf_read_i32(rkbuf, &Partition); @@ -96,8 +159,9 @@ static int rd_kafka_mock_handle_Produce(rd_kafka_mock_connection_t *mconn, mpart = rd_kafka_mock_partition_find(mtopic, Partition); - rd_kafka_buf_read_bytes(rkbuf, &records); - + rd_kafka_buf_read_kbytes(rkbuf, 
&records); + /* Partition Tags */ + rd_kafka_buf_skip_tags(rkbuf); /* Response: Partition */ rd_kafka_buf_write_i32(resp, Partition); @@ -145,7 +209,54 @@ static int rd_kafka_mock_handle_Produce(rd_kafka_mock_connection_t *mconn, resp, mpart->start_offset); } } + + if (rkbuf->rkbuf_reqhdr.ApiVersion >= 8) { + /* Response: #RecordErrors + * TODO: Add support for injecting RecordErrors + * 0 record errors for now */ + rd_kafka_buf_write_arraycnt(resp, 0); + + /* Response: ErrorMessage */ + rd_kafka_buf_write_str(resp, NULL, 0); + } + + if (rkbuf->rkbuf_reqhdr.ApiVersion >= 10 && + err == RD_KAFKA_RESP_ERR_NOT_LEADER_FOR_PARTITION) { + int changed_leader_idx; + /* See if this leader is already included */ + for (changed_leader_idx = 0; + changed_leader_idx < changed_leaders_cnt; + changed_leader_idx++) { + if (changed_leaders[changed_leader_idx] + ->id == mpart->leader->id) + break; + } + if (changed_leader_idx == changed_leaders_cnt) { + /* Add the new leader that wasn't + * present */ + changed_leaders[changed_leaders_cnt] = + mpart->leader; + changed_leaders_cnt++; + } + + partition_tags_to_write + [partition_tags_to_write_cnt] = + 0 /* CurrentLeader */; + partition_tags_to_write_cnt++; + } + + /* Response: Partition tags */ + rd_kafka_buf_write_tags( + resp, + rd_kafka_mock_Produce_reply_tags_partition_write, + partition_tags_to_write, + partition_tags_to_write_cnt, mpart); } + + /* Topic tags */ + rd_kafka_buf_skip_tags(rkbuf); + /* Response: Topic tags */ + rd_kafka_buf_write_tags_empty(resp); } if (rkbuf->rkbuf_reqhdr.ApiVersion >= 1) { @@ -153,15 +264,90 @@ static int rd_kafka_mock_handle_Produce(rd_kafka_mock_connection_t *mconn, rd_kafka_buf_write_i32(resp, 0); } - rd_kafka_mock_connection_send_response(mconn, resp); + /* Response: Top level tags */ + if (changed_leaders_cnt) { + tags_to_write[tags_to_write_cnt] = 0 /* NodeEndpoints */; + tags_to_write_cnt++; + } + rd_kafka_buf_write_tags(resp, rd_kafka_mock_Produce_reply_tags_write, + tags_to_write, tags_to_write_cnt, + changed_leaders, changed_leaders_cnt); + + rd_kafka_mock_connection_send_response0(mconn, resp, rd_true); + rd_free(changed_leaders); return 0; err_parse: + rd_free(changed_leaders); rd_kafka_buf_destroy(resp); return -1; } +void rd_kafka_mock_Fetch_reply_tags_partition_write( + rd_kafka_buf_t *rkbuf, + int tagtype, + rd_kafka_mock_partition_t *mpart) { + switch (tagtype) { + case 1: /* CurrentLeader */ + { + int32_t leader_id = mpart->leader->id, + leader_epoch = mpart->leader_epoch; + rd_kafka_mock_partition_leader_t *mpart_leader = + rd_kafka_mock_partition_next_leader_response(mpart); + if (mpart_leader) { + leader_id = mpart_leader->leader_id; + leader_epoch = mpart_leader->leader_epoch; + rd_kafka_mock_partition_leader_destroy(mpart, + mpart_leader); + } + + /* Leader id */ + rd_kafka_buf_write_i32(rkbuf, leader_id); + /* Leader epoch */ + rd_kafka_buf_write_i32(rkbuf, leader_epoch); + /* Field tags */ + rd_kafka_buf_write_tags_empty(rkbuf); + break; + } + default: + break; + } +} + +void rd_kafka_mock_Fetch_reply_tags_write( + rd_kafka_buf_t *rkbuf, + int tagtype, + rd_kafka_mock_broker_t **changed_leaders, + int changed_leader_cnt) { + int i; + switch (tagtype) { + case 0: /* NodeEndpoints */ + /* #NodeEndpoints */ + rd_kafka_buf_write_arraycnt(rkbuf, changed_leader_cnt); + for (i = 0; i < changed_leader_cnt; i++) { + rd_kafka_mock_broker_t *changed_leader = + changed_leaders[i]; + /* Leader id */ + rd_kafka_buf_write_i32(rkbuf, changed_leader->id); + /* Leader Hostname */ + rd_kafka_buf_write_str( + rkbuf, 
changed_leader->advertised_listener, -1); + + /* Leader Port number */ + rd_kafka_buf_write_i32(rkbuf, + (int32_t)changed_leader->port); + + /* Leader Rack */ + rd_kafka_buf_write_str(rkbuf, changed_leader->rack, -1); + + /* Field tags */ + rd_kafka_buf_write_tags_empty(rkbuf); + } + default: + break; + } +} /** @@ -173,12 +359,21 @@ static int rd_kafka_mock_handle_Fetch(rd_kafka_mock_connection_t *mconn, rd_kafka_mock_cluster_t *mcluster = mconn->broker->cluster; rd_kafka_buf_t *resp = rd_kafka_mock_buf_new_response(rkbuf); rd_kafka_resp_err_t all_err; - int32_t ReplicaId, MaxWait, MinBytes, MaxBytes = -1, SessionId = -1, - Epoch, TopicsCnt; + int32_t ReplicaId = -1, MaxWait, MinBytes, MaxBytes = -1, + SessionId = -1, Epoch, TopicsCnt; int8_t IsolationLevel; size_t totsize = 0; - rd_kafka_buf_read_i32(rkbuf, &ReplicaId); + int32_t tags_to_write[1] = {0}; + uint64_t tags_to_write_cnt = 0; + + int changed_leaders_cnt = 0; + rd_kafka_mock_broker_t **changed_leaders = + rd_calloc(mcluster->broker_cnt, sizeof(*changed_leaders)); + + if (rkbuf->rkbuf_reqhdr.ApiVersion <= 14) { + rd_kafka_buf_read_i32(rkbuf, &ReplicaId); + } rd_kafka_buf_read_i32(rkbuf, &MaxWait); rd_kafka_buf_read_i32(rkbuf, &MinBytes); if (rkbuf->rkbuf_reqhdr.ApiVersion >= 3) @@ -207,34 +402,56 @@ static int rd_kafka_mock_handle_Fetch(rd_kafka_mock_connection_t *mconn, rd_kafka_buf_write_i32(resp, SessionId); } - rd_kafka_buf_read_i32(rkbuf, &TopicsCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &TopicsCnt, RD_KAFKAP_TOPICS_MAX); /* Response: #Topics */ - rd_kafka_buf_write_i32(resp, TopicsCnt); + rd_kafka_buf_write_arraycnt(resp, TopicsCnt); while (TopicsCnt-- > 0) { - rd_kafkap_str_t Topic; + rd_kafkap_str_t Topic = RD_KAFKAP_STR_INITIALIZER; + rd_kafka_Uuid_t TopicId = RD_KAFKA_UUID_ZERO; int32_t PartitionCnt; rd_kafka_mock_topic_t *mtopic; + rd_bool_t find_topic_by_id = rd_true; - rd_kafka_buf_read_str(rkbuf, &Topic); - rd_kafka_buf_read_i32(rkbuf, &PartitionCnt); + if (rkbuf->rkbuf_reqhdr.ApiVersion <= 12) { + rd_kafka_buf_read_str(rkbuf, &Topic); + find_topic_by_id = rd_false; + } - mtopic = rd_kafka_mock_topic_find_by_kstr(mcluster, &Topic); + if (rkbuf->rkbuf_reqhdr.ApiVersion >= 13) { + rd_kafka_buf_read_uuid(rkbuf, &TopicId); + } + + rd_kafka_buf_read_arraycnt(rkbuf, &PartitionCnt, + RD_KAFKAP_PARTITIONS_MAX); + + if (find_topic_by_id) { + mtopic = + rd_kafka_mock_topic_find_by_id(mcluster, TopicId); + /* Response: TopicId */ + rd_kafka_buf_write_uuid(resp, &TopicId); + } else { + mtopic = + rd_kafka_mock_topic_find_by_kstr(mcluster, &Topic); + /* Response: Topic */ + rd_kafka_buf_write_kstr(resp, &Topic); + } - /* Response: Topic */ - rd_kafka_buf_write_kstr(resp, &Topic); /* Response: #Partitions */ - rd_kafka_buf_write_i32(resp, PartitionCnt); + rd_kafka_buf_write_arraycnt(resp, PartitionCnt); while (PartitionCnt-- > 0) { - int32_t Partition, CurrentLeaderEpoch, PartMaxBytes; + int32_t Partition, CurrentLeaderEpoch = -1, + LastFetchedEpoch = -1, PartMaxBytes; int64_t FetchOffset, LogStartOffset; rd_kafka_mock_partition_t *mpart = NULL; rd_kafka_resp_err_t err = all_err; rd_bool_t on_follower; - size_t partsize = 0; - const rd_kafka_mock_msgset_t *mset = NULL; + size_t partsize = 0; + const rd_kafka_mock_msgset_t *mset = NULL; + int32_t partition_tags_to_write[1] = {0}; + uint64_t partition_tags_to_write_cnt = 0; rd_kafka_buf_read_i32(rkbuf, &Partition); @@ -244,14 +461,22 @@ static int rd_kafka_mock_handle_Fetch(rd_kafka_mock_connection_t *mconn, rd_kafka_buf_read_i64(rkbuf, &FetchOffset); + if 
(rkbuf->rkbuf_reqhdr.ApiVersion >= 12) + rd_kafka_buf_read_i32(rkbuf, &LastFetchedEpoch); + if (rkbuf->rkbuf_reqhdr.ApiVersion >= 5) rd_kafka_buf_read_i64(rkbuf, &LogStartOffset); rd_kafka_buf_read_i32(rkbuf, &PartMaxBytes); + /* Partition tags */ + rd_kafka_buf_skip_tags(rkbuf); + if (mtopic) mpart = rd_kafka_mock_partition_find(mtopic, Partition); + else if (find_topic_by_id) + err = RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_ID; /* Response: Partition */ rd_kafka_buf_write_i32(resp, Partition); @@ -261,25 +486,42 @@ static int rd_kafka_mock_handle_Fetch(rd_kafka_mock_connection_t *mconn, on_follower = mpart && mpart->follower_id == mconn->broker->id; - if (!all_err && !mpart) - err = RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART; - else if (!all_err && mpart->leader != mconn->broker && - !on_follower) + if (!err) { + if (!all_err && !mpart) + err = + RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART; + else if (!all_err && + mpart->leader != mconn->broker && + !on_follower) + err = + RD_KAFKA_RESP_ERR_NOT_LEADER_OR_FOLLOWER; + } + + if (!err && mpart) err = - RD_KAFKA_RESP_ERR_NOT_LEADER_FOR_PARTITION; + rd_kafka_mock_partition_leader_epoch_check( + mpart, CurrentLeaderEpoch); /* Find MessageSet for FetchOffset */ if (!err && FetchOffset != mpart->end_offset) { - if (on_follower && - FetchOffset <= mpart->end_offset && - FetchOffset > mpart->follower_end_offset) - err = - RD_KAFKA_RESP_ERR_OFFSET_NOT_AVAILABLE; - else if (!(mset = rd_kafka_mock_msgset_find( - mpart, FetchOffset, - on_follower))) + /* Kafka currently only returns + * OFFSET_NOT_AVAILABLE + * in ListOffsets calls */ + if (!(mset = rd_kafka_mock_msgset_find( + mpart, FetchOffset, on_follower))) err = RD_KAFKA_RESP_ERR_OFFSET_OUT_OF_RANGE; + rd_kafka_dbg( + mcluster->rk, MOCK, "MOCK", + "Topic %.*s [%" PRId32 + "] fetch err %s for offset %" PRId64 + " mset %p, on_follower %d, " + "start %" PRId64 ", end_offset %" PRId64 + ", current epoch %" PRId32, + RD_KAFKAP_STR_PR(&Topic), Partition, + rd_kafka_err2name(err), FetchOffset, mset, + on_follower, mpart->start_offset, + mpart->end_offset, mpart->leader_epoch); } @@ -311,7 +553,7 @@ static int rd_kafka_mock_handle_Fetch(rd_kafka_mock_connection_t *mconn, if (rkbuf->rkbuf_reqhdr.ApiVersion >= 4) { /* Response: #Aborted */ - rd_kafka_buf_write_i32(resp, 0); + rd_kafka_buf_write_arraycnt(resp, 0); } @@ -338,30 +580,75 @@ static int rd_kafka_mock_handle_Fetch(rd_kafka_mock_connection_t *mconn, if (mset && partsize < (size_t)PartMaxBytes && totsize < (size_t)MaxBytes) { /* Response: Records */ - rd_kafka_buf_write_kbytes(resp, &mset->bytes); - partsize += RD_KAFKAP_BYTES_SIZE(&mset->bytes); - totsize += RD_KAFKAP_BYTES_SIZE(&mset->bytes); + size_t written = rd_kafka_buf_write_kbytes( + resp, &mset->bytes); + partsize += written; + totsize += written; /* FIXME: Multiple messageSets ? 
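* (only a single msgset is written per partition for now)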
*/ } else { /* Empty Response: Records: Null */ - rd_kafka_buf_write_i32(resp, 0); + rd_kafka_buf_write_arraycnt(resp, 0); } + + if (rkbuf->rkbuf_reqhdr.ApiVersion >= 12 && + err == RD_KAFKA_RESP_ERR_NOT_LEADER_OR_FOLLOWER) { + int changed_leader_idx; + for (changed_leader_idx = 0; + changed_leader_idx < changed_leaders_cnt; + changed_leader_idx++) { + if (changed_leaders[changed_leader_idx] + ->id == mpart->leader->id) + break; + } + if (changed_leader_idx == changed_leaders_cnt) { + changed_leaders[changed_leaders_cnt] = + mpart->leader; + changed_leaders_cnt++; + } + /* CurrentLeader */ + partition_tags_to_write + [partition_tags_to_write_cnt] = 1; + partition_tags_to_write_cnt++; + } + + /* Response: Partition tags */ + rd_kafka_buf_write_tags( + resp, + rd_kafka_mock_Fetch_reply_tags_partition_write, + partition_tags_to_write, + partition_tags_to_write_cnt, mpart); } + + /* Topic tags */ + rd_kafka_buf_skip_tags(rkbuf); + /* Response: Topic tags */ + rd_kafka_buf_write_tags_empty(resp); } if (rkbuf->rkbuf_reqhdr.ApiVersion >= 7) { int32_t ForgottenTopicCnt; - rd_kafka_buf_read_i32(rkbuf, &ForgottenTopicCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &ForgottenTopicCnt, + RD_KAFKAP_TOPICS_MAX); while (ForgottenTopicCnt-- > 0) { - rd_kafkap_str_t Topic; + rd_kafkap_str_t Topic = RD_KAFKAP_STR_INITIALIZER; + rd_kafka_Uuid_t TopicId = RD_KAFKA_UUID_ZERO; int32_t ForgPartCnt; - rd_kafka_buf_read_str(rkbuf, &Topic); - rd_kafka_buf_read_i32(rkbuf, &ForgPartCnt); + if (rkbuf->rkbuf_reqhdr.ApiVersion <= 12) { + rd_kafka_buf_read_str(rkbuf, &Topic); + } + if (rkbuf->rkbuf_reqhdr.ApiVersion >= 13) { + rd_kafka_buf_read_uuid(rkbuf, &TopicId); + } + rd_kafka_buf_read_arraycnt(rkbuf, &ForgPartCnt, + RD_KAFKAP_PARTITIONS_MAX); while (ForgPartCnt-- > 0) { int32_t Partition; rd_kafka_buf_read_i32(rkbuf, &Partition); } + + /* ForgottenTopic tags */ + rd_kafka_buf_skip_tags(rkbuf); } } @@ -373,6 +660,16 @@ static int rd_kafka_mock_handle_Fetch(rd_kafka_mock_connection_t *mconn, /* Matt might do something sensible with this */ } + if (rkbuf->rkbuf_reqhdr.ApiVersion >= 16 && changed_leaders_cnt) { + tags_to_write[tags_to_write_cnt] = 0 /* NodeEndpoints */; + tags_to_write_cnt++; + } + + /* Response: Top level tags */ + rd_kafka_buf_write_tags(resp, rd_kafka_mock_Fetch_reply_tags_write, + tags_to_write, tags_to_write_cnt, + changed_leaders, changed_leaders_cnt); + /* If there was no data, delay up to MaxWait. 
* This isn't strictly correct since we should cut the wait short * and feed newly produced data if a producer writes to the @@ -381,12 +678,13 @@ static int rd_kafka_mock_handle_Fetch(rd_kafka_mock_connection_t *mconn, if (!totsize && MaxWait > 0) resp->rkbuf_ts_retry = rd_clock() + (MaxWait * 1000); - rd_kafka_mock_connection_send_response(mconn, resp); - + rd_kafka_mock_connection_send_response0(mconn, resp, rd_true); + rd_free(changed_leaders); return 0; err_parse: rd_kafka_buf_destroy(resp); + rd_free(changed_leaders); return -1; } @@ -417,10 +715,10 @@ static int rd_kafka_mock_handle_ListOffsets(rd_kafka_mock_connection_t *mconn, /* Inject error, if any */ all_err = rd_kafka_mock_next_request_error(mconn, resp); - rd_kafka_buf_read_i32(rkbuf, &TopicsCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &TopicsCnt, RD_KAFKAP_TOPICS_MAX); /* Response: #Topics */ - rd_kafka_buf_write_i32(resp, TopicsCnt); + rd_kafka_buf_write_arraycnt(resp, TopicsCnt); while (TopicsCnt-- > 0) { rd_kafkap_str_t Topic; @@ -428,18 +726,19 @@ static int rd_kafka_mock_handle_ListOffsets(rd_kafka_mock_connection_t *mconn, rd_kafka_mock_topic_t *mtopic; rd_kafka_buf_read_str(rkbuf, &Topic); - rd_kafka_buf_read_i32(rkbuf, &PartitionCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &PartitionCnt, + RD_KAFKAP_PARTITIONS_MAX); mtopic = rd_kafka_mock_topic_find_by_kstr(mcluster, &Topic); /* Response: Topic */ rd_kafka_buf_write_kstr(resp, &Topic); /* Response: #Partitions */ - rd_kafka_buf_write_i32(resp, PartitionCnt); + rd_kafka_buf_write_arraycnt(resp, PartitionCnt); while (PartitionCnt-- > 0) { - int32_t Partition, CurrentLeaderEpoch; - int64_t Timestamp, Offset = -1; + int32_t Partition, CurrentLeaderEpoch = -1; + int64_t Timestamp, Offset = -1; int32_t MaxNumOffsets; rd_kafka_mock_partition_t *mpart = NULL; rd_kafka_resp_err_t err = all_err; @@ -455,6 +754,9 @@ static int rd_kafka_mock_handle_ListOffsets(rd_kafka_mock_connection_t *mconn, if (rkbuf->rkbuf_reqhdr.ApiVersion == 0) rd_kafka_buf_read_i32(rkbuf, &MaxNumOffsets); + /* Partition tags */ + rd_kafka_buf_skip_tags(rkbuf); + if (mtopic) mpart = rd_kafka_mock_partition_find(mtopic, Partition); @@ -468,6 +770,10 @@ static int rd_kafka_mock_handle_ListOffsets(rd_kafka_mock_connection_t *mconn, err = RD_KAFKA_RESP_ERR_NOT_LEADER_FOR_PARTITION; + if (!err && mpart) + err = + rd_kafka_mock_partition_leader_epoch_check( + mpart, CurrentLeaderEpoch); /* Response: ErrorCode */ rd_kafka_buf_write_i16(resp, err); @@ -500,17 +806,44 @@ static int rd_kafka_mock_handle_ListOffsets(rd_kafka_mock_connection_t *mconn, if (rkbuf->rkbuf_reqhdr.ApiVersion >= 4) { /* Response: LeaderEpoch */ - rd_kafka_buf_write_i64(resp, -1); + const rd_kafka_mock_msgset_t *mset = NULL; + int32_t leader_epoch = -1; + rd_bool_t on_follower = rd_false; + + if (mpart) { + on_follower = + mpart && mpart->follower_id == + mconn->broker->id; + + if (Offset >= 0 && + (mset = rd_kafka_mock_msgset_find( + mpart, Offset, on_follower))) { + leader_epoch = + mset->leader_epoch; + } + } + + rd_kafka_buf_write_i32(resp, leader_epoch); } + /* Response: Partition tags */ + rd_kafka_buf_write_tags_empty(resp); + rd_kafka_dbg(mcluster->rk, MOCK, "MOCK", "Topic %.*s [%" PRId32 "] returning " - "offset %" PRId64 " for %s: %s", + "offset %" PRId64 " (leader epoch %" PRId32 + ") for %s: %s", RD_KAFKAP_STR_PR(&Topic), Partition, - Offset, rd_kafka_offset2str(Timestamp), + Offset, mpart ? 
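/* leader epoch of the partition, or -1 if the partition is unknown */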
mpart->leader_epoch : -1, + rd_kafka_offset2str(Timestamp), rd_kafka_err2str(err)); } + + /* Topic tags */ + rd_kafka_buf_skip_tags(rkbuf); + /* Response: Topic tags */ + rd_kafka_buf_write_tags_empty(resp); } @@ -550,13 +883,14 @@ static int rd_kafka_mock_handle_OffsetFetch(rd_kafka_mock_connection_t *mconn, mrkb = rd_kafka_mock_cluster_get_coord(mcluster, RD_KAFKA_COORD_GROUP, &GroupId); if (!mrkb && !all_err) - all_err = RD_KAFKA_RESP_ERR_NOT_COORDINATOR; + all_err = RD_KAFKA_RESP_ERR_NOT_COORDINATOR; // FIXME? check if + // its this mrkb? - rd_kafka_buf_read_i32(rkbuf, &TopicsCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &TopicsCnt, 100000); /* Response: #Topics */ - rd_kafka_buf_write_i32(resp, TopicsCnt); + rd_kafka_buf_write_arraycnt(resp, TopicsCnt); while (TopicsCnt-- > 0) { rd_kafkap_str_t Topic; @@ -564,14 +898,14 @@ static int rd_kafka_mock_handle_OffsetFetch(rd_kafka_mock_connection_t *mconn, rd_kafka_mock_topic_t *mtopic; rd_kafka_buf_read_str(rkbuf, &Topic); - rd_kafka_buf_read_i32(rkbuf, &PartitionCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &PartitionCnt, 100000); mtopic = rd_kafka_mock_topic_find_by_kstr(mcluster, &Topic); /* Response: Topic */ rd_kafka_buf_write_kstr(resp, &Topic); /* Response: #Partitions */ - rd_kafka_buf_write_i32(resp, PartitionCnt); + rd_kafka_buf_write_arraycnt(resp, PartitionCnt); while (PartitionCnt-- > 0) { int32_t Partition; @@ -600,7 +934,8 @@ static int rd_kafka_mock_handle_OffsetFetch(rd_kafka_mock_connection_t *mconn, if (rkbuf->rkbuf_reqhdr.ApiVersion >= 5) { /* Response: CommittedLeaderEpoch */ - rd_kafka_buf_write_i32(resp, -1); + rd_kafka_buf_write_i32( + resp, mpart ? mpart->leader_epoch : -1); } /* Response: Metadata */ @@ -610,6 +945,9 @@ static int rd_kafka_mock_handle_OffsetFetch(rd_kafka_mock_connection_t *mconn, /* Response: ErrorCode */ rd_kafka_buf_write_i16(resp, err); + /* Response: Struct tags */ + rd_kafka_buf_write_tags_empty(resp); + if (coff) rd_kafka_dbg(mcluster->rk, MOCK, "MOCK", "Topic %s [%" PRId32 @@ -629,6 +967,12 @@ static int rd_kafka_mock_handle_OffsetFetch(rd_kafka_mock_connection_t *mconn, RD_KAFKAP_STR_PR(&GroupId), rd_kafka_err2str(err)); } + + /* Request: Skip struct tags */ + rd_kafka_buf_skip_tags(rkbuf); + + /* Response: Struct tags */ + rd_kafka_buf_write_tags_empty(resp); } if (rkbuf->rkbuf_reqhdr.ApiVersion >= 2) { @@ -658,7 +1002,7 @@ static int rd_kafka_mock_handle_OffsetCommit(rd_kafka_mock_connection_t *mconn, rd_kafka_buf_t *resp = rd_kafka_mock_buf_new_response(rkbuf); rd_kafka_mock_broker_t *mrkb; rd_kafka_resp_err_t all_err; - int32_t GenerationId = -1, TopicsCnt; + int32_t GenerationIdOrMemberEpoch = -1, TopicsCnt; rd_kafkap_str_t GroupId, MemberId, GroupInstanceId; if (rkbuf->rkbuf_reqhdr.ApiVersion >= 3) { @@ -669,7 +1013,7 @@ static int rd_kafka_mock_handle_OffsetCommit(rd_kafka_mock_connection_t *mconn, rd_kafka_buf_read_str(rkbuf, &GroupId); if (rkbuf->rkbuf_reqhdr.ApiVersion >= 1) { - rd_kafka_buf_read_i32(rkbuf, &GenerationId); + rd_kafka_buf_read_i32(rkbuf, &GenerationIdOrMemberEpoch); rd_kafka_buf_read_str(rkbuf, &MemberId); } @@ -693,30 +1037,56 @@ static int rd_kafka_mock_handle_OffsetCommit(rd_kafka_mock_connection_t *mconn, if (!all_err) { - rd_kafka_mock_cgrp_t *mcgrp; + rd_kafka_mock_cgrp_classic_t *mcgrp_classic; - mcgrp = rd_kafka_mock_cgrp_find(mcluster, &GroupId); - if (mcgrp) { - rd_kafka_mock_cgrp_member_t *member = NULL; + mcgrp_classic = + rd_kafka_mock_cgrp_classic_find(mcluster, &GroupId); + if (mcgrp_classic) { + rd_kafka_mock_cgrp_classic_member_t *member = NULL; if 
(!RD_KAFKAP_STR_IS_NULL(&MemberId)) - member = rd_kafka_mock_cgrp_member_find( - mcgrp, &MemberId); + member = rd_kafka_mock_cgrp_classic_member_find( + mcgrp_classic, &MemberId); if (!member) all_err = RD_KAFKA_RESP_ERR_UNKNOWN_MEMBER_ID; else - all_err = rd_kafka_mock_cgrp_check_state( - mcgrp, member, rkbuf, GenerationId); + all_err = + rd_kafka_mock_cgrp_classic_check_state( + mcgrp_classic, member, rkbuf, + GenerationIdOrMemberEpoch); + } else { + rd_kafka_mock_cgrp_consumer_t *mcgrp_consumer; + rd_kafka_mock_cgrp_consumer_member_t *member = NULL; + + mcgrp_consumer = rd_kafka_mock_cgrp_consumer_find( + mcluster, &GroupId); + if (mcgrp_consumer) { + if (!RD_KAFKAP_STR_IS_NULL(&MemberId)) + member = + rd_kafka_mock_cgrp_consumer_member_find( + mcgrp_consumer, &MemberId); + + if (!member) + all_err = + RD_KAFKA_RESP_ERR_UNKNOWN_MEMBER_ID; + else + all_err = + GenerationIdOrMemberEpoch != + member->current_member_epoch + ? RD_KAFKA_RESP_ERR_STALE_MEMBER_EPOCH + : RD_KAFKA_RESP_ERR_NO_ERROR; + } } - /* FIXME: also check that partitions are assigned to member */ + /* As happens here, a real broker doesn't check that partitions + * are assigned to the member, but only the GenerationId. */ } - rd_kafka_buf_read_i32(rkbuf, &TopicsCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &TopicsCnt, RD_KAFKAP_TOPICS_MAX); /* Response: #Topics */ - rd_kafka_buf_write_i32(resp, TopicsCnt); + rd_kafka_buf_write_arraycnt(resp, TopicsCnt); while (TopicsCnt-- > 0) { rd_kafkap_str_t Topic; @@ -724,14 +1094,15 @@ static int rd_kafka_mock_handle_OffsetCommit(rd_kafka_mock_connection_t *mconn, rd_kafka_mock_topic_t *mtopic; rd_kafka_buf_read_str(rkbuf, &Topic); - rd_kafka_buf_read_i32(rkbuf, &PartitionCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &PartitionCnt, + RD_KAFKAP_PARTITIONS_MAX); mtopic = rd_kafka_mock_topic_find_by_kstr(mcluster, &Topic); /* Response: Topic */ rd_kafka_buf_write_kstr(resp, &Topic); /* Response: #Partitions */ - rd_kafka_buf_write_i32(resp, PartitionCnt); + rd_kafka_buf_write_arraycnt(resp, PartitionCnt); while (PartitionCnt-- > 0) { int32_t Partition; @@ -758,6 +1129,11 @@ static int rd_kafka_mock_handle_OffsetCommit(rd_kafka_mock_connection_t *mconn, int32_t CommittedLeaderEpoch; rd_kafka_buf_read_i32(rkbuf, &CommittedLeaderEpoch); + + if (!err && mpart) + err = + rd_kafka_mock_partition_leader_epoch_check( + mpart, CommittedLeaderEpoch); } if (rkbuf->rkbuf_reqhdr.ApiVersion == 1) { @@ -766,6 +1142,7 @@ static int rd_kafka_mock_handle_OffsetCommit(rd_kafka_mock_connection_t *mconn, } rd_kafka_buf_read_str(rkbuf, &Metadata); + rd_kafka_buf_skip_tags(rkbuf); if (!err) rd_kafka_mock_commit_offset(mpart, &GroupId, @@ -774,7 +1151,10 @@ static int rd_kafka_mock_handle_OffsetCommit(rd_kafka_mock_connection_t *mconn, /* Response: ErrorCode */ rd_kafka_buf_write_i16(resp, err); + rd_kafka_buf_write_tags_empty(resp); } + rd_kafka_buf_skip_tags(rkbuf); + rd_kafka_buf_write_tags_empty(resp); } rd_kafka_mock_connection_send_response(mconn, resp); @@ -801,14 +1181,17 @@ static int rd_kafka_mock_handle_ApiVersion(rd_kafka_mock_connection_t *mconn, * @param mtopic may be NULL */ static void -rd_kafka_mock_buf_write_Metadata_Topic(rd_kafka_buf_t *resp, +rd_kafka_mock_buf_write_Metadata_Topic(rd_kafka_mock_cluster_t *mcluster, + rd_kafka_buf_t *resp, int16_t ApiVersion, + rd_kafka_Uuid_t topic_id, const char *topic, const rd_kafka_mock_topic_t *mtopic, rd_kafka_resp_err_t err) { int i; int partition_cnt = - (!mtopic || err == RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART) + (!mtopic || err == 
RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART || + err == RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_ID) ? 0 : mtopic->partition_cnt; @@ -816,46 +1199,87 @@ rd_kafka_mock_buf_write_Metadata_Topic(rd_kafka_buf_t *resp, rd_kafka_buf_write_i16(resp, err); /* Response: Topics.Name */ rd_kafka_buf_write_str(resp, topic, -1); + + if (ApiVersion >= 10) { + /* Response: Topics.TopicId */ + rd_kafka_buf_write_uuid(resp, &topic_id); + } + if (ApiVersion >= 1) { /* Response: Topics.IsInternal */ rd_kafka_buf_write_bool(resp, rd_false); } /* Response: Topics.#Partitions */ - rd_kafka_buf_write_i32(resp, partition_cnt); + rd_kafka_buf_write_arraycnt(resp, partition_cnt); for (i = 0; mtopic && i < partition_cnt; i++) { - const rd_kafka_mock_partition_t *mpart = &mtopic->partitions[i]; + rd_kafka_mock_partition_leader_t *mpart_leader; + rd_kafka_mock_partition_t *mpart = &mtopic->partitions[i]; int r; /* Response: ..Partitions.ErrorCode */ rd_kafka_buf_write_i16(resp, 0); /* Response: ..Partitions.PartitionIndex */ rd_kafka_buf_write_i32(resp, mpart->id); - /* Response: ..Partitions.Leader */ - rd_kafka_buf_write_i32(resp, - mpart->leader ? mpart->leader->id : -1); - if (ApiVersion >= 7) { - /* Response: ..Partitions.LeaderEpoch */ - rd_kafka_buf_write_i32(resp, -1); + mpart_leader = + rd_kafka_mock_partition_next_leader_response(mpart); + if (mpart_leader) { + rd_kafka_dbg( + mcluster->rk, MOCK, "MOCK", + "MetadataRequest: using next leader response " + "(%" PRId32 ", %" PRId32 ")", + mpart_leader->leader_id, + mpart_leader->leader_epoch); + + /* Response: ..Partitions.Leader */ + rd_kafka_buf_write_i32(resp, mpart_leader->leader_id); + + if (ApiVersion >= 7) { + /* Response: ..Partitions.LeaderEpoch */ + rd_kafka_buf_write_i32( + resp, mpart_leader->leader_epoch); + } + rd_kafka_mock_partition_leader_destroy(mpart, + mpart_leader); + mpart_leader = NULL; + } else { + /* Response: ..Partitions.Leader */ + rd_kafka_buf_write_i32( + resp, mpart->leader ? 
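/* -1 when the partition has no leader */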
mpart->leader->id : -1); + + if (ApiVersion >= 7) { + /* Response: ..Partitions.LeaderEpoch */ + rd_kafka_buf_write_i32(resp, + mpart->leader_epoch); + } } /* Response: ..Partitions.#ReplicaNodes */ - rd_kafka_buf_write_i32(resp, mpart->replica_cnt); + rd_kafka_buf_write_arraycnt(resp, mpart->replica_cnt); for (r = 0; r < mpart->replica_cnt; r++) rd_kafka_buf_write_i32(resp, mpart->replicas[r]->id); /* Response: ..Partitions.#IsrNodes */ /* Let Replicas == ISRs for now */ - rd_kafka_buf_write_i32(resp, mpart->replica_cnt); + rd_kafka_buf_write_arraycnt(resp, mpart->replica_cnt); for (r = 0; r < mpart->replica_cnt; r++) rd_kafka_buf_write_i32(resp, mpart->replicas[r]->id); if (ApiVersion >= 5) { /* Response: ...OfflineReplicas */ - rd_kafka_buf_write_i32(resp, 0); + rd_kafka_buf_write_arraycnt(resp, 0); } + + rd_kafka_buf_write_tags_empty(resp); } + + if (ApiVersion >= 8) { + /* Response: Topics.TopicAuthorizedOperations */ + rd_kafka_buf_write_i32(resp, INT32_MIN); + } + + rd_kafka_buf_write_tags_empty(resp); } @@ -873,6 +1297,8 @@ static int rd_kafka_mock_handle_Metadata(rd_kafka_mock_connection_t *mconn, rd_bool_t list_all_topics = rd_false; int32_t TopicsCnt; int i; + size_t of_Brokers_cnt; + int32_t response_Brokers_cnt = 0; if (rkbuf->rkbuf_reqhdr.ApiVersion >= 3) { /* Response: ThrottleTime */ @@ -880,9 +1306,11 @@ static int rd_kafka_mock_handle_Metadata(rd_kafka_mock_connection_t *mconn, } /* Response: #Brokers */ - rd_kafka_buf_write_i32(resp, mcluster->broker_cnt); + of_Brokers_cnt = rd_kafka_buf_write_arraycnt_pos(resp); TAILQ_FOREACH(mrkb, &mcluster->brokers, link) { + if (!mrkb->up) + continue; /* Response: Brokers.Nodeid */ rd_kafka_buf_write_i32(resp, mrkb->id); /* Response: Brokers.Host */ @@ -893,7 +1321,11 @@ static int rd_kafka_mock_handle_Metadata(rd_kafka_mock_connection_t *mconn, /* Response: Brokers.Rack (Matt's going to love this) */ rd_kafka_buf_write_str(resp, mrkb->rack, -1); } + rd_kafka_buf_write_tags_empty(resp); + response_Brokers_cnt++; } + rd_kafka_buf_finalize_arraycnt(resp, of_Brokers_cnt, + response_Brokers_cnt); if (rkbuf->rkbuf_reqhdr.ApiVersion >= 2) { /* Response: ClusterId */ @@ -906,7 +1338,7 @@ static int rd_kafka_mock_handle_Metadata(rd_kafka_mock_connection_t *mconn, } /* #Topics */ - rd_kafka_buf_read_i32(rkbuf, &TopicsCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &TopicsCnt, RD_KAFKAP_TOPICS_MAX); if (TopicsCnt > 0) requested_topics = rd_kafka_topic_partition_list_new(TopicsCnt); @@ -915,13 +1347,22 @@ static int rd_kafka_mock_handle_Metadata(rd_kafka_mock_connection_t *mconn, for (i = 0; i < TopicsCnt; i++) { rd_kafkap_str_t Topic; - char *topic; + rd_kafka_Uuid_t TopicId = RD_KAFKA_UUID_ZERO; + rd_kafka_topic_partition_t *rktpar; + char *topic = NULL; + if (rkbuf->rkbuf_reqhdr.ApiVersion >= 10) { + /* TopicId */ + rd_kafka_buf_read_uuid(rkbuf, &TopicId); + } rd_kafka_buf_read_str(rkbuf, &Topic); RD_KAFKAP_STR_DUPA(&topic, &Topic); - rd_kafka_topic_partition_list_add(requested_topics, topic, - RD_KAFKA_PARTITION_UA); + rktpar = rd_kafka_topic_partition_list_add( + requested_topics, topic, RD_KAFKA_PARTITION_UA); + if (rkbuf->rkbuf_reqhdr.ApiVersion >= 10) + rd_kafka_topic_partition_set_topic_id(rktpar, TopicId); + rd_kafka_buf_skip_tags(rkbuf); } if (rkbuf->rkbuf_reqhdr.ApiVersion >= 4) @@ -930,8 +1371,9 @@ static int rd_kafka_mock_handle_Metadata(rd_kafka_mock_connection_t *mconn, if (rkbuf->rkbuf_reqhdr.ApiVersion >= 8) { rd_bool_t IncludeClusterAuthorizedOperations; rd_bool_t IncludeTopicAuthorizedOperations; - rd_kafka_buf_read_bool(rkbuf, - 
&IncludeClusterAuthorizedOperations); + if (rkbuf->rkbuf_reqhdr.ApiVersion <= 10) + rd_kafka_buf_read_bool( + rkbuf, &IncludeClusterAuthorizedOperations); rd_kafka_buf_read_bool(rkbuf, &IncludeTopicAuthorizedOperations); } @@ -939,51 +1381,89 @@ static int rd_kafka_mock_handle_Metadata(rd_kafka_mock_connection_t *mconn, if (list_all_topics) { rd_kafka_mock_topic_t *mtopic; /* Response: #Topics */ - rd_kafka_buf_write_i32(resp, mcluster->topic_cnt); + rd_kafka_buf_write_arraycnt(resp, mcluster->topic_cnt); TAILQ_FOREACH(mtopic, &mcluster->topics, link) { rd_kafka_mock_buf_write_Metadata_Topic( - resp, rkbuf->rkbuf_reqhdr.ApiVersion, mtopic->name, - mtopic, mtopic->err); + mcluster, resp, rkbuf->rkbuf_reqhdr.ApiVersion, + mtopic->id, mtopic->name, mtopic, mtopic->err); } } else if (requested_topics) { /* Response: #Topics */ - rd_kafka_buf_write_i32(resp, requested_topics->cnt); + rd_kafka_buf_write_arraycnt(resp, requested_topics->cnt); for (i = 0; i < requested_topics->cnt; i++) { const rd_kafka_topic_partition_t *rktpar = &requested_topics->elems[i]; - rd_kafka_mock_topic_t *mtopic; + rd_kafka_mock_topic_t *mtopic = NULL; rd_kafka_resp_err_t err = RD_KAFKA_RESP_ERR_NO_ERROR; + char *topic_name = rktpar->topic; + rd_kafka_Uuid_t topic_id = + rd_kafka_topic_partition_get_topic_id(rktpar); + rd_bool_t invalid_before_12 = + rkbuf->rkbuf_reqhdr.ApiVersion < 12 && + (!RD_KAFKA_UUID_IS_ZERO(topic_id) || !topic_name); + rd_bool_t invalid_after_12 = + rkbuf->rkbuf_reqhdr.ApiVersion >= 12 && + RD_KAFKA_UUID_IS_ZERO(topic_id) && !topic_name; + if (invalid_before_12 || invalid_after_12) { + err = RD_KAFKA_RESP_ERR_INVALID_REQUEST; + } - mtopic = - rd_kafka_mock_topic_find(mcluster, rktpar->topic); - if (!mtopic && AllowAutoTopicCreation) - mtopic = rd_kafka_mock_topic_auto_create( - mcluster, rktpar->topic, -1, &err); - else if (!mtopic) - err = RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART; + if (!err) { + rd_bool_t use_topic_id = + !RD_KAFKA_UUID_IS_ZERO(topic_id); + if (use_topic_id) { + mtopic = rd_kafka_mock_topic_find_by_id( + mcluster, topic_id); + } else + mtopic = rd_kafka_mock_topic_find( + mcluster, topic_name); + + if (mtopic) { + topic_name = mtopic->name; + topic_id = mtopic->id; + } else if (!use_topic_id) { + topic_name = rktpar->topic; + } else { + topic_name = NULL; + } + + if (!mtopic && topic_name && + AllowAutoTopicCreation) { + mtopic = + rd_kafka_mock_topic_auto_create( + mcluster, topic_name, -1, &err); + topic_id = mtopic->id; + } else if (!mtopic) { + err = + use_topic_id + ? RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_ID + : RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART; + } + } rd_kafka_mock_buf_write_Metadata_Topic( - resp, rkbuf->rkbuf_reqhdr.ApiVersion, rktpar->topic, - mtopic, err ? err : mtopic->err); + mcluster, resp, rkbuf->rkbuf_reqhdr.ApiVersion, + topic_id, topic_name, mtopic, + err ? 
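/* a request-level error takes precedence over the topic's own error */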
err : mtopic->err); } - if (rkbuf->rkbuf_reqhdr.ApiVersion >= 8) { - /* TopicAuthorizedOperations */ - rd_kafka_buf_write_i32(resp, INT32_MIN); - } } else { /* Response: #Topics: brokers only */ - rd_kafka_buf_write_i32(resp, 0); + rd_kafka_buf_write_arraycnt(resp, 0); } - if (rkbuf->rkbuf_reqhdr.ApiVersion >= 8) { + if (rkbuf->rkbuf_reqhdr.ApiVersion >= 8 && + rkbuf->rkbuf_reqhdr.ApiVersion <= 10) { /* ClusterAuthorizedOperations */ rd_kafka_buf_write_i32(resp, INT32_MIN); } + rd_kafka_buf_skip_tags(rkbuf); + rd_kafka_buf_write_tags_empty(resp); + if (requested_topics) rd_kafka_topic_partition_list_destroy(requested_topics); @@ -1090,8 +1570,8 @@ static int rd_kafka_mock_handle_JoinGroup(rd_kafka_mock_connection_t *mconn, int32_t ProtocolCnt = 0; int32_t i; rd_kafka_resp_err_t err; - rd_kafka_mock_cgrp_t *mcgrp; - rd_kafka_mock_cgrp_proto_t *protos = NULL; + rd_kafka_mock_cgrp_classic_t *mcgrp; + rd_kafka_mock_cgrp_classic_proto_t *protos = NULL; rd_kafka_buf_read_str(rkbuf, &GroupId); rd_kafka_buf_read_i32(rkbuf, &SessionTimeoutMs); @@ -1117,7 +1597,7 @@ static int rd_kafka_mock_handle_JoinGroup(rd_kafka_mock_connection_t *mconn, rd_kafkap_str_t ProtocolName; rd_kafkap_bytes_t Metadata; rd_kafka_buf_read_str(rkbuf, &ProtocolName); - rd_kafka_buf_read_bytes(rkbuf, &Metadata); + rd_kafka_buf_read_kbytes(rkbuf, &Metadata); protos[i].name = rd_kafkap_str_copy(&ProtocolName); protos[i].metadata = rd_kafkap_bytes_copy(&Metadata); } @@ -1144,15 +1624,15 @@ static int rd_kafka_mock_handle_JoinGroup(rd_kafka_mock_connection_t *mconn, } if (!err) { - mcgrp = - rd_kafka_mock_cgrp_get(mcluster, &GroupId, &ProtocolType); + mcgrp = rd_kafka_mock_cgrp_classic_get(mcluster, &GroupId, + &ProtocolType); rd_assert(mcgrp); /* This triggers an async rebalance, the response will be * sent later. 
*/ - err = rd_kafka_mock_cgrp_member_add( - mcgrp, mconn, resp, &MemberId, &ProtocolType, protos, - ProtocolCnt, SessionTimeoutMs); + err = rd_kafka_mock_cgrp_classic_member_add( + mcgrp, mconn, resp, &MemberId, &ProtocolType, + &GroupInstanceId, protos, ProtocolCnt, SessionTimeoutMs); if (!err) { /* .._add() assumes ownership of resp and protos */ protos = NULL; @@ -1161,7 +1641,7 @@ static int rd_kafka_mock_handle_JoinGroup(rd_kafka_mock_connection_t *mconn, } } - rd_kafka_mock_cgrp_protos_destroy(protos, ProtocolCnt); + rd_kafka_mock_cgrp_classic_protos_destroy(protos, ProtocolCnt); /* Error case */ rd_kafka_buf_write_i16(resp, err); /* ErrorCode */ @@ -1178,7 +1658,7 @@ static int rd_kafka_mock_handle_JoinGroup(rd_kafka_mock_connection_t *mconn, err_parse: rd_kafka_buf_destroy(resp); if (protos) - rd_kafka_mock_cgrp_protos_destroy(protos, ProtocolCnt); + rd_kafka_mock_cgrp_classic_protos_destroy(protos, ProtocolCnt); return -1; } @@ -1196,8 +1676,8 @@ static int rd_kafka_mock_handle_Heartbeat(rd_kafka_mock_connection_t *mconn, rd_kafkap_str_t GroupInstanceId = RD_KAFKAP_STR_INITIALIZER; int32_t GenerationId; rd_kafka_resp_err_t err; - rd_kafka_mock_cgrp_t *mcgrp; - rd_kafka_mock_cgrp_member_t *member = NULL; + rd_kafka_mock_cgrp_classic_t *mcgrp; + rd_kafka_mock_cgrp_classic_member_t *member = NULL; rd_kafka_buf_read_str(rkbuf, &GroupId); rd_kafka_buf_read_i32(rkbuf, &GenerationId); @@ -1226,23 +1706,24 @@ static int rd_kafka_mock_handle_Heartbeat(rd_kafka_mock_connection_t *mconn, } if (!err) { - mcgrp = rd_kafka_mock_cgrp_find(mcluster, &GroupId); + mcgrp = rd_kafka_mock_cgrp_classic_find(mcluster, &GroupId); if (!mcgrp) err = RD_KAFKA_RESP_ERR_GROUP_ID_NOT_FOUND; } if (!err) { - member = rd_kafka_mock_cgrp_member_find(mcgrp, &MemberId); + member = + rd_kafka_mock_cgrp_classic_member_find(mcgrp, &MemberId); if (!member) err = RD_KAFKA_RESP_ERR_UNKNOWN_MEMBER_ID; } if (!err) - err = rd_kafka_mock_cgrp_check_state(mcgrp, member, rkbuf, - GenerationId); + err = rd_kafka_mock_cgrp_classic_check_state( + mcgrp, member, rkbuf, GenerationId); if (!err) - rd_kafka_mock_cgrp_member_active(mcgrp, member); + rd_kafka_mock_cgrp_classic_member_active(mcgrp, member); rd_kafka_buf_write_i16(resp, err); /* ErrorCode */ @@ -1267,8 +1748,8 @@ static int rd_kafka_mock_handle_LeaveGroup(rd_kafka_mock_connection_t *mconn, rd_kafka_buf_t *resp = rd_kafka_mock_buf_new_response(rkbuf); rd_kafkap_str_t GroupId, MemberId; rd_kafka_resp_err_t err; - rd_kafka_mock_cgrp_t *mcgrp; - rd_kafka_mock_cgrp_member_t *member = NULL; + rd_kafka_mock_cgrp_classic_t *mcgrp; + rd_kafka_mock_cgrp_classic_member_t *member = NULL; rd_kafka_buf_read_str(rkbuf, &GroupId); rd_kafka_buf_read_str(rkbuf, &MemberId); @@ -1295,22 +1776,24 @@ static int rd_kafka_mock_handle_LeaveGroup(rd_kafka_mock_connection_t *mconn, } if (!err) { - mcgrp = rd_kafka_mock_cgrp_find(mcluster, &GroupId); + mcgrp = rd_kafka_mock_cgrp_classic_find(mcluster, &GroupId); if (!mcgrp) err = RD_KAFKA_RESP_ERR_GROUP_ID_NOT_FOUND; } if (!err) { - member = rd_kafka_mock_cgrp_member_find(mcgrp, &MemberId); + member = + rd_kafka_mock_cgrp_classic_member_find(mcgrp, &MemberId); if (!member) err = RD_KAFKA_RESP_ERR_UNKNOWN_MEMBER_ID; } if (!err) - err = rd_kafka_mock_cgrp_check_state(mcgrp, member, rkbuf, -1); + err = rd_kafka_mock_cgrp_classic_check_state(mcgrp, member, + rkbuf, -1); if (!err) - rd_kafka_mock_cgrp_member_leave(mcgrp, member); + rd_kafka_mock_cgrp_classic_member_leave(mcgrp, member); rd_kafka_buf_write_i16(resp, err); /* ErrorCode */ @@ -1339,8 
+1822,8 @@ static int rd_kafka_mock_handle_SyncGroup(rd_kafka_mock_connection_t *mconn, int32_t GenerationId, AssignmentCnt; int32_t i; rd_kafka_resp_err_t err; - rd_kafka_mock_cgrp_t *mcgrp = NULL; - rd_kafka_mock_cgrp_member_t *member = NULL; + rd_kafka_mock_cgrp_classic_t *mcgrp = NULL; + rd_kafka_mock_cgrp_classic_member_t *member = NULL; rd_kafka_buf_read_str(rkbuf, &GroupId); rd_kafka_buf_read_i32(rkbuf, &GenerationId); @@ -1370,23 +1853,24 @@ static int rd_kafka_mock_handle_SyncGroup(rd_kafka_mock_connection_t *mconn, } if (!err) { - mcgrp = rd_kafka_mock_cgrp_find(mcluster, &GroupId); + mcgrp = rd_kafka_mock_cgrp_classic_find(mcluster, &GroupId); if (!mcgrp) err = RD_KAFKA_RESP_ERR_GROUP_ID_NOT_FOUND; } if (!err) { - member = rd_kafka_mock_cgrp_member_find(mcgrp, &MemberId); + member = + rd_kafka_mock_cgrp_classic_member_find(mcgrp, &MemberId); if (!member) err = RD_KAFKA_RESP_ERR_UNKNOWN_MEMBER_ID; } if (!err) - err = rd_kafka_mock_cgrp_check_state(mcgrp, member, rkbuf, - GenerationId); + err = rd_kafka_mock_cgrp_classic_check_state( + mcgrp, member, rkbuf, GenerationId); if (!err) - rd_kafka_mock_cgrp_member_active(mcgrp, member); + rd_kafka_mock_cgrp_classic_member_active(mcgrp, member); if (!err) { rd_bool_t is_leader = mcgrp->leader && mcgrp->leader == member; @@ -1402,26 +1886,27 @@ static int rd_kafka_mock_handle_SyncGroup(rd_kafka_mock_connection_t *mconn, for (i = 0; i < AssignmentCnt; i++) { rd_kafkap_str_t MemberId2; rd_kafkap_bytes_t Metadata; - rd_kafka_mock_cgrp_member_t *member2; + rd_kafka_mock_cgrp_classic_member_t *member2; rd_kafka_buf_read_str(rkbuf, &MemberId2); - rd_kafka_buf_read_bytes(rkbuf, &Metadata); + rd_kafka_buf_read_kbytes(rkbuf, &Metadata); if (err) continue; /* Find member */ - member2 = rd_kafka_mock_cgrp_member_find(mcgrp, &MemberId2); + member2 = + rd_kafka_mock_cgrp_classic_member_find(mcgrp, &MemberId2); if (!member2) continue; - rd_kafka_mock_cgrp_member_assignment_set(mcgrp, member2, - &Metadata); + rd_kafka_mock_cgrp_classic_member_assignment_set(mcgrp, member2, + &Metadata); } if (!err) { - err = rd_kafka_mock_cgrp_member_sync_set(mcgrp, member, mconn, - resp); + err = rd_kafka_mock_cgrp_classic_member_sync_set(mcgrp, member, + mconn, resp); /* .._sync_set() assumes ownership of resp */ if (!err) return 0; /* Response will be sent when all members @@ -1808,11 +2293,24 @@ rd_kafka_mock_handle_TxnOffsetCommit(rd_kafka_mock_connection_t *mconn, rd_kafka_buf_read_i64(rkbuf, &pid.id); /* Epoch */ rd_kafka_buf_read_i16(rkbuf, &pid.epoch); + + if (rkbuf->rkbuf_reqhdr.ApiVersion >= 3) { + int32_t GenerationId; + rd_kafkap_str_t kMemberId, kGroupInstanceId; + + /* GenerationId */ + rd_kafka_buf_read_i32(rkbuf, &GenerationId); + /* MemberId */ + rd_kafka_buf_read_str(rkbuf, &kMemberId); + /* GroupInstanceId */ + rd_kafka_buf_read_str(rkbuf, &kGroupInstanceId); + } + /* #Topics */ - rd_kafka_buf_read_i32(rkbuf, &TopicsCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &TopicsCnt, 100000); /* Response: #Results */ - rd_kafka_buf_write_i32(resp, TopicsCnt); + rd_kafka_buf_write_arraycnt(resp, TopicsCnt); /* Inject error */ err = rd_kafka_mock_next_request_error(mconn, resp); @@ -1828,36 +2326,48 @@ rd_kafka_mock_handle_TxnOffsetCommit(rd_kafka_mock_connection_t *mconn, while (TopicsCnt-- > 0) { rd_kafkap_str_t Topic; int32_t PartsCnt; + rd_kafka_mock_topic_t *mtopic; /* Topic */ rd_kafka_buf_read_str(rkbuf, &Topic); /* Response: Topic */ rd_kafka_buf_write_kstr(resp, &Topic); - /* #Partitions */ - rd_kafka_buf_read_i32(rkbuf, &PartsCnt); - /* Response: 
#Partitions */ - rd_kafka_buf_write_i32(resp, PartsCnt); + mtopic = rd_kafka_mock_topic_find_by_kstr(mcluster, &Topic); - /* Ignore if the topic or partition exists or not. */ + /* #Partitions */ + rd_kafka_buf_read_arraycnt(rkbuf, &PartsCnt, 100000); + + /* Response: #Partitions */ + rd_kafka_buf_write_arraycnt(resp, PartsCnt); while (PartsCnt-- > 0) { int32_t Partition; int64_t Offset; rd_kafkap_str_t Metadata; + rd_kafka_mock_partition_t *mpart; /* Partition */ rd_kafka_buf_read_i32(rkbuf, &Partition); /* Response: Partition */ rd_kafka_buf_write_i32(resp, Partition); + mpart = rd_kafka_mock_partition_find(mtopic, Partition); + if (!err && !mpart) + err = RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART; + /* CommittedOffset */ rd_kafka_buf_read_i64(rkbuf, &Offset); if (rkbuf->rkbuf_reqhdr.ApiVersion >= 2) { /* CommittedLeaderEpoch */ - int32_t Epoch; - rd_kafka_buf_read_i32(rkbuf, &Epoch); + int32_t CommittedLeaderEpoch; + rd_kafka_buf_read_i32(rkbuf, + &CommittedLeaderEpoch); + if (!err && mpart) + err = + rd_kafka_mock_partition_leader_epoch_check( + mpart, CommittedLeaderEpoch); } /* CommittedMetadata */ @@ -1865,7 +2375,19 @@ rd_kafka_mock_handle_TxnOffsetCommit(rd_kafka_mock_connection_t *mconn, /* Response: ErrorCode */ rd_kafka_buf_write_i16(resp, err); + + /* Request: Struct tags */ + rd_kafka_buf_skip_tags(rkbuf); + + /* Response: Struct tags */ + rd_kafka_buf_write_tags_empty(resp); } + + /* Request: Struct tags */ + rd_kafka_buf_skip_tags(rkbuf); + + /* Response: Struct tags */ + rd_kafka_buf_write_tags_empty(resp); } rd_kafka_mock_connection_send_response(mconn, resp); @@ -1930,6 +2452,544 @@ err_parse: return -1; } +static int +rd_kafka_mock_handle_OffsetForLeaderEpoch(rd_kafka_mock_connection_t *mconn, + rd_kafka_buf_t *rkbuf) { + const rd_bool_t log_decode_errors = rd_true; + rd_kafka_mock_cluster_t *mcluster = mconn->broker->cluster; + rd_kafka_buf_t *resp = rd_kafka_mock_buf_new_response(rkbuf); + rd_kafka_resp_err_t err; + int32_t TopicsCnt, i; + + /* Response: ThrottleTimeMs */ + rd_kafka_buf_write_i32(resp, 0); + + /* #Topics */ + rd_kafka_buf_read_arraycnt(rkbuf, &TopicsCnt, RD_KAFKAP_TOPICS_MAX); + + /* Response: #Topics */ + rd_kafka_buf_write_arraycnt(resp, TopicsCnt); + + /* Inject error */ + err = rd_kafka_mock_next_request_error(mconn, resp); + + for (i = 0; i < TopicsCnt; i++) { + rd_kafkap_str_t Topic; + int32_t PartitionsCnt, j; + rd_kafka_mock_topic_t *mtopic; + + /* Topic */ + rd_kafka_buf_read_str(rkbuf, &Topic); + + mtopic = rd_kafka_mock_topic_find_by_kstr(mcluster, &Topic); + + /* Response: Topic */ + rd_kafka_buf_write_kstr(resp, &Topic); + + /* #Partitions */ + rd_kafka_buf_read_arraycnt(rkbuf, &PartitionsCnt, + RD_KAFKAP_PARTITIONS_MAX); + + /* Response: #Partitions */ + rd_kafka_buf_write_arraycnt(resp, PartitionsCnt); + + for (j = 0; j < PartitionsCnt; j++) { + rd_kafka_mock_partition_t *mpart; + int32_t Partition, CurrentLeaderEpoch, LeaderEpoch; + int64_t EndOffset = -1; + + /* Partition */ + rd_kafka_buf_read_i32(rkbuf, &Partition); + /* CurrentLeaderEpoch */ + rd_kafka_buf_read_i32(rkbuf, &CurrentLeaderEpoch); + /* LeaderEpoch */ + rd_kafka_buf_read_i32(rkbuf, &LeaderEpoch); + + mpart = rd_kafka_mock_partition_find(mtopic, Partition); + if (!err && !mpart) + err = RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART; + + if (!err && mpart) + err = + rd_kafka_mock_partition_leader_epoch_check( + mpart, CurrentLeaderEpoch); + + if (!err && mpart) { + EndOffset = + rd_kafka_mock_partition_offset_for_leader_epoch( + mpart, LeaderEpoch); + } + + /* Response: ErrorCode 
*/ + rd_kafka_buf_write_i16(resp, err); + /* Response: Partition */ + rd_kafka_buf_write_i32(resp, Partition); + /* Response: LeaderEpoch */ + rd_kafka_buf_write_i32(resp, LeaderEpoch); + /* Response: EndOffset */ + rd_kafka_buf_write_i64(resp, EndOffset); + } + } + + rd_kafka_mock_connection_send_response(mconn, resp); + + return 0; + +err_parse: + rd_kafka_buf_destroy(resp); + return -1; +} + +/** + * @brief Handle GetTelemetrySubscriptions + */ +static int rd_kafka_mock_handle_GetTelemetrySubscriptions( + rd_kafka_mock_connection_t *mconn, + rd_kafka_buf_t *rkbuf) { + const rd_bool_t log_decode_errors = rd_true; + rd_kafka_mock_cluster_t *mcluster = mconn->broker->cluster; + rd_kafka_buf_t *resp = rd_kafka_mock_buf_new_response(rkbuf); + rd_kafka_resp_err_t err; + size_t i; + rd_kafka_Uuid_t ClientInstanceId; + rd_kafka_Uuid_t zero_uuid = RD_KAFKA_UUID_ZERO; + + /* Request: ClientInstanceId */ + rd_kafka_buf_read_uuid(rkbuf, &ClientInstanceId); + if (ClientInstanceId.least_significant_bits == + zero_uuid.least_significant_bits && + ClientInstanceId.most_significant_bits == + zero_uuid.most_significant_bits) { + /* Some random numbers */ + ClientInstanceId.least_significant_bits = 129; + ClientInstanceId.most_significant_bits = 298; + } + + /* Response: ThrottleTimeMs */ + rd_kafka_buf_write_i32(resp, 0); + + /* Inject error */ + err = rd_kafka_mock_next_request_error(mconn, resp); + + /* Response: ErrorCode */ + rd_kafka_buf_write_i16(resp, err); + + /* Response: ClientInstanceId */ + rd_kafka_buf_write_uuid(resp, &ClientInstanceId); + + /* Response: SubscriptionId */ + // TODO: Calculate subscription ID. + rd_kafka_buf_write_i32(resp, 0); + + /* Response: #AcceptedCompressionTypes */ + rd_kafka_buf_write_arraycnt(resp, 4); + + /* Response: AcceptedCompressionTypes */ + rd_kafka_buf_write_i8(resp, RD_KAFKA_COMPRESSION_ZSTD); + rd_kafka_buf_write_i8(resp, RD_KAFKA_COMPRESSION_KLZ4); + rd_kafka_buf_write_i8(resp, RD_KAFKA_COMPRESSION_GZIP); + rd_kafka_buf_write_i8(resp, RD_KAFKA_COMPRESSION_SNAPPY); + + /* Response: PushIntervalMs */ + /* We use the value in telemetry_push_interval_ms, and if not set, the + * default of 5 minutes. */ + rd_kafka_buf_write_i32(resp, mcluster->telemetry_push_interval_ms > 0 + ? 
mcluster->telemetry_push_interval_ms + : (5 * 60 * 1000)); + + /* Response: TelemetryMaxBytes */ + rd_kafka_buf_write_i32(resp, 10000); + + /* Response: DeltaTemporality */ + rd_kafka_buf_write_bool(resp, rd_true); + + /* Response: #RequestedMetrics */ + rd_kafka_buf_write_arraycnt(resp, mcluster->metrics_cnt); + + for (i = 0; i < mcluster->metrics_cnt; i++) + rd_kafka_buf_write_str(resp, mcluster->metrics[i], -1); + + rd_kafka_mock_connection_send_response(mconn, resp); + + return 0; + +err_parse: + rd_kafka_buf_destroy(resp); + return -1; +} + +/** + * @brief Handle PushTelemetry + */ + +static void rd_kafka_mock_handle_PushTelemetry_decoded_NumberDataPoint( + void *opaque, + const opentelemetry_proto_metrics_v1_NumberDataPoint *decoded) { + rd_kafka_broker_t *rkb = opaque; + if (decoded->which_value == + opentelemetry_proto_metrics_v1_NumberDataPoint_as_int_tag) + rd_rkb_log(rkb, LOG_INFO, "MOCKTELEMETRY", + "NumberDataPoint int value: %" PRId64 + " time: %" PRIu64, + decoded->value.as_int, decoded->time_unix_nano); + else if (decoded->which_value == + opentelemetry_proto_metrics_v1_NumberDataPoint_as_double_tag) + rd_rkb_log(rkb, LOG_INFO, "MOCKTELEMETRY", + "NumberDataPoint double value: %f time: %" PRIu64, + decoded->value.as_double, decoded->time_unix_nano); +} + +static void +rd_kafka_mock_handle_PushTelemetry_decoded_int64(void *opaque, + int64_t int64_value) { + rd_kafka_broker_t *rkb = opaque; + rd_rkb_log(rkb, LOG_INFO, "MOCKTELEMETRY", "int64 value: %" PRId64, + int64_value); +} + +static void +rd_kafka_mock_handle_PushTelemetry_decoded_string(void *opaque, + const uint8_t *decoded) { + rd_kafka_broker_t *rkb = opaque; + rd_rkb_log(rkb, LOG_INFO, "MOCKTELEMETRY", "string value: %s", decoded); +} + +static void rd_kafka_mock_handle_PushTelemetry_decoded_type( + void *opaque, + rd_kafka_telemetry_metric_type_t type) { + rd_kafka_broker_t *rkb = opaque; + rd_rkb_log(rkb, LOG_INFO, "MOCKTELEMETRY", "Metric type: %d", type); +} + +static void rd_kafka_mock_handle_PushTelemetry_decode_error(void *opaque, + const char *error, + ...) 
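/* Note on the decode path used below (an illustrative sketch, not part of
 * the upstream change): rd_kafka_telemetry_decode_metrics() drives decoding
 * through a caller-supplied callback table,
 * rd_kafka_telemetry_decode_interface_t, rather than returning a parsed
 * tree. A hypothetical caller fills in only the callbacks it needs plus an
 * opaque pointer, e.g.:
 *
 *   rd_kafka_telemetry_decode_interface_t di = {
 *       .decoded_int64 = my_int64_cb,  // hypothetical callback
 *       .decode_error  = my_error_cb,  // hypothetical callback
 *       .opaque        = my_state,     // hypothetical opaque pointer
 *   };
 *   rd_kafka_telemetry_decode_metrics(&di, payload, size);
 *
 * That unset callbacks may be left NULL is an assumption here and is not
 * verified against the decoder. */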
{ + rd_kafka_broker_t *rkb = opaque; + va_list ap; + va_start(ap, error); + rd_rkb_log(rkb, LOG_ERR, "MOCKTELEMETRY", error, ap); + va_end(ap); + rd_assert(!*"Failure while decoding telemetry data"); +} + +void rd_kafka_mock_handle_PushTelemetry_payload(rd_kafka_broker_t *rkb, + void *payload, + size_t size) { + rd_kafka_telemetry_decode_interface_t decode_interface = { + .decoded_string = rd_kafka_mock_handle_PushTelemetry_decoded_string, + .decoded_NumberDataPoint = + rd_kafka_mock_handle_PushTelemetry_decoded_NumberDataPoint, + .decoded_int64 = rd_kafka_mock_handle_PushTelemetry_decoded_int64, + .decoded_type = rd_kafka_mock_handle_PushTelemetry_decoded_type, + .decode_error = rd_kafka_mock_handle_PushTelemetry_decode_error, + .opaque = rkb, + }; + rd_kafka_telemetry_decode_metrics(&decode_interface, payload, size); +} + +static int rd_kafka_mock_handle_PushTelemetry(rd_kafka_mock_connection_t *mconn, + rd_kafka_buf_t *rkbuf) { + rd_kafka_broker_t *rkb = mconn->broker->cluster->dummy_rkb; + const rd_bool_t log_decode_errors = rd_true; + rd_kafka_mock_cluster_t *mcluster = mconn->broker->cluster; + rd_kafka_buf_t *resp = rd_kafka_mock_buf_new_response(rkbuf); + rd_kafka_Uuid_t ClientInstanceId; + int32_t SubscriptionId; + rd_bool_t terminating; + rd_kafka_compression_t compression_type = RD_KAFKA_COMPRESSION_NONE; + rd_kafkap_bytes_t metrics; + rd_kafka_resp_err_t err; + + rd_kafka_buf_read_uuid(rkbuf, &ClientInstanceId); + rd_kafka_buf_read_i32(rkbuf, &SubscriptionId); + rd_kafka_buf_read_bool(rkbuf, &terminating); + rd_kafka_buf_read_i8(rkbuf, &compression_type); + rd_kafka_buf_read_kbytes(rkbuf, &metrics); + + void *uncompressed_payload = NULL; + size_t uncompressed_payload_len = 0; + rd_assert(metrics.data != NULL); + + if (compression_type != RD_KAFKA_COMPRESSION_NONE) { + rd_rkb_log(rkb, LOG_DEBUG, "MOCKTELEMETRY", + "Compression type %s", + rd_kafka_compression2str(compression_type)); + int err_uncompress = + rd_kafka_telemetry_uncompress_metrics_payload( + rkb, compression_type, (void *)metrics.data, + metrics.len, &uncompressed_payload, + &uncompressed_payload_len); + if (err_uncompress) { + rd_kafka_dbg(mcluster->rk, MOCK, "MOCKTELEMETRY", + "Failed to uncompress " + "telemetry payload."); + goto err_parse; + } + } else { + uncompressed_payload = (void *)metrics.data; + uncompressed_payload_len = metrics.len; + } + + rd_assert(uncompressed_payload != NULL); + rd_kafka_mock_handle_PushTelemetry_payload(rkb, uncompressed_payload, + uncompressed_payload_len); + if (compression_type != RD_KAFKA_COMPRESSION_NONE) + rd_free(uncompressed_payload); + + /* ThrottleTime */ + rd_kafka_buf_write_i32(resp, 0); + + /* ErrorCode */ + err = rd_kafka_mock_next_request_error(mconn, resp); + rd_kafka_buf_write_i16(resp, err); + + rd_kafka_mock_connection_send_response(mconn, resp); + + return 0; + +err_parse: + rd_kafka_buf_destroy(resp); + return -1; +} +static void rd_kafka_mock_handle_ConsumerGroupHeartbeat_write_TopicPartitions( + rd_kafka_buf_t *rkbuf, + rd_kafka_topic_partition_list_t *rktparlist) { + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; + rd_kafka_topic_partition_list_sort_by_topic_id(rktparlist); + rd_kafka_buf_write_topic_partitions( + rkbuf, rktparlist, rd_false /*don't skip invalid offsets*/, + rd_false /*any offset*/, rd_true /* use_topic id */, + rd_false /* don't use topic name */, fields); +} + +static int +rd_kafka_mock_handle_ConsumerGroupHeartbeat(rd_kafka_mock_connection_t *mconn, 
+ rd_kafka_buf_t *rkbuf) { + const rd_bool_t log_decode_errors = rd_true; + rd_bool_t sent_assignment_parse_err = rd_false; + rd_kafka_mock_cluster_t *mcluster = mconn->broker->cluster; + rd_kafka_buf_t *resp = rd_kafka_mock_buf_new_response(rkbuf); + rd_kafka_topic_partition_list_t *sent_assignment = NULL, + *existing_assignment = NULL, + *next_assignment = NULL; + rd_kafka_topic_partition_t *rktpar; + rd_kafkap_str_t GroupId, MemberId, InstanceId, RackId, ServerAssignor, + SubscribedTopicRegex; + rd_kafkap_str_t *SubscribedTopicNames = NULL; + int32_t MemberEpoch, RebalanceTimeoutMs, SubscribedTopicNamesCnt; + int32_t i; + rd_kafka_resp_err_t err; + rd_kafka_mock_cgrp_consumer_t *mcgrp = NULL; + rd_kafka_mock_broker_t *mrkb = NULL; + rd_kafka_mock_cgrp_consumer_member_t *member = NULL; + + /* GroupId */ + rd_kafka_buf_read_str(rkbuf, &GroupId); + rd_assert(!RD_KAFKAP_STR_IS_NULL(&GroupId)); + + /* MemberId */ + rd_kafka_buf_read_str(rkbuf, &MemberId); + rd_assert(!RD_KAFKAP_STR_IS_NULL(&MemberId)); + + /* MemberEpoch */ + rd_kafka_buf_read_i32(rkbuf, &MemberEpoch); + rd_assert(MemberEpoch >= -2); + + /* InstanceId */ + rd_kafka_buf_read_str(rkbuf, &InstanceId); + + /* RackId */ + rd_kafka_buf_read_str(rkbuf, &RackId); + + /* RebalanceTimeoutMs */ + rd_kafka_buf_read_i32(rkbuf, &RebalanceTimeoutMs); + rd_assert(RebalanceTimeoutMs >= -1); + + /* #SubscribedTopicNames */ + rd_kafka_buf_read_arraycnt(rkbuf, &SubscribedTopicNamesCnt, + RD_KAFKAP_TOPICS_MAX); + if (SubscribedTopicNamesCnt >= 0) { + SubscribedTopicNames = rd_calloc( + SubscribedTopicNamesCnt > 0 ? SubscribedTopicNamesCnt : 1, + sizeof(rd_kafkap_str_t)); + for (i = 0; i < SubscribedTopicNamesCnt; i++) { + /* SubscribedTopicNames[i] */ + rd_kafka_buf_read_str(rkbuf, &SubscribedTopicNames[i]); + } + } + + rd_kafka_buf_read_str(rkbuf, &SubscribedTopicRegex); + + /* ServerAssignor */ + rd_kafka_buf_read_str(rkbuf, &ServerAssignor); + + /* #TopicPartitions */ + const rd_kafka_topic_partition_field_t sent_assignment_fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; + sent_assignment = rd_kafka_buf_read_topic_partitions_nullable( + rkbuf, rd_true, rd_false, 0, sent_assignment_fields, + &sent_assignment_parse_err); + if (sent_assignment_parse_err) + goto err_parse; + + if (sent_assignment) { + rd_kafka_Uuid_t last_topic_id = RD_KAFKA_UUID_ZERO; + rd_kafka_mock_topic_t *mtopic = NULL; + existing_assignment = + rd_kafka_topic_partition_list_new(sent_assignment->cnt); + RD_KAFKA_TPLIST_FOREACH(rktpar, sent_assignment) { + rd_kafka_Uuid_t current_topic_id = + rd_kafka_topic_partition_get_topic_id(rktpar); + + if (rd_kafka_Uuid_cmp(current_topic_id, + last_topic_id) != 0) { + last_topic_id = current_topic_id; + mtopic = rd_kafka_mock_topic_find_by_id( + mcluster, current_topic_id); + } + + if (mtopic) { + rd_kafka_topic_partition_t *added = + rd_kafka_topic_partition_list_add( + existing_assignment, "", + rktpar->partition); + rd_kafka_topic_partition_set_topic_id( + added, last_topic_id); + } + } + } + + /* Inject error, if any */ + err = rd_kafka_mock_next_request_error(mconn, resp); + + if (!err) { + mrkb = rd_kafka_mock_cluster_get_coord( + mcluster, RD_KAFKA_COORD_GROUP, &GroupId); + + if (!mrkb) + err = RD_KAFKA_RESP_ERR_COORDINATOR_NOT_AVAILABLE; + else if (mrkb != mconn->broker) + err = RD_KAFKA_RESP_ERR_NOT_COORDINATOR; + } + + if (!err) { + mtx_lock(&mcluster->lock); + mcgrp = rd_kafka_mock_cgrp_consumer_get(mcluster, &GroupId); + rd_assert(mcgrp); + + member = 
rd_kafka_mock_cgrp_consumer_member_add( + mcgrp, mconn, &MemberId, &InstanceId, SubscribedTopicNames, + SubscribedTopicNamesCnt, &SubscribedTopicRegex); + + if (member) { + if (MemberEpoch >= 0) { + next_assignment = + rd_kafka_mock_cgrp_consumer_member_next_assignment( + member, existing_assignment, + &MemberEpoch); + if (MemberEpoch < 0) { + err = + RD_KAFKA_RESP_ERR_FENCED_MEMBER_EPOCH; + } + } else { + rd_kafka_mock_cgrp_consumer_member_leave( + mcgrp, member, MemberEpoch == -2); + member = NULL; + } + } else { + err = RD_KAFKA_RESP_ERR_UNKNOWN_MEMBER_ID; + } + mtx_unlock(&mcluster->lock); + } else { + switch (err) { + case RD_KAFKA_RESP_ERR_UNKNOWN_MEMBER_ID: + case RD_KAFKA_RESP_ERR_FENCED_MEMBER_EPOCH: + /* In case the error was set + * by `rd_kafka_mock_next_request_error`. */ + MemberEpoch = -1; + mtx_lock(&mcluster->lock); + mcgrp = rd_kafka_mock_cgrp_consumer_find(mcluster, + &GroupId); + if (!mcgrp) { + mtx_unlock(&mcluster->lock); + break; + } + + rd_kafka_mock_cgrp_consumer_member_t *member = + rd_kafka_mock_cgrp_consumer_member_find(mcgrp, + &MemberId); + if (member) { + rd_kafka_mock_cgrp_consumer_member_fenced( + mcgrp, member); + member = NULL; + } + mtx_unlock(&mcluster->lock); + default: + break; + } + } + + /* + * Construct response + */ + /* Response: Throttle */ + rd_kafka_buf_write_i32(resp, 0); + + /* Response: ErrorCode */ + rd_kafka_buf_write_i16(resp, err); + + /* Response: ErrorMessage */ + rd_kafka_buf_write_str(resp, rd_kafka_err2str(err), -1); + + /* Response: MemberId */ + if (!err && member) + rd_kafka_buf_write_str(resp, member->id, -1); + else + rd_kafka_buf_write_str(resp, NULL, -1); + + /* Response: MemberEpoch */ + rd_kafka_buf_write_i32(resp, MemberEpoch); + + /* Response: HeartbeatIntervalMs */ + if (mcgrp) { + rd_kafka_buf_write_i32(resp, mcgrp->heartbeat_interval_ms); + } else { + rd_kafka_buf_write_i32(resp, 0); + } + + if (next_assignment) { + /* Response: Assignment */ + rd_kafka_buf_write_i8(resp, 1); + + /* Response: TopicPartitions */ + rd_kafka_mock_handle_ConsumerGroupHeartbeat_write_TopicPartitions( + resp, next_assignment); + + rd_kafka_buf_write_tags_empty(resp); + } else { + /* Response: Assignment */ + rd_kafka_buf_write_i8(resp, -1); + } + + rd_kafka_mock_connection_send_response(mconn, resp); + + rd_free(SubscribedTopicNames); + RD_IF_FREE(sent_assignment, rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(existing_assignment, rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(next_assignment, rd_kafka_topic_partition_list_destroy); + return 0; + +err_parse: + RD_IF_FREE(SubscribedTopicNames, rd_free); + RD_IF_FREE(sent_assignment, rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(existing_assignment, rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(next_assignment, rd_kafka_topic_partition_list_destroy); + rd_kafka_buf_destroy(resp); + return -1; +} /** * @brief Default request handlers @@ -1937,28 +2997,36 @@ err_parse: const struct rd_kafka_mock_api_handler rd_kafka_mock_api_handlers[RD_KAFKAP__NUM] = { /* [request-type] = { MinVersion, MaxVersion, FlexVersion, callback } */ - [RD_KAFKAP_Produce] = {0, 7, -1, rd_kafka_mock_handle_Produce}, - [RD_KAFKAP_Fetch] = {0, 11, -1, rd_kafka_mock_handle_Fetch}, - [RD_KAFKAP_ListOffsets] = {0, 5, -1, rd_kafka_mock_handle_ListOffsets}, - [RD_KAFKAP_OffsetFetch] = {0, 5, 6, rd_kafka_mock_handle_OffsetFetch}, - [RD_KAFKAP_OffsetCommit] = {0, 7, 8, rd_kafka_mock_handle_OffsetCommit}, + [RD_KAFKAP_Produce] = {0, 10, 9, rd_kafka_mock_handle_Produce}, + [RD_KAFKAP_Fetch] = {0, 16, 12, 
rd_kafka_mock_handle_Fetch}, + [RD_KAFKAP_ListOffsets] = {0, 7, 6, rd_kafka_mock_handle_ListOffsets}, + [RD_KAFKAP_OffsetFetch] = {0, 6, 6, rd_kafka_mock_handle_OffsetFetch}, + [RD_KAFKAP_OffsetCommit] = {0, 9, 8, rd_kafka_mock_handle_OffsetCommit}, [RD_KAFKAP_ApiVersion] = {0, 2, 3, rd_kafka_mock_handle_ApiVersion}, - [RD_KAFKAP_Metadata] = {0, 2, 9, rd_kafka_mock_handle_Metadata}, - [RD_KAFKAP_FindCoordinator] = {0, 2, 3, + [RD_KAFKAP_Metadata] = {0, 12, 9, rd_kafka_mock_handle_Metadata}, + [RD_KAFKAP_FindCoordinator] = {0, 3, 3, rd_kafka_mock_handle_FindCoordinator}, [RD_KAFKAP_InitProducerId] = {0, 4, 2, - rd_kafka_mock_handle_InitProducerId}, - [RD_KAFKAP_JoinGroup] = {0, 5, 6, rd_kafka_mock_handle_JoinGroup}, - [RD_KAFKAP_Heartbeat] = {0, 3, 4, rd_kafka_mock_handle_Heartbeat}, - [RD_KAFKAP_LeaveGroup] = {0, 1, 4, rd_kafka_mock_handle_LeaveGroup}, - [RD_KAFKAP_SyncGroup] = {0, 3, 4, rd_kafka_mock_handle_SyncGroup}, + rd_kafka_mock_handle_InitProducerId}, + [RD_KAFKAP_JoinGroup] = {0, 6, 6, rd_kafka_mock_handle_JoinGroup}, + [RD_KAFKAP_Heartbeat] = {0, 5, 4, rd_kafka_mock_handle_Heartbeat}, + [RD_KAFKAP_LeaveGroup] = {0, 4, 4, rd_kafka_mock_handle_LeaveGroup}, + [RD_KAFKAP_SyncGroup] = {0, 4, 4, rd_kafka_mock_handle_SyncGroup}, [RD_KAFKAP_AddPartitionsToTxn] = {0, 1, -1, rd_kafka_mock_handle_AddPartitionsToTxn}, [RD_KAFKAP_AddOffsetsToTxn] = {0, 1, -1, rd_kafka_mock_handle_AddOffsetsToTxn}, - [RD_KAFKAP_TxnOffsetCommit] = {0, 2, 3, + [RD_KAFKAP_TxnOffsetCommit] = {0, 3, 3, rd_kafka_mock_handle_TxnOffsetCommit}, [RD_KAFKAP_EndTxn] = {0, 1, -1, rd_kafka_mock_handle_EndTxn}, + [RD_KAFKAP_OffsetForLeaderEpoch] = + {2, 2, -1, rd_kafka_mock_handle_OffsetForLeaderEpoch}, + [RD_KAFKAP_ConsumerGroupHeartbeat] = + {1, 1, 1, rd_kafka_mock_handle_ConsumerGroupHeartbeat}, + [RD_KAFKAP_GetTelemetrySubscriptions] = + {0, 0, 0, rd_kafka_mock_handle_GetTelemetrySubscriptions}, + [RD_KAFKAP_PushTelemetry] = {0, 0, 0, + rd_kafka_mock_handle_PushTelemetry}, }; diff --git a/src/third_party/librdkafka/dist/src/rdkafka_mock_int.h b/src/third_party/librdkafka/dist/src/rdkafka_mock_int.h index 84ccacf02dc..2ef7a2a339c 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_mock_int.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_mock_int.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -29,6 +30,8 @@ #ifndef _RDKAFKA_MOCK_INT_H_ #define _RDKAFKA_MOCK_INT_H_ +#include "rdkafka_request.h" + /** * @name Mock cluster - internal data types * @@ -64,33 +67,33 @@ typedef TAILQ_HEAD(rd_kafka_mock_error_stack_head_s, /** * @struct Consumer group protocol name and metadata. 
*/ -typedef struct rd_kafka_mock_cgrp_proto_s { +typedef struct rd_kafka_mock_cgrp_classic_proto_s { rd_kafkap_str_t *name; rd_kafkap_bytes_t *metadata; -} rd_kafka_mock_cgrp_proto_t; +} rd_kafka_mock_cgrp_classic_proto_t; /** * @struct Consumer group member */ -typedef struct rd_kafka_mock_cgrp_member_s { - TAILQ_ENTRY(rd_kafka_mock_cgrp_member_s) link; +typedef struct rd_kafka_mock_cgrp_classic_member_s { + TAILQ_ENTRY(rd_kafka_mock_cgrp_classic_member_s) link; char *id; /**< MemberId */ char *group_instance_id; /**< Group instance id */ rd_ts_t ts_last_activity; /**< Last activity, e.g., Heartbeat */ - rd_kafka_mock_cgrp_proto_t *protos; /**< Protocol names */ - int proto_cnt; /**< Number of protocols */ - rd_kafkap_bytes_t *assignment; /**< Current assignment */ + rd_kafka_mock_cgrp_classic_proto_t *protos; /**< Protocol names */ + int proto_cnt; /**< Number of protocols */ + rd_kafkap_bytes_t *assignment; /**< Current assignment */ rd_kafka_buf_t *resp; /**< Current response buffer */ struct rd_kafka_mock_connection_s *conn; /**< Connection, may be NULL * if there is no ongoing * request. */ -} rd_kafka_mock_cgrp_member_t; +} rd_kafka_mock_cgrp_classic_member_t; /** - * @struct Consumer group. + * @struct Classic consumer group. */ -typedef struct rd_kafka_mock_cgrp_s { - TAILQ_ENTRY(rd_kafka_mock_cgrp_s) link; +typedef struct rd_kafka_mock_cgrp_classic_s { + TAILQ_ENTRY(rd_kafka_mock_cgrp_classic_s) link; struct rd_kafka_mock_cluster_s *cluster; /**< Cluster */ struct rd_kafka_mock_connection_s *conn; /**< Connection */ char *id; /**< Group Id */ @@ -98,20 +101,83 @@ typedef struct rd_kafka_mock_cgrp_s { char *protocol_name; /**< Elected protocol name */ int32_t generation_id; /**< Generation Id */ int session_timeout_ms; /**< Session timeout */ - enum { RD_KAFKA_MOCK_CGRP_STATE_EMPTY, /* No members */ - RD_KAFKA_MOCK_CGRP_STATE_JOINING, /* Members are joining */ - RD_KAFKA_MOCK_CGRP_STATE_SYNCING, /* Syncing assignments */ - RD_KAFKA_MOCK_CGRP_STATE_REBALANCING, /* Rebalance triggered */ - RD_KAFKA_MOCK_CGRP_STATE_UP, /* Group is operational */ + enum { + RD_KAFKA_MOCK_CGRP_STATE_EMPTY, /* No members */ + RD_KAFKA_MOCK_CGRP_STATE_JOINING, /* Members are joining */ + RD_KAFKA_MOCK_CGRP_STATE_SYNCING, /* Syncing assignments */ + RD_KAFKA_MOCK_CGRP_STATE_REBALANCING, /* Rebalance triggered */ + RD_KAFKA_MOCK_CGRP_STATE_UP, /* Group is operational */ } state; /**< Consumer group state */ rd_kafka_timer_t session_tmr; /**< Session timeout timer */ rd_kafka_timer_t rebalance_tmr; /**< Rebalance state timer */ - TAILQ_HEAD(, rd_kafka_mock_cgrp_member_s) members; /**< Group members */ + TAILQ_HEAD(, rd_kafka_mock_cgrp_classic_member_s) + members; /**< Group members */ int member_cnt; /**< Number of group members */ int last_member_cnt; /**< Number of group members at last election */ int assignment_cnt; /**< Number of member assignments in last Sync */ - rd_kafka_mock_cgrp_member_t *leader; /**< Elected leader */ -} rd_kafka_mock_cgrp_t; + rd_kafka_mock_cgrp_classic_member_t *leader; /**< Elected leader */ +} rd_kafka_mock_cgrp_classic_t; + + +/** + * @struct "Consumer" Consumer group (KIP-848). 
+ */ +typedef struct rd_kafka_mock_cgrp_consumer_s { + TAILQ_ENTRY(rd_kafka_mock_cgrp_consumer_s) link; + struct rd_kafka_mock_cluster_s *cluster; /**< Cluster */ + char *id; /**< Group Id */ + int32_t group_epoch; /**< Group epoch */ + int session_timeout_ms; /**< Session timeout */ + rd_kafka_timer_t session_tmr; /**< Session timeout timer */ + int heartbeat_interval_ms; /**< Heartbeat interval */ + TAILQ_HEAD(, rd_kafka_mock_cgrp_consumer_member_s) + members; /**< Group members */ + int member_cnt; /**< Number of group members */ + rd_bool_t manual_assignment; /**< Use manual assignment */ +} rd_kafka_mock_cgrp_consumer_t; + + +/** + * @struct "Consumer" Consumer group member (KIP-848). + */ +typedef struct rd_kafka_mock_cgrp_consumer_member_s { + TAILQ_ENTRY(rd_kafka_mock_cgrp_consumer_member_s) link; + char *id; /**< MemberId */ + char *instance_id; /**< Group instance id */ + rd_ts_t ts_last_activity; /**< Last activity, e.g., + * ConsumerGroupHeartbeat */ + int32_t current_member_epoch; /**< Current member epoch, + * updated only on heartbeat. */ + int32_t + target_member_epoch; /**< Target member epoch, + * updated only when calling + * rd_kafka_mock_cgrp_consumer_target_assignment. + */ + rd_kafka_topic_partition_list_t + *current_assignment; /**< Current assignment, + * only updated when reported by the client. + */ + rd_kafka_topic_partition_list_t * + target_assignment; /**< Target assignment, + * only updated when calling + * rd_kafka_mock_cgrp_consumer_target_assignment. + */ + rd_kafka_topic_partition_list_t + *returned_assignment; /**< Returned assignment */ + + rd_list_t *subscribed_topics; /**< Final list of Subscribed topics after + considering regex as well*/ + rd_list_t *subscribed_topic_names; /**< Subscribed topic names received + in the heartbeat */ + char *subscribed_topic_regex; /**< Subscribed regex */ + + rd_bool_t left_static_membership; /**< Member left the group + * with static membership. */ + struct rd_kafka_mock_connection_s *conn; /**< Connection, may be NULL + * if there is no ongoing + * request. */ + rd_kafka_mock_cgrp_consumer_t *mcgrp; /**< Consumer group */ +} rd_kafka_mock_cgrp_consumer_member_t; /** @@ -217,6 +283,7 @@ typedef struct rd_kafka_mock_msgset_s { TAILQ_ENTRY(rd_kafka_mock_msgset_s) link; int64_t first_offset; /**< First offset in batch */ int64_t last_offset; /**< Last offset in batch */ + int32_t leader_epoch; /**< Msgset leader epoch */ rd_kafkap_bytes_t bytes; /* Space for bytes.data is allocated after the msgset_t */ } rd_kafka_mock_msgset_t; @@ -233,6 +300,18 @@ typedef struct rd_kafka_mock_committed_offset_s { rd_kafkap_str_t *metadata; /**< Metadata, allocated separately */ } rd_kafka_mock_committed_offset_t; +/** + * @struct Leader id and epoch to return in a Metadata call. + */ +typedef struct rd_kafka_mock_partition_leader_s { + /**< Link to prev/next entries */ + TAILQ_ENTRY(rd_kafka_mock_partition_leader_s) link; + int32_t leader_id; /**< Leader id */ + int32_t leader_epoch; /**< Leader epoch */ +} rd_kafka_mock_partition_leader_t; + + +TAILQ_HEAD(rd_kafka_mock_msgset_tailq_s, rd_kafka_mock_msgset_s); /** * @struct Mock partition @@ -241,6 +320,8 @@ typedef struct rd_kafka_mock_partition_s { TAILQ_ENTRY(rd_kafka_mock_partition_s) leader_link; int32_t id; + int32_t leader_epoch; /**< Leader epoch, bumped on each + * partition leader change. 
*/ int64_t start_offset; /**< Actual/leader start offset */ int64_t end_offset; /**< Actual/leader end offset */ int64_t follower_start_offset; /**< Follower's start offset */ @@ -252,7 +333,7 @@ typedef struct rd_kafka_mock_partition_s { * in synch with end_offset */ - TAILQ_HEAD(, rd_kafka_mock_msgset_s) msgsets; + struct rd_kafka_mock_msgset_tailq_s msgsets; size_t size; /**< Total size of all .msgsets */ size_t cnt; /**< Total count of .msgsets */ size_t max_size; /**< Maximum size of all .msgsets, may be overshot. */ @@ -270,6 +351,10 @@ typedef struct rd_kafka_mock_partition_s { int32_t follower_id; /**< Preferred replica/follower */ struct rd_kafka_mock_topic_s *topic; + + /**< Leader responses */ + TAILQ_HEAD(, rd_kafka_mock_partition_leader_s) + leader_responses; } rd_kafka_mock_partition_t; @@ -279,6 +364,7 @@ typedef struct rd_kafka_mock_partition_s { typedef struct rd_kafka_mock_topic_s { TAILQ_ENTRY(rd_kafka_mock_topic_s) link; char *name; + rd_kafka_Uuid_t id; rd_kafka_mock_partition_t *partitions; int partition_cnt; @@ -339,7 +425,9 @@ struct rd_kafka_mock_cluster_s { TAILQ_HEAD(, rd_kafka_mock_topic_s) topics; int topic_cnt; - TAILQ_HEAD(, rd_kafka_mock_cgrp_s) cgrps; + TAILQ_HEAD(, rd_kafka_mock_cgrp_classic_s) cgrps_classic; + + TAILQ_HEAD(, rd_kafka_mock_cgrp_consumer_s) cgrps_consumer; /** Explicit coordinators (set with mock_set_coordinator()) */ TAILQ_HEAD(, rd_kafka_mock_coord_s) coords; @@ -373,13 +461,19 @@ struct rd_kafka_mock_cluster_s { struct { int partition_cnt; /**< Auto topic create part cnt */ int replication_factor; /**< Auto topic create repl factor */ + /** Group initial rebalance delay */ + int32_t group_initial_rebalance_delay_ms; + /** Session timeout (KIP 848) */ + int group_consumer_session_timeout_ms; + /** Heartbeat interval (KIP 848) */ + int group_consumer_heartbeat_interval_ms; } defaults; /**< Dynamic array of IO handlers for corresponding fd in .fds */ struct { rd_kafka_mock_io_handler_t *cb; /**< Callback */ void *opaque; /**< Callbacks' opaque */ - } * handlers; + } *handlers; /**< Per-protocol request error stack. */ rd_kafka_mock_error_stack_head_t errstacks; @@ -387,9 +481,28 @@ struct rd_kafka_mock_cluster_s { /**< Request handlers */ struct rd_kafka_mock_api_handler api_handlers[RD_KAFKAP__NUM]; + /** Requested metrics. */ + char **metrics; + + /** Requested metric count. */ + size_t metrics_cnt; + + /** Telemetry push interval ms. Default is 5 min */ + int64_t telemetry_push_interval_ms; + + /**< Appends the requests received to mock cluster if set to true, + * defaulted to false for less memory usage. */ + rd_bool_t track_requests; + /**< List of API requests for this broker. 
Type: + * rd_kafka_mock_request_t* + */ + rd_list_t request_list; + /**< Mutex for: * .errstacks * .apiversions + * .track_requests + * .request_list */ mtx_t lock; @@ -399,8 +512,13 @@ struct rd_kafka_mock_cluster_s { rd_kafka_buf_t *rd_kafka_mock_buf_new_response(const rd_kafka_buf_t *request); -void rd_kafka_mock_connection_send_response(rd_kafka_mock_connection_t *mconn, - rd_kafka_buf_t *resp); + +#define rd_kafka_mock_connection_send_response(mconn, resp) \ + rd_kafka_mock_connection_send_response0(mconn, resp, rd_false) + +void rd_kafka_mock_connection_send_response0(rd_kafka_mock_connection_t *mconn, + rd_kafka_buf_t *resp, + rd_bool_t tags_written); void rd_kafka_mock_connection_set_blocking(rd_kafka_mock_connection_t *mconn, rd_bool_t blocking); @@ -418,6 +536,11 @@ rd_kafka_mock_topic_find(const rd_kafka_mock_cluster_t *mcluster, rd_kafka_mock_topic_t * rd_kafka_mock_topic_find_by_kstr(const rd_kafka_mock_cluster_t *mcluster, const rd_kafkap_str_t *kname); + +rd_kafka_mock_topic_t * +rd_kafka_mock_topic_find_by_id(const rd_kafka_mock_cluster_t *mcluster, + rd_kafka_Uuid_t id); + rd_kafka_mock_broker_t * rd_kafka_mock_cluster_get_coord(rd_kafka_mock_cluster_t *mcluster, rd_kafka_coordtype_t KeyType, @@ -447,6 +570,21 @@ rd_kafka_mock_partition_log_append(rd_kafka_mock_partition_t *mpart, const rd_kafkap_str_t *TransactionalId, int64_t *BaseOffset); +rd_kafka_resp_err_t rd_kafka_mock_partition_leader_epoch_check( + const rd_kafka_mock_partition_t *mpart, + int32_t leader_epoch); + +int64_t rd_kafka_mock_partition_offset_for_leader_epoch( + const rd_kafka_mock_partition_t *mpart, + int32_t leader_epoch); + +rd_kafka_mock_partition_leader_t * +rd_kafka_mock_partition_next_leader_response(rd_kafka_mock_partition_t *mpart); + +void rd_kafka_mock_partition_leader_destroy( + rd_kafka_mock_partition_t *mpart, + rd_kafka_mock_partition_leader_t *mpart_leader); + /** * @returns true if the ApiVersion is supported, else false. 
@@ -471,50 +609,97 @@ rd_kafka_mock_pid_find(rd_kafka_mock_cluster_t *mcluster, * @name Mock consumer group (rdkafka_mock_cgrp.c) * @{ */ -void rd_kafka_mock_cgrp_member_active(rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_mock_cgrp_member_t *member); -void rd_kafka_mock_cgrp_member_assignment_set( - rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_mock_cgrp_member_t *member, +void rd_kafka_mock_cgrp_classic_member_active( + rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_mock_cgrp_classic_member_t *member); +void rd_kafka_mock_cgrp_classic_member_assignment_set( + rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_mock_cgrp_classic_member_t *member, const rd_kafkap_bytes_t *Metadata); -rd_kafka_resp_err_t -rd_kafka_mock_cgrp_member_sync_set(rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_mock_cgrp_member_t *member, - rd_kafka_mock_connection_t *mconn, - rd_kafka_buf_t *resp); -rd_kafka_resp_err_t -rd_kafka_mock_cgrp_member_leave(rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_mock_cgrp_member_t *member); -void rd_kafka_mock_cgrp_protos_destroy(rd_kafka_mock_cgrp_proto_t *protos, - int proto_cnt); -rd_kafka_resp_err_t -rd_kafka_mock_cgrp_member_add(rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_mock_connection_t *mconn, - rd_kafka_buf_t *resp, - const rd_kafkap_str_t *MemberId, - const rd_kafkap_str_t *ProtocolType, - rd_kafka_mock_cgrp_proto_t *protos, - int proto_cnt, - int session_timeout_ms); -rd_kafka_resp_err_t -rd_kafka_mock_cgrp_check_state(rd_kafka_mock_cgrp_t *mcgrp, - rd_kafka_mock_cgrp_member_t *member, - const rd_kafka_buf_t *request, - int32_t generation_id); -rd_kafka_mock_cgrp_member_t * -rd_kafka_mock_cgrp_member_find(const rd_kafka_mock_cgrp_t *mcgrp, - const rd_kafkap_str_t *MemberId); -void rd_kafka_mock_cgrp_destroy(rd_kafka_mock_cgrp_t *mcgrp); -rd_kafka_mock_cgrp_t *rd_kafka_mock_cgrp_find(rd_kafka_mock_cluster_t *mcluster, - const rd_kafkap_str_t *GroupId); -rd_kafka_mock_cgrp_t * -rd_kafka_mock_cgrp_get(rd_kafka_mock_cluster_t *mcluster, - const rd_kafkap_str_t *GroupId, - const rd_kafkap_str_t *ProtocolType); +rd_kafka_resp_err_t rd_kafka_mock_cgrp_classic_member_sync_set( + rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_mock_cgrp_classic_member_t *member, + rd_kafka_mock_connection_t *mconn, + rd_kafka_buf_t *resp); +rd_kafka_resp_err_t rd_kafka_mock_cgrp_classic_member_leave( + rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_mock_cgrp_classic_member_t *member); +void rd_kafka_mock_cgrp_classic_protos_destroy( + rd_kafka_mock_cgrp_classic_proto_t *protos, + int proto_cnt); +rd_kafka_resp_err_t rd_kafka_mock_cgrp_classic_member_add( + rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_mock_connection_t *mconn, + rd_kafka_buf_t *resp, + const rd_kafkap_str_t *MemberId, + const rd_kafkap_str_t *ProtocolType, + const rd_kafkap_str_t *GroupInstanceId, + rd_kafka_mock_cgrp_classic_proto_t *protos, + int proto_cnt, + int session_timeout_ms); +rd_kafka_resp_err_t rd_kafka_mock_cgrp_classic_check_state( + rd_kafka_mock_cgrp_classic_t *mcgrp, + rd_kafka_mock_cgrp_classic_member_t *member, + const rd_kafka_buf_t *request, + int32_t generation_id); +rd_kafka_mock_cgrp_classic_member_t *rd_kafka_mock_cgrp_classic_member_find( + const rd_kafka_mock_cgrp_classic_t *mcgrp, + const rd_kafkap_str_t *MemberId); +void rd_kafka_mock_cgrp_classic_destroy(rd_kafka_mock_cgrp_classic_t *mcgrp); +rd_kafka_mock_cgrp_classic_t * +rd_kafka_mock_cgrp_classic_find(rd_kafka_mock_cluster_t *mcluster, + const rd_kafkap_str_t *GroupId); +rd_kafka_mock_cgrp_classic_t * +rd_kafka_mock_cgrp_classic_get(rd_kafka_mock_cluster_t *mcluster, + 
const rd_kafkap_str_t *GroupId, + const rd_kafkap_str_t *ProtocolType); + +/* "consumer" consumer group (KIP-848) */ + +rd_kafka_topic_partition_list_t * +rd_kafka_mock_cgrp_consumer_member_next_assignment( + rd_kafka_mock_cgrp_consumer_member_t *member, + rd_kafka_topic_partition_list_t *current_assignment, + int *member_epoch); + +void rd_kafka_mock_cgrp_consumer_member_active( + rd_kafka_mock_cgrp_consumer_t *mcgrp, + rd_kafka_mock_cgrp_consumer_member_t *member); + +void rd_kafka_mock_cgrp_consumer_destroy(rd_kafka_mock_cgrp_consumer_t *mcgrp); + +rd_kafka_mock_cgrp_consumer_t * +rd_kafka_mock_cgrp_consumer_find(const rd_kafka_mock_cluster_t *mcluster, + const rd_kafkap_str_t *GroupId); + +rd_kafka_mock_cgrp_consumer_t * +rd_kafka_mock_cgrp_consumer_get(rd_kafka_mock_cluster_t *mcluster, + const rd_kafkap_str_t *GroupId); + +void rd_kafka_mock_cgrp_consumer_member_leave( + rd_kafka_mock_cgrp_consumer_t *mcgrp, + rd_kafka_mock_cgrp_consumer_member_t *member, + rd_bool_t static_leave); + +void rd_kafka_mock_cgrp_consumer_member_fenced( + rd_kafka_mock_cgrp_consumer_t *mcgrp, + rd_kafka_mock_cgrp_consumer_member_t *member); + +rd_kafka_mock_cgrp_consumer_member_t *rd_kafka_mock_cgrp_consumer_member_find( + const rd_kafka_mock_cgrp_consumer_t *mcgrp, + const rd_kafkap_str_t *MemberId); + +rd_kafka_mock_cgrp_consumer_member_t *rd_kafka_mock_cgrp_consumer_member_add( + rd_kafka_mock_cgrp_consumer_t *mcgrp, + struct rd_kafka_mock_connection_s *conn, + const rd_kafkap_str_t *MemberId, + const rd_kafkap_str_t *InstanceId, + rd_kafkap_str_t *SubscribedTopicNames, + int32_t SubscribedTopicNamesCnt, + const rd_kafkap_str_t *SubscribedTopicRegex); + void rd_kafka_mock_cgrps_connection_closed(rd_kafka_mock_cluster_t *mcluster, rd_kafka_mock_connection_t *mconn); - - /** *@} */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_msg.c b/src/third_party/librdkafka/dist/src/rdkafka_msg.c index ee0e1773795..7c8204dcc79 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_msg.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_msg.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012,2013 Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill, + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -57,6 +58,15 @@ const char *rd_kafka_message_errstr(const rd_kafka_message_t *rkmessage) { return rd_kafka_err2str(rkmessage->err); } +const char * +rd_kafka_message_produce_errstr(const rd_kafka_message_t *rkmessage) { + if (!rkmessage->err) + return NULL; + rd_kafka_msg_t *rkm = (rd_kafka_msg_t *)rkmessage; + return rkm->rkm_u.producer.errstr; +} + + /** * @brief Check if producing is allowed. @@ -373,6 +383,7 @@ rd_kafka_produceva(rd_kafka_t *rk, const rd_kafka_vu_t *vus, size_t cnt) { rd_kafka_error_t *error = NULL; rd_kafka_headers_t *hdrs = NULL; rd_kafka_headers_t *app_hdrs = NULL; /* App-provided headers list */ + int existing = 0; size_t i; if (unlikely(rd_kafka_check_produce(rk, &error))) @@ -382,8 +393,11 @@ rd_kafka_produceva(rd_kafka_t *rk, const rd_kafka_vu_t *vus, size_t cnt) { const rd_kafka_vu_t *vu = &vus[i]; switch (vu->vtype) { case RD_KAFKA_VTYPE_TOPIC: - rkt = - rd_kafka_topic_new0(rk, vu->u.cstr, NULL, NULL, 1); + rkt = rd_kafka_topic_new0(rk, vu->u.cstr, NULL, + &existing, 1); + if (!existing) + rd_kafka_topic_fast_leader_query( + rk, rd_true /* force */); break; case RD_KAFKA_VTYPE_RKT: @@ -539,6 +553,7 @@ rd_kafka_resp_err_t rd_kafka_producev(rd_kafka_t *rk, ...) 
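/* Context for the producev() hunk below (a minimal usage sketch; `rk` is
 * assumed to be an already-created producer handle, and the topic name and
 * payload are invented for illustration): the RD_KAFKA_VTYPE_TOPIC case now
 * forces a fast leader query the first time an unknown topic is referenced,
 * so the first produce to a new topic does not wait for the periodic
 * metadata refresh:
 *
 *   rd_kafka_resp_err_t err = rd_kafka_producev(
 *       rk,
 *       RD_KAFKA_V_TOPIC("orders"),
 *       RD_KAFKA_V_VALUE("payload", 7),
 *       RD_KAFKA_V_END);
 *
 * RD_KAFKA_V_TOPIC(), RD_KAFKA_V_VALUE() and RD_KAFKA_V_END are the public
 * varargs markers from rdkafka.h. */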
{ rd_kafka_resp_err_t err; rd_kafka_headers_t *hdrs = NULL; rd_kafka_headers_t *app_hdrs = NULL; /* App-provided headers list */ + int existing = 0; if (unlikely((err = rd_kafka_check_produce(rk, NULL)))) return err; @@ -549,7 +564,10 @@ rd_kafka_resp_err_t rd_kafka_producev(rd_kafka_t *rk, ...) { switch (vtype) { case RD_KAFKA_VTYPE_TOPIC: rkt = rd_kafka_topic_new0(rk, va_arg(ap, const char *), - NULL, NULL, 1); + NULL, &existing, 1); + if (!existing) + rd_kafka_topic_fast_leader_query( + rk, rd_true /* force */); break; case RD_KAFKA_VTYPE_RKT: @@ -1560,6 +1578,19 @@ rd_kafka_message_status(const rd_kafka_message_t *rkmessage) { } +int32_t rd_kafka_message_leader_epoch(const rd_kafka_message_t *rkmessage) { + rd_kafka_msg_t *rkm; + if (unlikely(!rkmessage->rkt || rd_kafka_rkt_is_lw(rkmessage->rkt) || + !rkmessage->rkt->rkt_rk || + rkmessage->rkt->rkt_rk->rk_type != RD_KAFKA_CONSUMER)) + return -1; + + rkm = rd_kafka_message2msg((rd_kafka_message_t *)rkmessage); + + return rkm->rkm_u.consumer.leader_epoch; +} + + void rd_kafka_msgq_dump(FILE *fp, const char *what, rd_kafka_msgq_t *rkmq) { rd_kafka_msg_t *rkm; int cnt = 0; @@ -1889,7 +1920,45 @@ void rd_kafka_msgq_verify_order0(const char *function, rd_assert(!errcnt); } +rd_kafka_Produce_result_t *rd_kafka_Produce_result_new(int64_t offset, + int64_t timestamp) { + rd_kafka_Produce_result_t *ret = rd_calloc(1, sizeof(*ret)); + ret->offset = offset; + ret->timestamp = timestamp; + return ret; +} +void rd_kafka_Produce_result_destroy(rd_kafka_Produce_result_t *result) { + if (result->record_errors) { + int32_t i; + for (i = 0; i < result->record_errors_cnt; i++) { + RD_IF_FREE(result->record_errors[i].errstr, rd_free); + } + rd_free(result->record_errors); + } + RD_IF_FREE(result->errstr, rd_free); + rd_free(result); +} + +rd_kafka_Produce_result_t * +rd_kafka_Produce_result_copy(const rd_kafka_Produce_result_t *result) { + rd_kafka_Produce_result_t *ret = rd_calloc(1, sizeof(*ret)); + *ret = *result; + if (result->errstr) + ret->errstr = rd_strdup(result->errstr); + if (result->record_errors) { + ret->record_errors = rd_calloc(result->record_errors_cnt, + sizeof(*result->record_errors)); + int32_t i; + for (i = 0; i < result->record_errors_cnt; i++) { + ret->record_errors[i] = result->record_errors[i]; + if (result->record_errors[i].errstr) + ret->record_errors[i].errstr = + rd_strdup(result->record_errors[i].errstr); + } + } + return ret; +} /** * @name Unit tests @@ -2019,9 +2088,11 @@ static int unittest_msgq_order(const char *what, } /* Retry the messages, which moves them back to sendq - * maintaining the original order */ + * maintaining the original order with exponential backoff + * set to false */ rd_kafka_retry_msgq(&rkmq, &sendq, 1, 1, 0, - RD_KAFKA_MSG_STATUS_NOT_PERSISTED, cmp); + RD_KAFKA_MSG_STATUS_NOT_PERSISTED, cmp, rd_false, 0, + 0); RD_UT_ASSERT(rd_kafka_msgq_len(&sendq) == 0, "sendq FIFO should be empty, not contain %d messages", @@ -2059,9 +2130,11 @@ static int unittest_msgq_order(const char *what, } /* Retry the messages, which should now keep the 3 first messages - * on sendq (no more retries) and just number 4 moved back. */ + * on sendq (no more retries) and just number 4 moved back. + * No exponential backoff applied. 
*/ rd_kafka_retry_msgq(&rkmq, &sendq, 1, 1, 0, - RD_KAFKA_MSG_STATUS_NOT_PERSISTED, cmp); + RD_KAFKA_MSG_STATUS_NOT_PERSISTED, cmp, rd_false, 0, + 0); if (fifo) { if (ut_verify_msgq_order("readded #2", &rkmq, 4, 6, rd_true)) @@ -2080,9 +2153,10 @@ static int unittest_msgq_order(const char *what, return 1; } - /* Move all messages back on rkmq */ + /* Move all messages back on rkmq without any exponential backoff. */ rd_kafka_retry_msgq(&rkmq, &sendq, 0, 1000, 0, - RD_KAFKA_MSG_STATUS_NOT_PERSISTED, cmp); + RD_KAFKA_MSG_STATUS_NOT_PERSISTED, cmp, rd_false, 0, + 0); /* Move first half of messages to sendq (1,2,3). @@ -2102,11 +2176,14 @@ static int unittest_msgq_order(const char *what, rkm = ut_rd_kafka_msg_new(msgsize); rkm->rkm_u.producer.msgid = i; rd_kafka_msgq_enq_sorted0(&rkmq, rkm, cmp); - + /* No exponential backoff applied. */ rd_kafka_retry_msgq(&rkmq, &sendq, 0, 1000, 0, - RD_KAFKA_MSG_STATUS_NOT_PERSISTED, cmp); + RD_KAFKA_MSG_STATUS_NOT_PERSISTED, cmp, rd_false, 0, + 0); + /* No exponential backoff applied. */ rd_kafka_retry_msgq(&rkmq, &sendq2, 0, 1000, 0, - RD_KAFKA_MSG_STATUS_NOT_PERSISTED, cmp); + RD_KAFKA_MSG_STATUS_NOT_PERSISTED, cmp, rd_false, 0, + 0); RD_UT_ASSERT(rd_kafka_msgq_len(&sendq) == 0, "sendq FIFO should be empty, not contain %d messages", @@ -2459,22 +2536,28 @@ int unittest_msg(void) { {10, 10}, {33692865, 33692865}, {0, 0}}); - fails += unittest_msgq_insert_sort( - "many messages", insert_baseline, NULL, - (const struct ut_msg_range[]) {{100000, 200000}, - {400000, 450000}, - {900000, 920000}, - {33692864, 33751992}, - {33906868, 33993690}, - {40000000, 44000000}, - {0, 0}}, - (const struct ut_msg_range[]) {{1, 199}, - {350000, 360000}, - {500000, 500010}, - {1000000, 1000200}, - {33751993, 33906867}, - {50000001, 50000001}, - {0, 0}}); + if (rd_unittest_with_valgrind) { + RD_UT_WARN( + "Skipping large message range test " + "when using Valgrind"); + } else { + fails += unittest_msgq_insert_sort( + "many messages", insert_baseline, NULL, + (const struct ut_msg_range[]) {{100000, 200000}, + {400000, 450000}, + {900000, 920000}, + {33692864, 33751992}, + {33906868, 33993690}, + {40000000, 44000000}, + {0, 0}}, + (const struct ut_msg_range[]) {{1, 199}, + {350000, 360000}, + {500000, 500010}, + {1000000, 1000200}, + {33751993, 33906867}, + {50000001, 50000001}, + {0, 0}}); + } fails += unittest_msgq_insert_sort( "issue #2508", insert_baseline, NULL, (const struct ut_msg_range[]) { diff --git a/src/third_party/librdkafka/dist/src/rdkafka_msg.h b/src/third_party/librdkafka/dist/src/rdkafka_msg.h index 85e8aa8fe87..58fd1ae83d6 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_msg.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_msg.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012,2013 Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -65,6 +65,26 @@ #define RD_KAFKA_MSGSET_V2_ATTR_TRANSACTIONAL (1 << 4) #define RD_KAFKA_MSGSET_V2_ATTR_CONTROL (1 << 5) +/** + * @struct Error data for a batch index that caused the batch to be dropped. 
+ */ +typedef struct rd_kafka_Produce_result_record_error { + int64_t batch_index; /**< Batch index */ + char *errstr; /**< Error message for batch_index */ +} rd_kafka_Produce_result_record_error_t; + +/** + * @struct Result and return values from ProduceResponse + */ +typedef struct rd_kafka_Produce_result { + int64_t offset; /**< Assigned offset of first message */ + int64_t timestamp; /**< (Possibly assigned) timestamp of first message */ + char *errstr; /**< Common error message */ + rd_kafka_Produce_result_record_error_t + *record_errors; /**< Errors for records that caused the batch to be + dropped */ + int32_t record_errors_cnt; /**< record_errors count */ +} rd_kafka_Produce_result_t; typedef struct rd_kafka_msg_s { rd_kafka_message_t rkm_rkmessage; /* MUST be first field */ @@ -122,6 +142,7 @@ typedef struct rd_kafka_msg_s { * identically reconstructed. */ int retries; /* Number of retries so far */ + const char *errstr; /* Error string for this message */ } producer; #define rkm_ts_timeout rkm_u.producer.ts_timeout #define rkm_ts_enq rkm_u.producer.ts_enq @@ -131,6 +152,8 @@ typedef struct rd_kafka_msg_s { rd_kafkap_bytes_t binhdrs; /**< Unparsed * binary headers in * protocol msg */ + int32_t leader_epoch; /**< Leader epoch at the time + * the message was fetched. */ } consumer; } rkm_u; } rd_kafka_msg_t; @@ -512,9 +535,9 @@ rd_kafka_msgq_overlap(const rd_kafka_msgq_t *a, const rd_kafka_msgq_t *b) { la = rd_kafka_msgq_last(a); lb = rd_kafka_msgq_last(b); - return (rd_bool_t)( - fa->rkm_u.producer.msgid <= lb->rkm_u.producer.msgid && - fb->rkm_u.producer.msgid <= la->rkm_u.producer.msgid); + return ( + rd_bool_t)(fa->rkm_u.producer.msgid <= lb->rkm_u.producer.msgid && + fb->rkm_u.producer.msgid <= la->rkm_u.producer.msgid); } /** @@ -574,6 +597,16 @@ static RD_INLINE RD_UNUSED int32_t rd_kafka_seq_wrap(int64_t seq) { void rd_kafka_msgq_dump(FILE *fp, const char *what, rd_kafka_msgq_t *rkmq); +rd_kafka_Produce_result_t *rd_kafka_Produce_result_new(int64_t offset, + int64_t timestamp); + +void rd_kafka_Produce_result_destroy(rd_kafka_Produce_result_t *result); + +rd_kafka_Produce_result_t * +rd_kafka_Produce_result_copy(const rd_kafka_Produce_result_t *result); + +/* Unit tests */ + rd_kafka_msg_t *ut_rd_kafka_msg_new(size_t msgsize); void ut_rd_kafka_msgq_purge(rd_kafka_msgq_t *rkmq); int unittest_msg(void); diff --git a/src/third_party/librdkafka/dist/src/rdkafka_msgbatch.h b/src/third_party/librdkafka/dist/src/rdkafka_msgbatch.h index 09c7977067b..b65a0f9c0a3 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_msgbatch.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_msgbatch.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_msgset.h b/src/third_party/librdkafka/dist/src/rdkafka_msgset.h index b79f1c946c9..ee897b35bd5 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_msgset.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_msgset.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -77,6 +78,21 @@ rd_kafka_msgset_parse(rd_kafka_buf_t *rkbuf, rd_kafka_aborted_txns_t *aborted_txns, const struct rd_kafka_toppar_ver *tver); +#if WITH_ZLIB +rd_kafka_resp_err_t rd_kafka_gzip_compress(rd_kafka_broker_t *rkb, + int comp_level, + rd_slice_t *slice, + void **outbuf, + size_t *outlenp); +#endif + +#if WITH_SNAPPY +rd_kafka_resp_err_t rd_kafka_snappy_compress_slice(rd_kafka_broker_t *rkb, + rd_slice_t *slice, + void **outbuf, + size_t *outlenp); +#endif + int unittest_aborted_txns(void); #endif /* _RDKAFKA_MSGSET_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_msgset_reader.c b/src/third_party/librdkafka/dist/src/rdkafka_msgset_reader.c index 1cff18b9364..258506181ed 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_msgset_reader.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_msgset_reader.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -168,6 +169,9 @@ typedef struct rd_kafka_msgset_reader_s { const struct rd_kafka_toppar_ver *msetr_tver; /**< Toppar op version of * request. */ + int32_t msetr_leader_epoch; /**< Current MessageSet's partition + * leader epoch (or -1). */ + int32_t msetr_broker_id; /**< Broker id (of msetr_rkb) */ rd_kafka_broker_t *msetr_rkb; /* @warning Not a refcounted * reference! */ @@ -230,6 +234,7 @@ static void rd_kafka_msgset_reader_init(rd_kafka_msgset_reader_t *msetr, memset(msetr, 0, sizeof(*msetr)); msetr->msetr_rkb = rkbuf->rkbuf_rkb; + msetr->msetr_leader_epoch = -1; msetr->msetr_broker_id = rd_kafka_broker_id(msetr->msetr_rkb); msetr->msetr_rktp = rktp; msetr->msetr_aborted_txns = aborted_txns; @@ -627,10 +632,10 @@ rd_kafka_msgset_reader_msg_v0_1(rd_kafka_msgset_reader_t *msetr) { /* Extract key */ - rd_kafka_buf_read_bytes(rkbuf, &Key); + rd_kafka_buf_read_kbytes(rkbuf, &Key); /* Extract Value */ - rd_kafka_buf_read_bytes(rkbuf, &Value); + rd_kafka_buf_read_kbytes(rkbuf, &Value); Value_len = RD_KAFKAP_BYTES_LEN(&Value); /* MessageSets may contain offsets earlier than we @@ -647,7 +652,8 @@ rd_kafka_msgset_reader_msg_v0_1(rd_kafka_msgset_reader_t *msetr) { * the messageset, and it also means * we cant perform this offset check here * in that case. */ - if (!relative_offsets && hdr.Offset < rktp->rktp_offsets.fetch_offset) + if (!relative_offsets && + hdr.Offset < rktp->rktp_offsets.fetch_pos.offset) return RD_KAFKA_RESP_ERR_NO_ERROR; /* Continue with next msg */ /* Handle compressed MessageSet */ @@ -663,7 +669,8 @@ rd_kafka_msgset_reader_msg_v0_1(rd_kafka_msgset_reader_t *msetr) { /* Create op/message container for message. */ rko = rd_kafka_op_new_fetch_msg( - &rkm, rktp, msetr->msetr_tver->version, rkbuf, hdr.Offset, + &rkm, rktp, msetr->msetr_tver->version, rkbuf, + RD_KAFKA_FETCH_POS(hdr.Offset, msetr->msetr_leader_epoch), (size_t)RD_KAFKAP_BYTES_LEN(&Key), RD_KAFKAP_BYTES_IS_NULL(&Key) ? NULL : Key.data, (size_t)RD_KAFKAP_BYTES_LEN(&Value), @@ -727,6 +734,7 @@ rd_kafka_msgset_reader_msg_v2(rd_kafka_msgset_reader_t *msetr) { ? 
LOG_DEBUG : 0; size_t message_end; + rd_kafka_fetch_pos_t msetr_pos; rd_kafka_buf_read_varint(rkbuf, &hdr.Length); message_end = @@ -736,15 +744,23 @@ rd_kafka_msgset_reader_msg_v2(rd_kafka_msgset_reader_t *msetr) { rd_kafka_buf_read_varint(rkbuf, &hdr.TimestampDelta); rd_kafka_buf_read_varint(rkbuf, &hdr.OffsetDelta); hdr.Offset = msetr->msetr_v2_hdr->BaseOffset + hdr.OffsetDelta; + msetr_pos = RD_KAFKA_FETCH_POS(hdr.Offset, msetr->msetr_leader_epoch); - /* Skip message if outdated */ - if (hdr.Offset < rktp->rktp_offsets.fetch_offset) { - rd_rkb_dbg(msetr->msetr_rkb, MSG, "MSG", - "%s [%" PRId32 - "]: " - "Skip offset %" PRId64 " < fetch_offset %" PRId64, - rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, - hdr.Offset, rktp->rktp_offsets.fetch_offset); + /* Skip message if outdated. + * Don't check offset leader epoch, just log it, as if current leader + * epoch is different the fetch will fail (KIP-320) and if offset leader + * epoch is different it'll return an empty fetch (KIP-595). If we + * checked it, it's possible to have a loop when moving from a broker + * that supports leader epoch to one that doesn't. */ + if (hdr.Offset < rktp->rktp_offsets.fetch_pos.offset) { + rd_rkb_dbg( + msetr->msetr_rkb, MSG, "MSG", + "%s [%" PRId32 + "]: " + "Skip %s < fetch %s", + rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, + rd_kafka_fetch_pos2str(msetr_pos), + rd_kafka_fetch_pos2str(rktp->rktp_offsets.fetch_pos)); rd_kafka_buf_skip_to(rkbuf, message_end); return RD_KAFKA_RESP_ERR_NO_ERROR; /* Continue with next msg */ } @@ -765,10 +781,11 @@ rd_kafka_msgset_reader_msg_v2(rd_kafka_msgset_reader_t *msetr) { rkbuf, "%s [%" PRId32 "]: " - "Ctrl message at offset %" PRId64 + "Ctrl message at %s" " has invalid key size %" PRId64, rktp->rktp_rkt->rkt_topic->str, - rktp->rktp_partition, hdr.Offset, + rktp->rktp_partition, + rd_kafka_fetch_pos2str(msetr_pos), ctrl_data.KeySize); rd_kafka_buf_read_i16(rkbuf, &ctrl_data.Version); @@ -778,11 +795,10 @@ rd_kafka_msgset_reader_msg_v2(rd_kafka_msgset_reader_t *msetr) { "%s [%" PRId32 "]: " "Skipping ctrl msg with " - "unsupported version %" PRId16 - " at offset %" PRId64, + "unsupported version %" PRId16 " at %s", rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, ctrl_data.Version, - hdr.Offset); + rd_kafka_fetch_pos2str(msetr_pos)); rd_kafka_buf_skip_to(rkbuf, message_end); return RD_KAFKA_RESP_ERR_NO_ERROR; /* Continue with next msg */ @@ -793,10 +809,11 @@ rd_kafka_msgset_reader_msg_v2(rd_kafka_msgset_reader_t *msetr) { rkbuf, "%s [%" PRId32 "]: " - "Ctrl message at offset %" PRId64 + "Ctrl message at %s" " has invalid key size %" PRId64, rktp->rktp_rkt->rkt_topic->str, - rktp->rktp_partition, hdr.Offset, + rktp->rktp_partition, + rd_kafka_fetch_pos2str(msetr_pos), ctrl_data.KeySize); rd_kafka_buf_read_i16(rkbuf, &ctrl_data.Type); @@ -821,14 +838,15 @@ rd_kafka_msgset_reader_msg_v2(rd_kafka_msgset_reader_t *msetr) { MSG | RD_KAFKA_DBG_EOS, "TXN", "%s [%" PRId32 "] received abort txn " - "ctrl msg at offset %" PRId64 + "ctrl msg at %s" " for " "PID %" PRId64 ", but there are no " "known aborted transactions: " "ignoring", rktp->rktp_rkt->rkt_topic->str, - rktp->rktp_partition, hdr.Offset, + rktp->rktp_partition, + rd_kafka_fetch_pos2str(msetr_pos), msetr->msetr_v2_hdr->PID); break; } @@ -838,14 +856,14 @@ rd_kafka_msgset_reader_msg_v2(rd_kafka_msgset_reader_t *msetr) { aborted_txn_start_offset = rd_kafka_aborted_txns_pop_offset( msetr->msetr_aborted_txns, - msetr->msetr_v2_hdr->PID, hdr.Offset); + msetr->msetr_v2_hdr->PID, msetr_pos.offset); 
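
/*
 * [Illustrative aside, not part of the upstream patch.]
 * The hunks above thread an (offset, leader epoch) pair -- the diff's
 * rd_kafka_fetch_pos_t, built via RD_KAFKA_FETCH_POS() -- through the
 * message reader in place of a bare offset. Note that the skip-outdated
 * check still compares offsets only: per the comment in the hunk, epoch
 * mismatches are left for the broker to reject (KIP-320) or answer with
 * an empty fetch (KIP-595), which avoids a skip loop when moving between
 * brokers with and without leader-epoch support. A minimal standalone
 * sketch of that rule follows; the struct fields mirror the diff, while
 * msg_is_outdated() and the driver are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct fetch_pos_s {
        int64_t offset;       /* Next offset to fetch. */
        int32_t leader_epoch; /* Epoch that offset belongs to, or -1. */
} fetch_pos_t;

/* Same shape as the check in rd_kafka_msgset_reader_msg_v2(): only the
 * offset decides; the epoch is carried along for logging and later
 * validation, never used to drop a message here. */
static bool msg_is_outdated(fetch_pos_t msg, fetch_pos_t fetch) {
        return msg.offset < fetch.offset;
}

int main(void) {
        fetch_pos_t fetch = {.offset = 100, .leader_epoch = 5};
        fetch_pos_t msg   = {.offset = 99, .leader_epoch = 4};

        if (msg_is_outdated(msg, fetch))
                printf("Skip [%lld, epoch %d] < fetch [%lld, epoch %d]\n",
                       (long long)msg.offset, (int)msg.leader_epoch,
                       (long long)fetch.offset, (int)fetch.leader_epoch);
        return 0;
}
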
if (unlikely(aborted_txn_start_offset == -1)) { rd_rkb_dbg(msetr->msetr_rkb, MSG | RD_KAFKA_DBG_EOS, "TXN", "%s [%" PRId32 "] received abort txn " - "ctrl msg at offset %" PRId64 + "ctrl msg at %s" " for " "PID %" PRId64 ", but this offset is " @@ -853,7 +871,8 @@ rd_kafka_msgset_reader_msg_v2(rd_kafka_msgset_reader_t *msetr) { "transaction: aborted transaction " "was possibly empty: ignoring", rktp->rktp_rkt->rkt_topic->str, - rktp->rktp_partition, hdr.Offset, + rktp->rktp_partition, + rd_kafka_fetch_pos2str(msetr_pos), msetr->msetr_v2_hdr->PID); break; } @@ -867,16 +886,16 @@ rd_kafka_msgset_reader_msg_v2(rd_kafka_msgset_reader_t *msetr) { "]: " "Unsupported ctrl message " "type %" PRId16 - " at offset" - " %" PRId64 ": ignoring", + " at " + " %s: ignoring", rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, ctrl_data.Type, - hdr.Offset); + rd_kafka_fetch_pos2str(msetr_pos)); break; } rko = rd_kafka_op_new_ctrl_msg(rktp, msetr->msetr_tver->version, - rkbuf, hdr.Offset); + rkbuf, msetr_pos); rd_kafka_q_enq(&msetr->msetr_rkq, rko); msetr->msetr_msgcnt++; @@ -888,8 +907,8 @@ rd_kafka_msgset_reader_msg_v2(rd_kafka_msgset_reader_t *msetr) { /* Note: messages in aborted transactions are skipped at the MessageSet * level */ - rd_kafka_buf_read_bytes_varint(rkbuf, &hdr.Key); - rd_kafka_buf_read_bytes_varint(rkbuf, &hdr.Value); + rd_kafka_buf_read_kbytes_varint(rkbuf, &hdr.Key); + rd_kafka_buf_read_kbytes_varint(rkbuf, &hdr.Value); /* We parse the Headers later, just store the size (possibly truncated) * and pointer to the headers. */ @@ -899,7 +918,7 @@ rd_kafka_msgset_reader_msg_v2(rd_kafka_msgset_reader_t *msetr) { /* Create op/message container for message. */ rko = rd_kafka_op_new_fetch_msg( - &rkm, rktp, msetr->msetr_tver->version, rkbuf, hdr.Offset, + &rkm, rktp, msetr->msetr_tver->version, rkbuf, msetr_pos, (size_t)RD_KAFKAP_BYTES_LEN(&hdr.Key), RD_KAFKAP_BYTES_IS_NULL(&hdr.Key) ? NULL : hdr.Key.data, (size_t)RD_KAFKAP_BYTES_LEN(&hdr.Value), @@ -1045,6 +1064,8 @@ rd_kafka_msgset_reader_v2(rd_kafka_msgset_reader_t *msetr) { RD_KAFKAP_MSGSET_V2_SIZE - 8 - 4); rd_kafka_buf_read_i32(rkbuf, &hdr.PartitionLeaderEpoch); + msetr->msetr_leader_epoch = hdr.PartitionLeaderEpoch; + rd_kafka_buf_read_i8(rkbuf, &hdr.MagicByte); rd_kafka_buf_read_i32(rkbuf, &hdr.Crc); @@ -1105,7 +1126,7 @@ rd_kafka_msgset_reader_v2(rd_kafka_msgset_reader_t *msetr) { hdr.BaseOffset, payload_size); /* If entire MessageSet contains old outdated offsets, skip it. */ - if (LastOffset < rktp->rktp_offsets.fetch_offset) { + if (LastOffset < rktp->rktp_offsets.fetch_pos.offset) { rd_kafka_buf_skip(rkbuf, payload_size); goto done; } @@ -1215,7 +1236,8 @@ rd_kafka_msgset_reader_peek_msg_version(rd_kafka_msgset_reader_t *msetr, (int)*MagicBytep, Offset, read_offset, rd_slice_size(&rkbuf->rkbuf_reader)); - if (Offset >= msetr->msetr_rktp->rktp_offsets.fetch_offset) { + if (Offset >= + msetr->msetr_rktp->rktp_offsets.fetch_pos.offset) { rd_kafka_consumer_err( &msetr->msetr_rkq, msetr->msetr_broker_id, RD_KAFKA_RESP_ERR__NOT_IMPLEMENTED, @@ -1224,7 +1246,7 @@ rd_kafka_msgset_reader_peek_msg_version(rd_kafka_msgset_reader_t *msetr, "at offset %" PRId64, (int)*MagicBytep, Offset); /* Skip message(set) */ - msetr->msetr_rktp->rktp_offsets.fetch_offset = + msetr->msetr_rktp->rktp_offsets.fetch_pos.offset = Offset + 1; } @@ -1311,7 +1333,7 @@ static void rd_kafka_msgset_reader_postproc(rd_kafka_msgset_reader_t *msetr, * fetch offset. 
*/ rd_kafka_q_fix_offsets( &msetr->msetr_rkq, - msetr->msetr_rktp->rktp_offsets.fetch_offset, + msetr->msetr_rktp->rktp_offsets.fetch_pos.offset, msetr->msetr_outer.offset - *last_offsetp); } } @@ -1376,11 +1398,11 @@ rd_kafka_msgset_reader_run(rd_kafka_msgset_reader_t *msetr) { &msetr->msetr_rkq, msetr->msetr_broker_id, RD_KAFKA_RESP_ERR_MSG_SIZE_TOO_LARGE, msetr->msetr_tver->version, NULL, rktp, - rktp->rktp_offsets.fetch_offset, + rktp->rktp_offsets.fetch_pos.offset, "Message at offset %" PRId64 " might be too large to fetch, try increasing " "receive.message.max.bytes", - rktp->rktp_offsets.fetch_offset); + rktp->rktp_offsets.fetch_pos.offset); } else if (msetr->msetr_aborted_cnt > 0) { /* Noop */ @@ -1421,13 +1443,15 @@ rd_kafka_msgset_reader_run(rd_kafka_msgset_reader_t *msetr) { /* Update partition's fetch offset based on * last message's offest. */ if (likely(last_offset != -1)) - rktp->rktp_offsets.fetch_offset = last_offset + 1; + rktp->rktp_offsets.fetch_pos.offset = last_offset + 1; } /* Adjust next fetch offset if outlier code has indicated * an even later next offset. */ - if (msetr->msetr_next_offset > rktp->rktp_offsets.fetch_offset) - rktp->rktp_offsets.fetch_offset = msetr->msetr_next_offset; + if (msetr->msetr_next_offset > rktp->rktp_offsets.fetch_pos.offset) + rktp->rktp_offsets.fetch_pos.offset = msetr->msetr_next_offset; + + rktp->rktp_offsets.fetch_pos.leader_epoch = msetr->msetr_leader_epoch; rd_kafka_q_destroy_owner(&msetr->msetr_rkq); diff --git a/src/third_party/librdkafka/dist/src/rdkafka_msgset_writer.c b/src/third_party/librdkafka/dist/src/rdkafka_msgset_writer.c index 694162e50f7..6324d152f26 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_msgset_writer.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_msgset_writer.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -45,7 +46,7 @@ /** @brief The maxium ProduceRequestion ApiVersion supported by librdkafka */ -static const int16_t rd_kafka_ProduceRequest_max_version = 7; +static const int16_t rd_kafka_ProduceRequest_max_version = 10; typedef struct rd_kafka_msgset_writer_s { @@ -113,9 +114,9 @@ rd_kafka_msgset_writer_select_MsgVersion(rd_kafka_msgset_writer_t *msetw) { int feature; int16_t ApiVersion; } compr_req[RD_KAFKA_COMPRESSION_NUM] = { - [RD_KAFKA_COMPRESSION_KLZ4] = {RD_KAFKA_FEATURE_KLZ4, 0}, + [RD_KAFKA_COMPRESSION_KLZ4] = {RD_KAFKA_FEATURE_KLZ4, 0}, #if WITH_ZSTD - [RD_KAFKA_COMPRESSION_ZSTD] = {RD_KAFKA_FEATURE_ZSTD, 7}, + [RD_KAFKA_COMPRESSION_ZSTD] = {RD_KAFKA_FEATURE_ZSTD, 7}, #endif }; @@ -267,6 +268,9 @@ static void rd_kafka_msgset_writer_alloc_buf(rd_kafka_msgset_writer_t *msetw) { * ProduceRequest header sizes */ switch (msetw->msetw_ApiVersion) { + case 10: + case 9: + case 8: case 7: case 6: case 5: @@ -352,9 +356,10 @@ static void rd_kafka_msgset_writer_alloc_buf(rd_kafka_msgset_writer_t *msetw) { * Allocate iovecs to hold all headers and messages, * and allocate auxilliery space for message headers, etc. 
*/ - msetw->msetw_rkbuf = - rd_kafka_buf_new_request(msetw->msetw_rkb, RD_KAFKAP_Produce, - msetw->msetw_msgcntmax / 2 + 10, bufsize); + msetw->msetw_rkbuf = rd_kafka_buf_new_flexver_request( + msetw->msetw_rkb, RD_KAFKAP_Produce, + msetw->msetw_msgcntmax / 2 + 10, bufsize, + msetw->msetw_ApiVersion >= 9); rd_kafka_buf_ApiVersion_set(msetw->msetw_rkbuf, msetw->msetw_ApiVersion, msetw->msetw_features); @@ -441,19 +446,19 @@ rd_kafka_msgset_writer_write_Produce_header(rd_kafka_msgset_writer_t *msetw) { rd_kafka_buf_write_i32(rkbuf, rkt->rkt_conf.request_timeout_ms); /* TopicArrayCnt */ - rd_kafka_buf_write_i32(rkbuf, 1); + rd_kafka_buf_write_arraycnt(rkbuf, 1); /* Insert topic */ rd_kafka_buf_write_kstr(rkbuf, rkt->rkt_topic); /* PartitionArrayCnt */ - rd_kafka_buf_write_i32(rkbuf, 1); + rd_kafka_buf_write_arraycnt(rkbuf, 1); /* Partition */ rd_kafka_buf_write_i32(rkbuf, msetw->msetw_rktp->rktp_partition); /* MessageSetSize: Will be finalized later*/ - msetw->msetw_of_MessageSetSize = rd_kafka_buf_write_i32(rkbuf, 0); + msetw->msetw_of_MessageSetSize = rd_kafka_buf_write_arraycnt_pos(rkbuf); if (msetw->msetw_MsgVersion == 2) { /* MessageSet v2 header */ @@ -700,8 +705,8 @@ rd_kafka_msgset_writer_write_msg_v2(rd_kafka_msgset_writer_t *msetw, sizeof(varint_OffsetDelta), Offset); sz_KeyLen = rd_uvarint_enc_i32(varint_KeyLen, sizeof(varint_KeyLen), rkm->rkm_key - ? (int32_t)rkm->rkm_key_len - : (int32_t)RD_KAFKAP_BYTES_LEN_NULL); + ? (int32_t)rkm->rkm_key_len + : (int32_t)RD_KAFKAP_BYTES_LEN_NULL); sz_ValueLen = rd_uvarint_enc_i32( varint_ValueLen, sizeof(varint_ValueLen), rkm->rkm_payload ? (int32_t)rkm->rkm_len @@ -941,21 +946,18 @@ static int rd_kafka_msgset_writer_write_msgq(rd_kafka_msgset_writer_t *msetw, #if WITH_ZLIB /** - * @brief Compress messageset using gzip/zlib + * @brief Compress slice using gzip/zlib */ -static int rd_kafka_msgset_writer_compress_gzip(rd_kafka_msgset_writer_t *msetw, - rd_slice_t *slice, - struct iovec *ciov) { - - rd_kafka_broker_t *rkb = msetw->msetw_rkb; - rd_kafka_toppar_t *rktp = msetw->msetw_rktp; +rd_kafka_resp_err_t rd_kafka_gzip_compress(rd_kafka_broker_t *rkb, + int comp_level, + rd_slice_t *slice, + void **outbuf, + size_t *outlenp) { z_stream strm; size_t len = rd_slice_remains(slice); const void *p; size_t rlen; int r; - int comp_level = - msetw->msetw_rktp->rktp_rkt->rkt_conf.compression_level; memset(&strm, 0, sizeof(strm)); r = deflateInit2(&strm, comp_level, Z_DEFLATED, 15 + 16, 8, @@ -964,23 +966,21 @@ static int rd_kafka_msgset_writer_compress_gzip(rd_kafka_msgset_writer_t *msetw, rd_rkb_log(rkb, LOG_ERR, "GZIP", "Failed to initialize gzip for " "compressing %" PRIusz - " bytes in " - "topic %.*s [%" PRId32 - "]: %s (%i): " + " bytes: " + "%s (%i): " "sending uncompressed", - len, RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), - rktp->rktp_partition, strm.msg ? strm.msg : "", r); - return -1; + len, strm.msg ? strm.msg : "", r); + return RD_KAFKA_RESP_ERR__BAD_COMPRESSION; } /* Calculate maximum compressed size and * allocate an output buffer accordingly, being * prefixed with the Message header. */ - ciov->iov_len = deflateBound(&strm, (uLong)rd_slice_remains(slice)); - ciov->iov_base = rd_malloc(ciov->iov_len); + *outlenp = deflateBound(&strm, (uLong)rd_slice_remains(slice)); + *outbuf = rd_malloc(*outlenp); - strm.next_out = (void *)ciov->iov_base; - strm.avail_out = (uInt)ciov->iov_len; + strm.next_out = *outbuf; + strm.avail_out = (uInt)*outlenp; /* Iterate through each segment and compress it. 
*/ while ((rlen = rd_slice_reader(slice, &p))) { @@ -993,18 +993,14 @@ static int rd_kafka_msgset_writer_compress_gzip(rd_kafka_msgset_writer_t *msetw, rd_rkb_log(rkb, LOG_ERR, "GZIP", "Failed to gzip-compress " "%" PRIusz " bytes (%" PRIusz - " total) for " - "topic %.*s [%" PRId32 - "]: " + " total): " "%s (%i): " "sending uncompressed", - rlen, len, - RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), - rktp->rktp_partition, - strm.msg ? strm.msg : "", r); + rlen, len, strm.msg ? strm.msg : "", r); deflateEnd(&strm); - rd_free(ciov->iov_base); - return -1; + rd_free(*outbuf); + *outbuf = NULL; + return RD_KAFKA_RESP_ERR__BAD_COMPRESSION; } rd_kafka_assert(rkb->rkb_rk, strm.avail_in == 0); @@ -1015,51 +1011,62 @@ static int rd_kafka_msgset_writer_compress_gzip(rd_kafka_msgset_writer_t *msetw, rd_rkb_log(rkb, LOG_ERR, "GZIP", "Failed to finish gzip compression " " of %" PRIusz - " bytes for " - "topic %.*s [%" PRId32 - "]: " + " bytes: " "%s (%i): " "sending uncompressed", - len, RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), - rktp->rktp_partition, strm.msg ? strm.msg : "", r); + len, strm.msg ? strm.msg : "", r); deflateEnd(&strm); - rd_free(ciov->iov_base); - return -1; + rd_free(*outbuf); + *outbuf = NULL; + return RD_KAFKA_RESP_ERR__BAD_COMPRESSION; } - ciov->iov_len = strm.total_out; + *outlenp = strm.total_out; /* Deinitialize compression */ deflateEnd(&strm); - return 0; + return RD_KAFKA_RESP_ERR_NO_ERROR; +} + +/** + * @brief Compress messageset using gzip/zlib + */ +static int rd_kafka_msgset_writer_compress_gzip(rd_kafka_msgset_writer_t *msetw, + rd_slice_t *slice, + struct iovec *ciov) { + rd_kafka_resp_err_t err; + int comp_level = + msetw->msetw_rktp->rktp_rkt->rkt_conf.compression_level; + err = rd_kafka_gzip_compress(msetw->msetw_rkb, comp_level, slice, + &ciov->iov_base, &ciov->iov_len); + return (err ? -1 : 0); } #endif #if WITH_SNAPPY /** - * @brief Compress messageset using Snappy + * @brief Compress slice using Snappy */ -static int -rd_kafka_msgset_writer_compress_snappy(rd_kafka_msgset_writer_t *msetw, - rd_slice_t *slice, - struct iovec *ciov) { - rd_kafka_broker_t *rkb = msetw->msetw_rkb; - rd_kafka_toppar_t *rktp = msetw->msetw_rktp; +rd_kafka_resp_err_t rd_kafka_snappy_compress_slice(rd_kafka_broker_t *rkb, + rd_slice_t *slice, + void **outbuf, + size_t *outlenp) { struct iovec *iov; size_t iov_max, iov_cnt; struct snappy_env senv; size_t len = rd_slice_remains(slice); int r; + struct iovec ciov; /* Initialize snappy compression environment */ rd_kafka_snappy_init_env_sg(&senv, 1 /*iov enable*/); /* Calculate maximum compressed size and * allocate an output buffer accordingly. 
*/ - ciov->iov_len = rd_kafka_snappy_max_compressed_length(len); - ciov->iov_base = rd_malloc(ciov->iov_len); + ciov.iov_len = rd_kafka_snappy_max_compressed_length(len); + ciov.iov_base = rd_malloc(ciov.iov_len); iov_max = slice->buf->rbuf_segment_cnt; iov = rd_alloca(sizeof(*iov) * iov_max); @@ -1068,24 +1075,37 @@ rd_kafka_msgset_writer_compress_snappy(rd_kafka_msgset_writer_t *msetw, /* Compress each message */ if ((r = rd_kafka_snappy_compress_iov(&senv, iov, iov_cnt, len, - ciov)) != 0) { + &ciov)) != 0) { rd_rkb_log(rkb, LOG_ERR, "SNAPPY", "Failed to snappy-compress " "%" PRIusz - " bytes for " - "topic %.*s [%" PRId32 - "]: %s: " + " bytes: %s:" "sending uncompressed", - len, RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), - rktp->rktp_partition, rd_strerror(-r)); - rd_free(ciov->iov_base); - return -1; + len, rd_strerror(-r)); + rd_free(ciov.iov_base); + return RD_KAFKA_RESP_ERR__BAD_COMPRESSION; } /* rd_free snappy environment */ rd_kafka_snappy_free_env(&senv); - return 0; + *outbuf = ciov.iov_base; + *outlenp = ciov.iov_len; + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} + +/** + * @brief Compress messageset using Snappy + */ +static int +rd_kafka_msgset_writer_compress_snappy(rd_kafka_msgset_writer_t *msetw, + rd_slice_t *slice, + struct iovec *ciov) { + rd_kafka_resp_err_t err; + err = rd_kafka_snappy_compress_slice(msetw->msetw_rkb, slice, + &ciov->iov_base, &ciov->iov_len); + return (err ? -1 : 0); } #endif @@ -1315,9 +1335,9 @@ rd_kafka_msgset_writer_finalize_MessageSet(rd_kafka_msgset_writer_t *msetw) { RD_KAFKAP_MSGSET_V0_SIZE + msetw->msetw_messages_len; /* Update MessageSetSize */ - rd_kafka_buf_update_i32(msetw->msetw_rkbuf, - msetw->msetw_of_MessageSetSize, - (int32_t)msetw->msetw_MessageSetSize); + rd_kafka_buf_finalize_arraycnt(msetw->msetw_rkbuf, + msetw->msetw_of_MessageSetSize, + (int32_t)msetw->msetw_MessageSetSize); } @@ -1377,6 +1397,11 @@ rd_kafka_msgset_writer_finalize(rd_kafka_msgset_writer_t *msetw, /* Finalize MessageSet header fields */ rd_kafka_msgset_writer_finalize_MessageSet(msetw); + /* Partition tags */ + rd_kafka_buf_write_tags_empty(rkbuf); + /* Topics tags */ + rd_kafka_buf_write_tags_empty(rkbuf); + /* Return final MessageSetSize */ *MessageSetSizep = msetw->msetw_MessageSetSize; diff --git a/src/third_party/librdkafka/dist/src/rdkafka_offset.c b/src/third_party/librdkafka/dist/src/rdkafka_offset.c index 805da2d18b9..cf21d60c55b 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_offset.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_offset.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012,2013 Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -52,6 +53,7 @@ #include "rdkafka_partition.h" #include "rdkafka_offset.h" #include "rdkafka_broker.h" +#include "rdkafka_request.h" #include #include @@ -261,7 +263,7 @@ rd_kafka_offset_file_commit(rd_kafka_toppar_t *rktp) { rd_kafka_topic_t *rkt = rktp->rktp_rkt; int attempt; rd_kafka_resp_err_t err = RD_KAFKA_RESP_ERR_NO_ERROR; - int64_t offset = rktp->rktp_stored_offset; + int64_t offset = rktp->rktp_stored_pos.offset; for (attempt = 0; attempt < 2; attempt++) { char buf[22]; @@ -322,7 +324,7 @@ rd_kafka_offset_file_commit(rd_kafka_toppar_t *rktp) { rktp->rktp_partition, offset, rktp->rktp_offset_path); - rktp->rktp_committed_offset = offset; + rktp->rktp_committed_pos.offset = offset; /* If sync interval is set to immediate we sync right away. */ if (rkt->rkt_conf.offset_store_sync_interval_ms == 0) @@ -378,8 +380,6 @@ rd_kafka_commit0(rd_kafka_t *rk, return RD_KAFKA_RESP_ERR_NO_ERROR; } - - /** * NOTE: 'offsets' may be NULL, see official documentation. */ @@ -528,7 +528,7 @@ rd_kafka_offset_broker_commit_cb(rd_kafka_t *rk, rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, rktpar->offset, err ? "not " : "", rd_kafka_err2str(err)); - rktp->rktp_committing_offset = 0; + rktp->rktp_committing_pos.offset = 0; rd_kafka_toppar_lock(rktp); if (rktp->rktp_flags & RD_KAFKA_TOPPAR_F_OFFSET_STORE_STOPPING) @@ -539,6 +539,9 @@ rd_kafka_offset_broker_commit_cb(rd_kafka_t *rk, } +/** + * @locks_required rd_kafka_toppar_lock(rktp) MUST be held. + */ static rd_kafka_resp_err_t rd_kafka_offset_broker_commit(rd_kafka_toppar_t *rktp, const char *reason) { rd_kafka_topic_partition_list_t *offsets; @@ -548,18 +551,21 @@ rd_kafka_offset_broker_commit(rd_kafka_toppar_t *rktp, const char *reason) { rd_kafka_assert(rktp->rktp_rkt->rkt_rk, rktp->rktp_flags & RD_KAFKA_TOPPAR_F_OFFSET_STORE); - rktp->rktp_committing_offset = rktp->rktp_stored_offset; + rktp->rktp_committing_pos = rktp->rktp_stored_pos; offsets = rd_kafka_topic_partition_list_new(1); rktpar = rd_kafka_topic_partition_list_add( offsets, rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition); - rktpar->offset = rktp->rktp_committing_offset; + + rd_kafka_topic_partition_set_from_fetch_pos(rktpar, + rktp->rktp_committing_pos); + rd_kafka_topic_partition_set_metadata_from_rktp_stored(rktpar, rktp); rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "OFFSETCMT", - "%.*s [%" PRId32 "]: committing offset %" PRId64 ": %s", + "%.*s [%" PRId32 "]: committing %s: %s", RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), - rktp->rktp_partition, rktp->rktp_committing_offset, - reason); + rktp->rktp_partition, + rd_kafka_fetch_pos2str(rktp->rktp_committing_pos), reason); rd_kafka_commit0(rktp->rktp_rkt->rkt_rk, offsets, rktp, RD_KAFKA_REPLYQ(rktp->rktp_ops, 0), @@ -580,21 +586,20 @@ rd_kafka_offset_broker_commit(rd_kafka_toppar_t *rktp, const char *reason) { */ static rd_kafka_resp_err_t rd_kafka_offset_commit(rd_kafka_toppar_t *rktp, const char *reason) { - if (1) // FIXME - rd_kafka_dbg( - rktp->rktp_rkt->rkt_rk, TOPIC, "OFFSET", - "%s [%" PRId32 - "]: commit: " - "stored offset %" PRId64 " > committed offset %" PRId64 "?", - rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, - rktp->rktp_stored_offset, rktp->rktp_committed_offset); + rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "OFFSET", + "%s [%" PRId32 "]: commit: stored %s > committed %s?", + rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, + rd_kafka_fetch_pos2str(rktp->rktp_stored_pos), + 
rd_kafka_fetch_pos2str(rktp->rktp_committed_pos)); /* Already committed */ - if (rktp->rktp_stored_offset <= rktp->rktp_committed_offset) + if (rd_kafka_fetch_pos_cmp(&rktp->rktp_stored_pos, + &rktp->rktp_committed_pos) <= 0) return RD_KAFKA_RESP_ERR_NO_ERROR; /* Already committing (for async ops) */ - if (rktp->rktp_stored_offset <= rktp->rktp_committing_offset) + if (rd_kafka_fetch_pos_cmp(&rktp->rktp_stored_pos, + &rktp->rktp_committing_pos) <= 0) return RD_KAFKA_RESP_ERR__PREV_IN_PROGRESS; switch (rktp->rktp_rkt->rkt_conf.offset_store_method) { @@ -630,6 +635,8 @@ rd_kafka_resp_err_t rd_kafka_offset_sync(rd_kafka_toppar_t *rktp) { * Typically called from application code. * * NOTE: No locks must be held. + * + * @deprecated Use rd_kafka_offsets_store(). */ rd_kafka_resp_err_t rd_kafka_offset_store(rd_kafka_topic_t *app_rkt, int32_t partition, @@ -637,6 +644,8 @@ rd_kafka_resp_err_t rd_kafka_offset_store(rd_kafka_topic_t *app_rkt, rd_kafka_topic_t *rkt = rd_kafka_topic_proper(app_rkt); rd_kafka_toppar_t *rktp; rd_kafka_resp_err_t err; + rd_kafka_fetch_pos_t pos = + RD_KAFKA_FETCH_POS(offset + 1, -1 /*no leader epoch known*/); /* Find toppar */ rd_kafka_topic_rdlock(rkt); @@ -646,7 +655,7 @@ rd_kafka_resp_err_t rd_kafka_offset_store(rd_kafka_topic_t *app_rkt, } rd_kafka_topic_rdunlock(rkt); - err = rd_kafka_offset_store0(rktp, offset + 1, + err = rd_kafka_offset_store0(rktp, pos, NULL, 0, rd_false /* Don't force */, RD_DO_LOCK); rd_kafka_toppar_destroy(rktp); @@ -668,6 +677,8 @@ rd_kafka_offsets_store(rd_kafka_t *rk, for (i = 0; i < offsets->cnt; i++) { rd_kafka_topic_partition_t *rktpar = &offsets->elems[i]; rd_kafka_toppar_t *rktp; + rd_kafka_fetch_pos_t pos = + RD_KAFKA_FETCH_POS(rktpar->offset, -1); rktp = rd_kafka_topic_partition_get_toppar(rk, rktpar, rd_false); @@ -677,9 +688,12 @@ rd_kafka_offsets_store(rd_kafka_t *rk, continue; } - rktpar->err = rd_kafka_offset_store0(rktp, rktpar->offset, - rd_false /* don't force */, - RD_DO_LOCK); + pos.leader_epoch = + rd_kafka_topic_partition_get_leader_epoch(rktpar); + + rktpar->err = rd_kafka_offset_store0( + rktp, pos, rktpar->metadata, rktpar->metadata_size, + rd_false /* don't force */, RD_DO_LOCK); rd_kafka_toppar_destroy(rktp); if (rktpar->err) @@ -693,6 +707,39 @@ rd_kafka_offsets_store(rd_kafka_t *rk, } +rd_kafka_error_t *rd_kafka_offset_store_message(rd_kafka_message_t *rkmessage) { + rd_kafka_toppar_t *rktp; + rd_kafka_op_t *rko; + rd_kafka_resp_err_t err; + rd_kafka_msg_t *rkm = (rd_kafka_msg_t *)rkmessage; + rd_kafka_fetch_pos_t pos; + + if (rkmessage->err) + return rd_kafka_error_new(RD_KAFKA_RESP_ERR__INVALID_ARG, + "Message object must not have an " + "error set"); + + if (unlikely(!(rko = rd_kafka_message2rko(rkmessage)) || + !(rktp = rko->rko_rktp))) + return rd_kafka_error_new(RD_KAFKA_RESP_ERR__INVALID_ARG, + "Invalid message object, " + "not a consumed message"); + + pos = RD_KAFKA_FETCH_POS(rkmessage->offset + 1, + rkm->rkm_u.consumer.leader_epoch); + err = rd_kafka_offset_store0(rktp, pos, NULL, 0, + rd_false /* Don't force */, RD_DO_LOCK); + + if (err == RD_KAFKA_RESP_ERR__STATE) + return rd_kafka_error_new(err, "Partition is not assigned"); + else if (err) + return rd_kafka_error_new(err, "Failed to store offset: %s", + rd_kafka_err2str(err)); + + return NULL; +} + + /** * Decommissions the use of an offset file for a toppar. 
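
/*
 * [Illustrative aside, not part of the upstream patch.]
 * The store path above now records a full fetch position instead of a
 * bare offset: rd_kafka_offset_store() and rd_kafka_offset_store_message()
 * store message offset + 1 (the next position to consume), attaching the
 * message's leader epoch when known and -1 otherwise, and the commit
 * checks order stored vs. committed positions with
 * rd_kafka_fetch_pos_cmp(). The sketch below restates those conventions
 * in standalone C. The comparator's epoch-before-offset ordering is an
 * assumption (the real comparator is defined outside this excerpt), and
 * the helper names are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct fetch_pos_s {
        int64_t offset;       /* Next offset to consume/commit. */
        int32_t leader_epoch; /* Epoch of that offset, or -1 if unknown. */
} fetch_pos_t;

/* rd_kafka_offset_store_message(): message offset + 1, paired with the
 * epoch the message was fetched under (rkm_u.consumer.leader_epoch). */
static fetch_pos_t pos_for_message(int64_t msg_offset, int32_t msg_epoch) {
        return (fetch_pos_t) {msg_offset + 1, msg_epoch};
}

/* Legacy rd_kafka_offset_store(): the same + 1, but with no epoch to
 * forward (-1 = unknown). */
static fetch_pos_t pos_for_legacy_store(int64_t offset) {
        return (fetch_pos_t) {offset + 1, -1};
}

/* Assumed ordering: a newer epoch always denotes a later position, with
 * the offset as tie-breaker within an epoch. */
static int fetch_pos_cmp(const fetch_pos_t *a, const fetch_pos_t *b) {
        if (a->leader_epoch != b->leader_epoch)
                return a->leader_epoch < b->leader_epoch ? -1 : 1;
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        return 0;
}

int main(void) {
        fetch_pos_t stored    = pos_for_message(41, 7);
        fetch_pos_t committed = pos_for_legacy_store(40);

        /* Mirrors the "already committed?" guard in rd_kafka_offset_commit():
         * commit only when the stored position is strictly ahead. */
        if (fetch_pos_cmp(&stored, &committed) > 0)
                printf("commit offset %lld (leader epoch %d)\n",
                       (long long)stored.offset, (int)stored.leader_epoch);
        return 0;
}
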
@@ -723,7 +770,7 @@ static rd_kafka_op_res_t rd_kafka_offset_reset_op_cb(rd_kafka_t *rk, rd_kafka_toppar_t *rktp = rko->rko_rktp; rd_kafka_toppar_lock(rktp); rd_kafka_offset_reset(rktp, rko->rko_u.offset_reset.broker_id, - rko->rko_u.offset_reset.offset, rko->rko_err, + rko->rko_u.offset_reset.pos, rko->rko_err, "%s", rko->rko_u.offset_reset.reason); rd_kafka_toppar_unlock(rktp); return RD_KAFKA_OP_RES_HANDLED; @@ -735,20 +782,27 @@ static rd_kafka_op_res_t rd_kafka_offset_reset_op_cb(rd_kafka_t *rk, * * @param rktp the toppar * @param broker_id Originating broker, if any, else RD_KAFKA_NODEID_UA. - * @param err_offset a logical offset, or offset corresponding to the error. + * @param err_pos a logical offset, or offset corresponding to the error. * @param err the error, or RD_KAFKA_RESP_ERR_NO_ERROR if offset is logical. - * @param reason a reason string for logging. + * @param fmt a reason string for logging. * - * @locality: any. if not main thread, work will be enqued on main thread. - * @ocks: toppar_lock() MUST be held + * @locality any. if not main thread, work will be enqued on main thread. + * @locks_required toppar_lock() MUST be held */ void rd_kafka_offset_reset(rd_kafka_toppar_t *rktp, int32_t broker_id, - int64_t err_offset, + rd_kafka_fetch_pos_t err_pos, rd_kafka_resp_err_t err, - const char *reason) { - int64_t offset = RD_KAFKA_OFFSET_INVALID; - const char *extra = ""; + const char *fmt, + ...) { + rd_kafka_fetch_pos_t pos = {RD_KAFKA_OFFSET_INVALID, -1}; + const char *extra = ""; + char reason[512]; + va_list ap; + + va_start(ap, fmt); + rd_vsnprintf(reason, sizeof(reason), fmt, ap); + va_end(ap); /* Enqueue op for toppar handler thread if we're on the wrong thread. */ if (!thrd_is_current(rktp->rktp_rkt->rkt_rk->rk_thread)) { @@ -758,48 +812,49 @@ void rd_kafka_offset_reset(rd_kafka_toppar_t *rktp, rko->rko_err = err; rko->rko_rktp = rd_kafka_toppar_keep(rktp); rko->rko_u.offset_reset.broker_id = broker_id; - rko->rko_u.offset_reset.offset = err_offset; + rko->rko_u.offset_reset.pos = err_pos; rko->rko_u.offset_reset.reason = rd_strdup(reason); rd_kafka_q_enq(rktp->rktp_ops, rko); return; } - if (err_offset == RD_KAFKA_OFFSET_INVALID || err) - offset = rktp->rktp_rkt->rkt_conf.auto_offset_reset; + if (err_pos.offset == RD_KAFKA_OFFSET_INVALID || err) + pos.offset = rktp->rktp_rkt->rkt_conf.auto_offset_reset; else - offset = err_offset; + pos.offset = err_pos.offset; - if (offset == RD_KAFKA_OFFSET_INVALID) { + if (pos.offset == RD_KAFKA_OFFSET_INVALID) { /* Error, auto.offset.reset tells us to error out. */ if (broker_id != RD_KAFKA_NODEID_UA) rd_kafka_consumer_err( rktp->rktp_fetchq, broker_id, RD_KAFKA_RESP_ERR__AUTO_OFFSET_RESET, 0, NULL, rktp, - err_offset, "%s: %s (broker %" PRId32 ")", reason, - rd_kafka_err2str(err), broker_id); + err_pos.offset, "%s: %s (broker %" PRId32 ")", + reason, rd_kafka_err2str(err), broker_id); else rd_kafka_consumer_err( rktp->rktp_fetchq, broker_id, RD_KAFKA_RESP_ERR__AUTO_OFFSET_RESET, 0, NULL, rktp, - err_offset, "%s: %s", reason, + err_pos.offset, "%s: %s", reason, rd_kafka_err2str(err)); rd_kafka_toppar_set_fetch_state(rktp, RD_KAFKA_TOPPAR_FETCH_NONE); - } else if (offset == RD_KAFKA_OFFSET_BEGINNING && + } else if (pos.offset == RD_KAFKA_OFFSET_BEGINNING && rktp->rktp_lo_offset >= 0) { /* Use cached log start from last Fetch if available. * Note: The cached end offset (rktp_ls_offset) can't be * used here since the End offset is a constantly moving * target as new messages are produced. 
*/ - extra = "cached BEGINNING offset "; - offset = rktp->rktp_lo_offset; - rd_kafka_toppar_next_offset_handle(rktp, offset); + extra = "cached BEGINNING offset "; + pos.offset = rktp->rktp_lo_offset; + pos.leader_epoch = -1; + rd_kafka_toppar_next_offset_handle(rktp, pos); } else { /* Else query cluster for offset */ - rktp->rktp_query_offset = offset; + rktp->rktp_query_pos = pos; rd_kafka_toppar_set_fetch_state( rktp, RD_KAFKA_TOPPAR_FETCH_OFFSET_QUERY); } @@ -808,37 +863,368 @@ void rd_kafka_offset_reset(rd_kafka_toppar_t *rktp, * critical impact. For non-errors, or for auto.offset.reset=error, * the reason is simply debug-logged. */ if (!err || err == RD_KAFKA_RESP_ERR__NO_OFFSET || - offset == RD_KAFKA_OFFSET_INVALID) + pos.offset == RD_KAFKA_OFFSET_INVALID) rd_kafka_dbg( rktp->rktp_rkt->rkt_rk, TOPIC, "OFFSET", - "%s [%" PRId32 - "]: offset reset (at offset %s, broker %" PRId32 + "%s [%" PRId32 "]: offset reset (at %s, broker %" PRId32 ") " "to %s%s: %s: %s", rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, - rd_kafka_offset2str(err_offset), broker_id, extra, - rd_kafka_offset2str(offset), reason, rd_kafka_err2str(err)); + rd_kafka_fetch_pos2str(err_pos), broker_id, extra, + rd_kafka_fetch_pos2str(pos), reason, rd_kafka_err2str(err)); else rd_kafka_log( rktp->rktp_rkt->rkt_rk, LOG_WARNING, "OFFSET", - "%s [%" PRId32 - "]: offset reset (at offset %s, broker %" PRId32 - ") " - "to %s%s: %s: %s", + "%s [%" PRId32 "]: offset reset (at %s, broker %" PRId32 + ") to %s%s: %s: %s", rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, - rd_kafka_offset2str(err_offset), broker_id, extra, - rd_kafka_offset2str(offset), reason, rd_kafka_err2str(err)); + rd_kafka_fetch_pos2str(err_pos), broker_id, extra, + rd_kafka_fetch_pos2str(pos), reason, rd_kafka_err2str(err)); /* Note: If rktp is not delegated to the leader, then low and high offsets will necessarily be cached from the last FETCH request, and so this offset query will never occur in that case for BEGINNING / END logical offsets. */ if (rktp->rktp_fetch_state == RD_KAFKA_TOPPAR_FETCH_OFFSET_QUERY) - rd_kafka_toppar_offset_request(rktp, rktp->rktp_query_offset, + rd_kafka_toppar_offset_request(rktp, rktp->rktp_query_pos, err ? 100 : 0); } + +/** + * @brief Offset validation retry timer + */ +static void rd_kafka_offset_validate_tmr_cb(rd_kafka_timers_t *rkts, + void *arg) { + rd_kafka_toppar_t *rktp = arg; + + rd_kafka_toppar_lock(rktp); + /* Retry validation only when it's still needed. + * Even if validation can be started in fetch states ACTIVE and + * VALIDATE_EPOCH_WAIT, its retry should be done only + * in fetch state VALIDATE_EPOCH_WAIT. */ + if (rktp->rktp_fetch_state == RD_KAFKA_TOPPAR_FETCH_VALIDATE_EPOCH_WAIT) + rd_kafka_offset_validate(rktp, "retrying offset validation"); + else { + rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, FETCH, "VALIDATE", + "%.*s [%" PRId32 + "]: skipping offset " + "validation retry in fetch state %s", + RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), + rktp->rktp_partition, + rd_kafka_fetch_states[rktp->rktp_fetch_state]); + } + rd_kafka_toppar_unlock(rktp); +} + + + +/** + * @brief OffsetForLeaderEpochResponse handler that + * pushes the matched toppar's to the next state. 
+ * + * @locality rdkafka main thread + */ +static void rd_kafka_toppar_handle_OffsetForLeaderEpoch(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + rd_kafka_resp_err_t err, + rd_kafka_buf_t *rkbuf, + rd_kafka_buf_t *request, + void *opaque) { + rd_kafka_topic_partition_list_t *parts = NULL; + rd_kafka_toppar_t *rktp = opaque; + rd_kafka_topic_partition_t *rktpar; + int64_t end_offset; + int32_t end_offset_leader_epoch; + rd_kafka_toppar_lock(rktp); + rktp->rktp_flags &= ~RD_KAFKA_TOPPAR_F_VALIDATING; + rd_kafka_toppar_unlock(rktp); + + if (err == RD_KAFKA_RESP_ERR__DESTROY) { + rd_kafka_toppar_destroy(rktp); /* Drop refcnt */ + return; + } + + err = rd_kafka_handle_OffsetForLeaderEpoch(rk, rkb, err, rkbuf, request, + &parts); + + rd_kafka_toppar_lock(rktp); + + if (rktp->rktp_fetch_state != RD_KAFKA_TOPPAR_FETCH_VALIDATE_EPOCH_WAIT) + err = RD_KAFKA_RESP_ERR__OUTDATED; + + if (unlikely(!err && parts->cnt == 0)) + err = RD_KAFKA_RESP_ERR__UNKNOWN_PARTITION; + + if (!err) { + err = (&parts->elems[0])->err; + } + + if (err) { + int actions; + + rd_rkb_dbg(rkb, FETCH, "OFFSETVALID", + "%.*s [%" PRId32 + "]: OffsetForLeaderEpoch requested failed: %s", + RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), + rktp->rktp_partition, rd_kafka_err2str(err)); + + if (err == RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE) { + rd_rkb_dbg(rkb, FETCH, "VALIDATE", + "%.*s [%" PRId32 + "]: offset and epoch validation not " + "supported by broker: validation skipped", + RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), + rktp->rktp_partition); + rd_kafka_toppar_set_fetch_state( + rktp, RD_KAFKA_TOPPAR_FETCH_ACTIVE); + goto done; + + } else if (err == RD_KAFKA_RESP_ERR__OUTDATED) { + /* Partition state has changed, this response + * is outdated. */ + goto done; + } + + actions = rd_kafka_err_action( + rkb, err, request, RD_KAFKA_ERR_ACTION_REFRESH, + RD_KAFKA_RESP_ERR_UNKNOWN_LEADER_EPOCH, + RD_KAFKA_ERR_ACTION_REFRESH, + RD_KAFKA_RESP_ERR_FENCED_LEADER_EPOCH, + RD_KAFKA_ERR_ACTION_REFRESH, + RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART, + RD_KAFKA_ERR_ACTION_REFRESH, + RD_KAFKA_RESP_ERR_OFFSET_NOT_AVAILABLE, + RD_KAFKA_ERR_ACTION_REFRESH, + RD_KAFKA_RESP_ERR_KAFKA_STORAGE_ERROR, + RD_KAFKA_ERR_ACTION_END); + + + if (actions & RD_KAFKA_ERR_ACTION_REFRESH) + /* Metadata refresh is ongoing, so force it */ + rd_kafka_topic_leader_query0(rk, rktp->rktp_rkt, 1, + rd_true /* force */); + + /* No need for refcnt on rktp for timer opaque + * since the timer resides on the rktp and will be + * stopped on toppar remove. + * Retries the validation with a new call even in + * case of permanent error. */ + rd_kafka_timer_start_oneshot( + &rk->rk_timers, &rktp->rktp_validate_tmr, rd_false, + 500 * 1000 /* 500ms */, rd_kafka_offset_validate_tmr_cb, + rktp); + goto done; + } + + + rktpar = &parts->elems[0]; + end_offset = rktpar->offset; + end_offset_leader_epoch = + rd_kafka_topic_partition_get_leader_epoch(rktpar); + + if (end_offset < 0 || end_offset_leader_epoch < 0) { + rd_kafka_offset_reset( + rktp, rd_kafka_broker_id(rkb), + rktp->rktp_offset_validation_pos, + RD_KAFKA_RESP_ERR__LOG_TRUNCATION, + "No epoch found less or equal to " + "%s: broker end offset is %" PRId64 + " (offset leader epoch %" PRId32 + ")." 
+ " Reset using configured policy.", + rd_kafka_fetch_pos2str(rktp->rktp_offset_validation_pos), + end_offset, end_offset_leader_epoch); + + } else if (end_offset < rktp->rktp_offset_validation_pos.offset) { + + if (rktp->rktp_rkt->rkt_conf.auto_offset_reset == + RD_KAFKA_OFFSET_INVALID /* auto.offset.reset=error */) { + rd_kafka_offset_reset( + rktp, rd_kafka_broker_id(rkb), + RD_KAFKA_FETCH_POS(RD_KAFKA_OFFSET_INVALID, + rktp->rktp_leader_epoch), + RD_KAFKA_RESP_ERR__LOG_TRUNCATION, + "Partition log truncation detected at %s: " + "broker end offset is %" PRId64 + " (offset leader epoch %" PRId32 + "). " + "Reset to INVALID.", + rd_kafka_fetch_pos2str( + rktp->rktp_offset_validation_pos), + end_offset, end_offset_leader_epoch); + + } else { + rd_kafka_toppar_unlock(rktp); + + /* Seek to the updated end offset */ + rd_kafka_fetch_pos_t fetch_pos = + rd_kafka_topic_partition_get_fetch_pos(rktpar); + fetch_pos.validated = rd_true; + + rd_kafka_toppar_op_seek(rktp, fetch_pos, + RD_KAFKA_NO_REPLYQ); + + rd_kafka_topic_partition_list_destroy(parts); + rd_kafka_toppar_destroy(rktp); + + return; + } + + } else { + rd_rkb_dbg(rkb, FETCH, "OFFSETVALID", + "%.*s [%" PRId32 + "]: offset and epoch validation " + "succeeded: broker end offset %" PRId64 + " (offset leader epoch %" PRId32 ")", + RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), + rktp->rktp_partition, end_offset, + end_offset_leader_epoch); + + rd_kafka_toppar_set_fetch_state(rktp, + RD_KAFKA_TOPPAR_FETCH_ACTIVE); + } + +done: + rd_kafka_toppar_unlock(rktp); + + if (parts) + rd_kafka_topic_partition_list_destroy(parts); + rd_kafka_toppar_destroy(rktp); +} + + +static rd_kafka_op_res_t rd_kafka_offset_validate_op_cb(rd_kafka_t *rk, + rd_kafka_q_t *rkq, + rd_kafka_op_t *rko) { + rd_kafka_toppar_t *rktp = rko->rko_rktp; + rd_kafka_toppar_lock(rktp); + rd_kafka_offset_validate(rktp, "%s", rko->rko_u.offset_reset.reason); + rd_kafka_toppar_unlock(rktp); + return RD_KAFKA_OP_RES_HANDLED; +} + +/** + * @brief Validate partition epoch and offset (KIP-320). + * + * @param rktp the toppar + * @param err Optional error code that triggered the validation. + * @param fmt a reason string for logging. + * + * @locality any. if not main thread, work will be enqued on main thread. + * @locks_required toppar_lock() MUST be held + */ +void rd_kafka_offset_validate(rd_kafka_toppar_t *rktp, const char *fmt, ...) { + rd_kafka_topic_partition_list_t *parts; + rd_kafka_topic_partition_t *rktpar; + char reason[512]; + va_list ap; + + if (rktp->rktp_rkt->rkt_rk->rk_type != RD_KAFKA_CONSUMER) + return; + + va_start(ap, fmt); + rd_vsnprintf(reason, sizeof(reason), fmt, ap); + va_end(ap); + + /* Enqueue op for toppar handler thread if we're on the wrong thread. 
*/ + if (!thrd_is_current(rktp->rktp_rkt->rkt_rk->rk_thread)) { + /* Reuse OP_OFFSET_RESET type */ + rd_kafka_op_t *rko = + rd_kafka_op_new(RD_KAFKA_OP_OFFSET_RESET | RD_KAFKA_OP_CB); + rko->rko_op_cb = rd_kafka_offset_validate_op_cb; + rko->rko_rktp = rd_kafka_toppar_keep(rktp); + rko->rko_u.offset_reset.reason = rd_strdup(reason); + rd_kafka_q_enq(rktp->rktp_ops, rko); + return; + } + + if (rktp->rktp_fetch_state != RD_KAFKA_TOPPAR_FETCH_ACTIVE && + rktp->rktp_fetch_state != + RD_KAFKA_TOPPAR_FETCH_VALIDATE_EPOCH_WAIT) { + rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, FETCH, "VALIDATE", + "%.*s [%" PRId32 + "]: skipping offset " + "validation in fetch state %s", + RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), + rktp->rktp_partition, + rd_kafka_fetch_states[rktp->rktp_fetch_state]); + return; + } + + + if (rktp->rktp_leader_id == -1 || !rktp->rktp_leader || + rktp->rktp_leader->rkb_source == RD_KAFKA_INTERNAL) { + rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, FETCH, "VALIDATE", + "%.*s [%" PRId32 + "]: unable to perform offset " + "validation: partition leader not available. " + "Retrying when available", + RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), + rktp->rktp_partition); + return; + } + + /* If the fetch start position does not have an epoch set then + * there is no point in doing validation. + * This is the case for epoch-less seek()s or epoch-less + * committed offsets. */ + if (rktp->rktp_offset_validation_pos.leader_epoch == -1) { + rd_kafka_dbg( + rktp->rktp_rkt->rkt_rk, FETCH, "VALIDATE", + "%.*s [%" PRId32 + "]: skipping offset " + "validation for %s: no leader epoch set", + RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), + rktp->rktp_partition, + rd_kafka_fetch_pos2str(rktp->rktp_offset_validation_pos)); + rd_kafka_toppar_set_fetch_state(rktp, + RD_KAFKA_TOPPAR_FETCH_ACTIVE); + return; + } + + if (rktp->rktp_flags & RD_KAFKA_TOPPAR_F_VALIDATING) { + rd_kafka_dbg( + rktp->rktp_rkt->rkt_rk, FETCH, "VALIDATE", + "%.*s [%" PRId32 + "]: skipping offset " + "validation for %s: validation is already ongoing", + RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), + rktp->rktp_partition, + rd_kafka_fetch_pos2str(rktp->rktp_offset_validation_pos)); + return; + } + + rd_kafka_toppar_set_fetch_state( + rktp, RD_KAFKA_TOPPAR_FETCH_VALIDATE_EPOCH_WAIT); + rktp->rktp_flags |= RD_KAFKA_TOPPAR_F_VALIDATING; + + /* Construct and send OffsetForLeaderEpochRequest */ + parts = rd_kafka_topic_partition_list_new(1); + rktpar = rd_kafka_topic_partition_list_add( + parts, rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition); + rd_kafka_topic_partition_set_leader_epoch( + rktpar, rktp->rktp_offset_validation_pos.leader_epoch); + rd_kafka_topic_partition_set_current_leader_epoch( + rktpar, rktp->rktp_leader_epoch); + rd_kafka_toppar_keep(rktp); /* for request opaque */ + + rd_rkb_dbg( + rktp->rktp_leader, FETCH, "VALIDATE", + "%.*s [%" PRId32 + "]: querying broker for epoch " + "validation of %s: %s", + RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), rktp->rktp_partition, + rd_kafka_fetch_pos2str(rktp->rktp_offset_validation_pos), reason); + + rd_kafka_OffsetForLeaderEpochRequest( + rktp->rktp_leader, parts, RD_KAFKA_REPLYQ(rktp->rktp_ops, 0), + rd_kafka_toppar_handle_OffsetForLeaderEpoch, rktp); + rd_kafka_topic_partition_list_destroy(parts); +} + + /** * Escape any special characters in filename 'in' and write escaped * string to 'out' (of max size out_size). 
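
/*
 * [Illustrative aside, not part of the upstream patch.]
 * rd_kafka_offset_validate() above gates on consumer type, fetch state,
 * leader availability, a known leader epoch (epoch -1 skips straight back
 * to FETCH_ACTIVE) and the VALIDATING flag before sending
 * OffsetForLeaderEpoch to the partition leader. Its response handler then
 * takes one of three paths, restated below as a standalone decision
 * helper; the enum and decide() are hypothetical, only the branch rules
 * come from rd_kafka_toppar_handle_OffsetForLeaderEpoch() above.
 */
#include <stdint.h>
#include <stdio.h>

typedef enum {
        VALIDATE_RESET_POLICY, /* No epoch <= ours on the broker:
                                * reset per the configured policy. */
        VALIDATE_TRUNCATED,    /* Broker end offset < our offset: log
                                * truncation; reset to INVALID when
                                * auto.offset.reset=error, else seek to
                                * the returned (validated) end offset. */
        VALIDATE_OK            /* Position confirmed: back to
                                * RD_KAFKA_TOPPAR_FETCH_ACTIVE. */
} validate_outcome_t;

static validate_outcome_t decide(int64_t end_offset,
                                 int32_t end_epoch,
                                 int64_t our_offset) {
        if (end_offset < 0 || end_epoch < 0)
                return VALIDATE_RESET_POLICY;
        if (end_offset < our_offset)
                return VALIDATE_TRUNCATED;
        return VALIDATE_OK;
}

int main(void) {
        printf("%d\n", decide(-1, -1, 50)); /* 0: reset per policy */
        printf("%d\n", decide(40, 4, 50));  /* 1: truncated at 40  */
        printf("%d\n", decide(60, 5, 50));  /* 2: validated        */
        return 0;
}
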
@@ -953,15 +1339,16 @@ static void rd_kafka_offset_file_init(rd_kafka_toppar_t *rktp) { if (offset != RD_KAFKA_OFFSET_INVALID) { /* Start fetching from offset */ - rktp->rktp_stored_offset = offset; - rktp->rktp_committed_offset = offset; - rd_kafka_toppar_next_offset_handle(rktp, offset); + rktp->rktp_stored_pos.offset = offset; + rktp->rktp_committed_pos.offset = offset; + rd_kafka_toppar_next_offset_handle(rktp, rktp->rktp_stored_pos); } else { /* Offset was not usable: perform offset reset logic */ - rktp->rktp_committed_offset = RD_KAFKA_OFFSET_INVALID; + rktp->rktp_committed_pos.offset = RD_KAFKA_OFFSET_INVALID; rd_kafka_offset_reset( - rktp, RD_KAFKA_NODEID_UA, RD_KAFKA_OFFSET_INVALID, + rktp, RD_KAFKA_NODEID_UA, + RD_KAFKA_FETCH_POS(RD_KAFKA_OFFSET_INVALID, -1), RD_KAFKA_RESP_ERR__FS, "non-readable offset file"); } } @@ -978,14 +1365,16 @@ rd_kafka_offset_broker_term(rd_kafka_toppar_t *rktp) { /** - * Prepare a toppar for using broker offset commit (broker 0.8.2 or later). - * When using KafkaConsumer (high-level consumer) this functionality is - * disabled in favour of the cgrp commits for the entire set of subscriptions. + * Prepare a toppar for using broker offset commit (broker 0.8.2 or + * later). When using KafkaConsumer (high-level consumer) this + * functionality is disabled in favour of the cgrp commits for the + * entire set of subscriptions. */ static void rd_kafka_offset_broker_init(rd_kafka_toppar_t *rktp) { if (!rd_kafka_is_simple_consumer(rktp->rktp_rkt->rkt_rk)) return; - rd_kafka_offset_reset(rktp, RD_KAFKA_NODEID_UA, RD_KAFKA_OFFSET_STORED, + rd_kafka_offset_reset(rktp, RD_KAFKA_NODEID_UA, + RD_KAFKA_FETCH_POS(RD_KAFKA_OFFSET_STORED, -1), RD_KAFKA_RESP_ERR_NO_ERROR, "query broker for offsets"); } @@ -1055,23 +1444,27 @@ rd_kafka_resp_err_t rd_kafka_offset_store_stop(rd_kafka_toppar_t *rktp) { rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "OFFSET", "%s [%" PRId32 "]: stopping offset store " - "(stored offset %" PRId64 ", committed offset %" PRId64 - ", EOF offset %" PRId64 ")", + "(stored %s, committed %s, EOF offset %" PRId64 ")", rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, - rktp->rktp_stored_offset, rktp->rktp_committed_offset, + rd_kafka_fetch_pos2str(rktp->rktp_stored_pos), + rd_kafka_fetch_pos2str(rktp->rktp_committed_pos), rktp->rktp_offsets_fin.eof_offset); /* Store end offset for empty partitions */ if (rktp->rktp_rkt->rkt_rk->rk_conf.enable_auto_offset_store && - rktp->rktp_stored_offset == RD_KAFKA_OFFSET_INVALID && + rktp->rktp_stored_pos.offset == RD_KAFKA_OFFSET_INVALID && rktp->rktp_offsets_fin.eof_offset > 0) - rd_kafka_offset_store0(rktp, rktp->rktp_offsets_fin.eof_offset, - rd_true /* force */, RD_DONT_LOCK); + rd_kafka_offset_store0( + rktp, + RD_KAFKA_FETCH_POS(rktp->rktp_offsets_fin.eof_offset, + rktp->rktp_leader_epoch), + NULL, 0, rd_true /* force */, RD_DONT_LOCK); /* Commit offset to backing store. * This might be an async operation. */ if (rd_kafka_is_simple_consumer(rktp->rktp_rkt->rkt_rk) && - rktp->rktp_stored_offset > rktp->rktp_committed_offset) + rd_kafka_fetch_pos_cmp(&rktp->rktp_stored_pos, + &rktp->rktp_committed_pos) > 0) err = rd_kafka_offset_commit(rktp, "offset store stop"); /* If stop is in progress (async commit), return now. 
*/ @@ -1097,12 +1490,11 @@ void rd_kafka_offset_query_tmr_cb(rd_kafka_timers_t *rkts, void *arg) { rd_kafka_toppar_lock(rktp); rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "OFFSET", "Topic %s [%" PRId32 - "]: timed offset query for %s in " - "state %s", + "]: timed offset query for %s in state %s", rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, - rd_kafka_offset2str(rktp->rktp_query_offset), + rd_kafka_fetch_pos2str(rktp->rktp_query_pos), rd_kafka_fetch_states[rktp->rktp_fetch_state]); - rd_kafka_toppar_offset_request(rktp, rktp->rktp_query_offset, 0); + rd_kafka_toppar_offset_request(rktp, rktp->rktp_query_pos, 0); rd_kafka_toppar_unlock(rktp); } @@ -1121,7 +1513,7 @@ void rd_kafka_offset_store_init(rd_kafka_toppar_t *rktp) { store_names[rktp->rktp_rkt->rkt_conf.offset_store_method]); /* The committed offset is unknown at this point. */ - rktp->rktp_committed_offset = RD_KAFKA_OFFSET_INVALID; + rktp->rktp_committed_pos.offset = RD_KAFKA_OFFSET_INVALID; /* Set up the commit interval (for simple consumer). */ if (rd_kafka_is_simple_consumer(rktp->rktp_rkt->rkt_rk) && @@ -1148,3 +1540,26 @@ void rd_kafka_offset_store_init(rd_kafka_toppar_t *rktp) { rktp->rktp_flags |= RD_KAFKA_TOPPAR_F_OFFSET_STORE; } + + +/** + * Update toppar app_pos and store_offset (if enabled) to the provided + * offset and epoch. + */ +void rd_kafka_update_app_pos(rd_kafka_t *rk, + rd_kafka_toppar_t *rktp, + rd_kafka_fetch_pos_t pos, + rd_dolock_t do_lock) { + + if (do_lock) + rd_kafka_toppar_lock(rktp); + + rktp->rktp_app_pos = pos; + if (rk->rk_conf.enable_auto_offset_store) + rd_kafka_offset_store0(rktp, pos, NULL, 0, + /* force: ignore assignment state */ + rd_true, RD_DONT_LOCK); + + if (do_lock) + rd_kafka_toppar_unlock(rktp); +} diff --git a/src/third_party/librdkafka/dist/src/rdkafka_offset.h b/src/third_party/librdkafka/dist/src/rdkafka_offset.h index c085224cb3f..de9b5dec985 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_offset.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_offset.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012,2013 Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -71,7 +72,10 @@ const char *rd_kafka_offset2str(int64_t offset); * 4. background rebalance assigns the partition again, but forcibly sets * the stored offset to .._INVALID to provide a clean state. * - * @param offset Offset to set, may be an absolute offset or .._INVALID. + * @param pos Offset and leader epoch to set, may be an absolute offset + * or .._INVALID. + * @param metadata Metadata to be set (optional). + * @param metadata_size Size of the metadata to be set. * @param force Forcibly set \p offset regardless of assignment state. * @param do_lock Whether to lock the \p rktp or not (already locked by caller). 
* @@ -82,7 +86,9 @@ const char *rd_kafka_offset2str(int64_t offset); */ static RD_INLINE RD_UNUSED rd_kafka_resp_err_t rd_kafka_offset_store0(rd_kafka_toppar_t *rktp, - int64_t offset, + const rd_kafka_fetch_pos_t pos, + void *metadata, + size_t metadata_size, rd_bool_t force, rd_dolock_t do_lock) { rd_kafka_resp_err_t err = RD_KAFKA_RESP_ERR_NO_ERROR; @@ -90,12 +96,23 @@ rd_kafka_offset_store0(rd_kafka_toppar_t *rktp, if (do_lock) rd_kafka_toppar_lock(rktp); - if (unlikely(!force && !RD_KAFKA_OFFSET_IS_LOGICAL(offset) && + if (unlikely(!force && !RD_KAFKA_OFFSET_IS_LOGICAL(pos.offset) && !(rktp->rktp_flags & RD_KAFKA_TOPPAR_F_ASSIGNED) && - !rd_kafka_is_simple_consumer(rktp->rktp_rkt->rkt_rk))) + !rd_kafka_is_simple_consumer(rktp->rktp_rkt->rkt_rk))) { err = RD_KAFKA_RESP_ERR__STATE; - else - rktp->rktp_stored_offset = offset; + } else { + if (rktp->rktp_stored_metadata) { + rd_free(rktp->rktp_stored_metadata); + rktp->rktp_stored_metadata = NULL; + } + rktp->rktp_stored_pos = pos; + rktp->rktp_stored_metadata_size = metadata_size; + if (metadata) { + rktp->rktp_stored_metadata = rd_malloc(metadata_size); + memcpy(rktp->rktp_stored_metadata, metadata, + rktp->rktp_stored_metadata_size); + } + } if (do_lock) rd_kafka_toppar_unlock(rktp); @@ -115,10 +132,19 @@ void rd_kafka_offset_store_init(rd_kafka_toppar_t *rktp); void rd_kafka_offset_reset(rd_kafka_toppar_t *rktp, int32_t broker_id, - int64_t err_offset, + rd_kafka_fetch_pos_t err_pos, rd_kafka_resp_err_t err, - const char *reason); + const char *fmt, + ...) RD_FORMAT(printf, 5, 6); + +void rd_kafka_offset_validate(rd_kafka_toppar_t *rktp, const char *fmt, ...) + RD_FORMAT(printf, 2, 3); void rd_kafka_offset_query_tmr_cb(rd_kafka_timers_t *rkts, void *arg); +void rd_kafka_update_app_pos(rd_kafka_t *rk, + rd_kafka_toppar_t *rktp, + rd_kafka_fetch_pos_t pos, + rd_dolock_t do_lock); + #endif /* _RDKAFKA_OFFSET_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_op.c b/src/third_party/librdkafka/dist/src/rdkafka_op.c index e1324c513f2..5dbbf9c9d4d 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_op.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_op.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -43,48 +44,52 @@ rd_atomic32_t rd_kafka_op_cnt; const char *rd_kafka_op2str(rd_kafka_op_type_t type) { int skiplen = 6; static const char *names[RD_KAFKA_OP__END] = { - [RD_KAFKA_OP_NONE] = "REPLY:NONE", - [RD_KAFKA_OP_FETCH] = "REPLY:FETCH", - [RD_KAFKA_OP_ERR] = "REPLY:ERR", - [RD_KAFKA_OP_CONSUMER_ERR] = "REPLY:CONSUMER_ERR", - [RD_KAFKA_OP_DR] = "REPLY:DR", - [RD_KAFKA_OP_STATS] = "REPLY:STATS", - [RD_KAFKA_OP_OFFSET_COMMIT] = "REPLY:OFFSET_COMMIT", - [RD_KAFKA_OP_NODE_UPDATE] = "REPLY:NODE_UPDATE", - [RD_KAFKA_OP_XMIT_BUF] = "REPLY:XMIT_BUF", - [RD_KAFKA_OP_RECV_BUF] = "REPLY:RECV_BUF", - [RD_KAFKA_OP_XMIT_RETRY] = "REPLY:XMIT_RETRY", - [RD_KAFKA_OP_FETCH_START] = "REPLY:FETCH_START", - [RD_KAFKA_OP_FETCH_STOP] = "REPLY:FETCH_STOP", - [RD_KAFKA_OP_SEEK] = "REPLY:SEEK", - [RD_KAFKA_OP_PAUSE] = "REPLY:PAUSE", - [RD_KAFKA_OP_OFFSET_FETCH] = "REPLY:OFFSET_FETCH", - [RD_KAFKA_OP_PARTITION_JOIN] = "REPLY:PARTITION_JOIN", - [RD_KAFKA_OP_PARTITION_LEAVE] = "REPLY:PARTITION_LEAVE", - [RD_KAFKA_OP_REBALANCE] = "REPLY:REBALANCE", - [RD_KAFKA_OP_TERMINATE] = "REPLY:TERMINATE", - [RD_KAFKA_OP_COORD_QUERY] = "REPLY:COORD_QUERY", - [RD_KAFKA_OP_SUBSCRIBE] = "REPLY:SUBSCRIBE", - [RD_KAFKA_OP_ASSIGN] = "REPLY:ASSIGN", - [RD_KAFKA_OP_GET_SUBSCRIPTION] = "REPLY:GET_SUBSCRIPTION", - [RD_KAFKA_OP_GET_ASSIGNMENT] = "REPLY:GET_ASSIGNMENT", - [RD_KAFKA_OP_THROTTLE] = "REPLY:THROTTLE", - [RD_KAFKA_OP_NAME] = "REPLY:NAME", - [RD_KAFKA_OP_CG_METADATA] = "REPLY:CG_METADATA", - [RD_KAFKA_OP_OFFSET_RESET] = "REPLY:OFFSET_RESET", - [RD_KAFKA_OP_METADATA] = "REPLY:METADATA", - [RD_KAFKA_OP_LOG] = "REPLY:LOG", - [RD_KAFKA_OP_WAKEUP] = "REPLY:WAKEUP", - [RD_KAFKA_OP_CREATETOPICS] = "REPLY:CREATETOPICS", - [RD_KAFKA_OP_DELETETOPICS] = "REPLY:DELETETOPICS", - [RD_KAFKA_OP_CREATEPARTITIONS] = "REPLY:CREATEPARTITIONS", - [RD_KAFKA_OP_ALTERCONFIGS] = "REPLY:ALTERCONFIGS", + [RD_KAFKA_OP_NONE] = "REPLY:NONE", + [RD_KAFKA_OP_FETCH] = "REPLY:FETCH", + [RD_KAFKA_OP_ERR] = "REPLY:ERR", + [RD_KAFKA_OP_CONSUMER_ERR] = "REPLY:CONSUMER_ERR", + [RD_KAFKA_OP_DR] = "REPLY:DR", + [RD_KAFKA_OP_STATS] = "REPLY:STATS", + [RD_KAFKA_OP_OFFSET_COMMIT] = "REPLY:OFFSET_COMMIT", + [RD_KAFKA_OP_NODE_UPDATE] = "REPLY:NODE_UPDATE", + [RD_KAFKA_OP_XMIT_BUF] = "REPLY:XMIT_BUF", + [RD_KAFKA_OP_RECV_BUF] = "REPLY:RECV_BUF", + [RD_KAFKA_OP_XMIT_RETRY] = "REPLY:XMIT_RETRY", + [RD_KAFKA_OP_FETCH_START] = "REPLY:FETCH_START", + [RD_KAFKA_OP_FETCH_STOP] = "REPLY:FETCH_STOP", + [RD_KAFKA_OP_SEEK] = "REPLY:SEEK", + [RD_KAFKA_OP_PAUSE] = "REPLY:PAUSE", + [RD_KAFKA_OP_OFFSET_FETCH] = "REPLY:OFFSET_FETCH", + [RD_KAFKA_OP_PARTITION_JOIN] = "REPLY:PARTITION_JOIN", + [RD_KAFKA_OP_PARTITION_LEAVE] = "REPLY:PARTITION_LEAVE", + [RD_KAFKA_OP_REBALANCE] = "REPLY:REBALANCE", + [RD_KAFKA_OP_TERMINATE] = "REPLY:TERMINATE", + [RD_KAFKA_OP_COORD_QUERY] = "REPLY:COORD_QUERY", + [RD_KAFKA_OP_SUBSCRIBE] = "REPLY:SUBSCRIBE", + [RD_KAFKA_OP_ASSIGN] = "REPLY:ASSIGN", + [RD_KAFKA_OP_GET_SUBSCRIPTION] = "REPLY:GET_SUBSCRIPTION", + [RD_KAFKA_OP_GET_ASSIGNMENT] = "REPLY:GET_ASSIGNMENT", + [RD_KAFKA_OP_THROTTLE] = "REPLY:THROTTLE", + [RD_KAFKA_OP_NAME] = "REPLY:NAME", + [RD_KAFKA_OP_CG_METADATA] = "REPLY:CG_METADATA", + [RD_KAFKA_OP_OFFSET_RESET] = "REPLY:OFFSET_RESET", + [RD_KAFKA_OP_METADATA] = "REPLY:METADATA", + [RD_KAFKA_OP_LOG] = "REPLY:LOG", + [RD_KAFKA_OP_WAKEUP] = "REPLY:WAKEUP", + [RD_KAFKA_OP_CREATETOPICS] = "REPLY:CREATETOPICS", + [RD_KAFKA_OP_DELETETOPICS] = "REPLY:DELETETOPICS", + 
[RD_KAFKA_OP_CREATEPARTITIONS] = "REPLY:CREATEPARTITIONS", + [RD_KAFKA_OP_ALTERCONFIGS] = "REPLY:ALTERCONFIGS", + [RD_KAFKA_OP_INCREMENTALALTERCONFIGS] = + "REPLY:INCREMENTALALTERCONFIGS", [RD_KAFKA_OP_DESCRIBECONFIGS] = "REPLY:DESCRIBECONFIGS", [RD_KAFKA_OP_DELETERECORDS] = "REPLY:DELETERECORDS", [RD_KAFKA_OP_LISTCONSUMERGROUPS] = "REPLY:LISTCONSUMERGROUPS", [RD_KAFKA_OP_DESCRIBECONSUMERGROUPS] = "REPLY:DESCRIBECONSUMERGROUPS", - [RD_KAFKA_OP_DELETEGROUPS] = "REPLY:DELETEGROUPS", + [RD_KAFKA_OP_DESCRIBETOPICS] = "REPLY:DESCRIBETOPICS", + [RD_KAFKA_OP_DESCRIBECLUSTER] = "REPLY:DESCRIBECLUSTER", + [RD_KAFKA_OP_DELETEGROUPS] = "REPLY:DELETEGROUPS", [RD_KAFKA_OP_DELETECONSUMERGROUPOFFSETS] = "REPLY:DELETECONSUMERGROUPOFFSETS", [RD_KAFKA_OP_CREATEACLS] = "REPLY:CREATEACLS", @@ -104,8 +109,20 @@ const char *rd_kafka_op2str(rd_kafka_op_type_t type) { [RD_KAFKA_OP_TXN] = "REPLY:TXN", [RD_KAFKA_OP_GET_REBALANCE_PROTOCOL] = "REPLY:GET_REBALANCE_PROTOCOL", - [RD_KAFKA_OP_LEADERS] = "REPLY:LEADERS", - [RD_KAFKA_OP_BARRIER] = "REPLY:BARRIER", + [RD_KAFKA_OP_LEADERS] = "REPLY:LEADERS", + [RD_KAFKA_OP_BARRIER] = "REPLY:BARRIER", + [RD_KAFKA_OP_SASL_REAUTH] = "REPLY:SASL_REAUTH", + [RD_KAFKA_OP_ALTERUSERSCRAMCREDENTIALS] = + "REPLY:ALTERUSERSCRAMCREDENTIALS", + [RD_KAFKA_OP_DESCRIBEUSERSCRAMCREDENTIALS] = + "REPLY:DESCRIBEUSERSCRAMCREDENTIALS", + [RD_KAFKA_OP_LISTOFFSETS] = "REPLY:LISTOFFSETS", + [RD_KAFKA_OP_METADATA_UPDATE] = "REPLY:METADATA_UPDATE", + [RD_KAFKA_OP_SET_TELEMETRY_BROKER] = + "REPLY:RD_KAFKA_OP_SET_TELEMETRY_BROKER", + [RD_KAFKA_OP_TERMINATE_TELEMETRY] = + "REPLY:RD_KAFKA_OP_TERMINATE_TELEMETRY", + [RD_KAFKA_OP_ELECTLEADERS] = "REPLY:ELECTLEADERS", }; if (type & RD_KAFKA_OP_REPLY) @@ -193,47 +210,51 @@ rd_kafka_op_t *rd_kafka_op_new0(const char *source, rd_kafka_op_type_t type) { * if we forgot to add an op type to \ * this list. 
*/ static const size_t op2size[RD_KAFKA_OP__END] = { - [RD_KAFKA_OP_FETCH] = sizeof(rko->rko_u.fetch), - [RD_KAFKA_OP_ERR] = sizeof(rko->rko_u.err), - [RD_KAFKA_OP_CONSUMER_ERR] = sizeof(rko->rko_u.err), - [RD_KAFKA_OP_DR] = sizeof(rko->rko_u.dr), - [RD_KAFKA_OP_STATS] = sizeof(rko->rko_u.stats), - [RD_KAFKA_OP_OFFSET_COMMIT] = sizeof(rko->rko_u.offset_commit), - [RD_KAFKA_OP_NODE_UPDATE] = sizeof(rko->rko_u.node), - [RD_KAFKA_OP_XMIT_BUF] = sizeof(rko->rko_u.xbuf), - [RD_KAFKA_OP_RECV_BUF] = sizeof(rko->rko_u.xbuf), - [RD_KAFKA_OP_XMIT_RETRY] = sizeof(rko->rko_u.xbuf), - [RD_KAFKA_OP_FETCH_START] = sizeof(rko->rko_u.fetch_start), - [RD_KAFKA_OP_FETCH_STOP] = _RD_KAFKA_OP_EMPTY, - [RD_KAFKA_OP_SEEK] = sizeof(rko->rko_u.fetch_start), - [RD_KAFKA_OP_PAUSE] = sizeof(rko->rko_u.pause), - [RD_KAFKA_OP_OFFSET_FETCH] = sizeof(rko->rko_u.offset_fetch), - [RD_KAFKA_OP_PARTITION_JOIN] = _RD_KAFKA_OP_EMPTY, - [RD_KAFKA_OP_PARTITION_LEAVE] = _RD_KAFKA_OP_EMPTY, - [RD_KAFKA_OP_REBALANCE] = sizeof(rko->rko_u.rebalance), - [RD_KAFKA_OP_TERMINATE] = _RD_KAFKA_OP_EMPTY, - [RD_KAFKA_OP_COORD_QUERY] = _RD_KAFKA_OP_EMPTY, - [RD_KAFKA_OP_SUBSCRIBE] = sizeof(rko->rko_u.subscribe), - [RD_KAFKA_OP_ASSIGN] = sizeof(rko->rko_u.assign), - [RD_KAFKA_OP_GET_SUBSCRIPTION] = sizeof(rko->rko_u.subscribe), - [RD_KAFKA_OP_GET_ASSIGNMENT] = sizeof(rko->rko_u.assign), - [RD_KAFKA_OP_THROTTLE] = sizeof(rko->rko_u.throttle), - [RD_KAFKA_OP_NAME] = sizeof(rko->rko_u.name), - [RD_KAFKA_OP_CG_METADATA] = sizeof(rko->rko_u.cg_metadata), - [RD_KAFKA_OP_OFFSET_RESET] = sizeof(rko->rko_u.offset_reset), - [RD_KAFKA_OP_METADATA] = sizeof(rko->rko_u.metadata), - [RD_KAFKA_OP_LOG] = sizeof(rko->rko_u.log), - [RD_KAFKA_OP_WAKEUP] = _RD_KAFKA_OP_EMPTY, - [RD_KAFKA_OP_CREATETOPICS] = sizeof(rko->rko_u.admin_request), - [RD_KAFKA_OP_DELETETOPICS] = sizeof(rko->rko_u.admin_request), - [RD_KAFKA_OP_CREATEPARTITIONS] = sizeof(rko->rko_u.admin_request), - [RD_KAFKA_OP_ALTERCONFIGS] = sizeof(rko->rko_u.admin_request), + [RD_KAFKA_OP_FETCH] = sizeof(rko->rko_u.fetch), + [RD_KAFKA_OP_ERR] = sizeof(rko->rko_u.err), + [RD_KAFKA_OP_CONSUMER_ERR] = sizeof(rko->rko_u.err), + [RD_KAFKA_OP_DR] = sizeof(rko->rko_u.dr), + [RD_KAFKA_OP_STATS] = sizeof(rko->rko_u.stats), + [RD_KAFKA_OP_OFFSET_COMMIT] = sizeof(rko->rko_u.offset_commit), + [RD_KAFKA_OP_NODE_UPDATE] = sizeof(rko->rko_u.node), + [RD_KAFKA_OP_XMIT_BUF] = sizeof(rko->rko_u.xbuf), + [RD_KAFKA_OP_RECV_BUF] = sizeof(rko->rko_u.xbuf), + [RD_KAFKA_OP_XMIT_RETRY] = sizeof(rko->rko_u.xbuf), + [RD_KAFKA_OP_FETCH_START] = sizeof(rko->rko_u.fetch_start), + [RD_KAFKA_OP_FETCH_STOP] = _RD_KAFKA_OP_EMPTY, + [RD_KAFKA_OP_SEEK] = sizeof(rko->rko_u.fetch_start), + [RD_KAFKA_OP_PAUSE] = sizeof(rko->rko_u.pause), + [RD_KAFKA_OP_OFFSET_FETCH] = sizeof(rko->rko_u.offset_fetch), + [RD_KAFKA_OP_PARTITION_JOIN] = _RD_KAFKA_OP_EMPTY, + [RD_KAFKA_OP_PARTITION_LEAVE] = _RD_KAFKA_OP_EMPTY, + [RD_KAFKA_OP_REBALANCE] = sizeof(rko->rko_u.rebalance), + [RD_KAFKA_OP_TERMINATE] = sizeof(rko->rko_u.terminated), + [RD_KAFKA_OP_COORD_QUERY] = _RD_KAFKA_OP_EMPTY, + [RD_KAFKA_OP_SUBSCRIBE] = sizeof(rko->rko_u.subscribe), + [RD_KAFKA_OP_ASSIGN] = sizeof(rko->rko_u.assign), + [RD_KAFKA_OP_GET_SUBSCRIPTION] = sizeof(rko->rko_u.subscribe), + [RD_KAFKA_OP_GET_ASSIGNMENT] = sizeof(rko->rko_u.assign), + [RD_KAFKA_OP_THROTTLE] = sizeof(rko->rko_u.throttle), + [RD_KAFKA_OP_NAME] = sizeof(rko->rko_u.name), + [RD_KAFKA_OP_CG_METADATA] = sizeof(rko->rko_u.cg_metadata), + [RD_KAFKA_OP_OFFSET_RESET] = sizeof(rko->rko_u.offset_reset), + 
[RD_KAFKA_OP_METADATA] = sizeof(rko->rko_u.metadata), + [RD_KAFKA_OP_LOG] = sizeof(rko->rko_u.log), + [RD_KAFKA_OP_WAKEUP] = _RD_KAFKA_OP_EMPTY, + [RD_KAFKA_OP_CREATETOPICS] = sizeof(rko->rko_u.admin_request), + [RD_KAFKA_OP_DELETETOPICS] = sizeof(rko->rko_u.admin_request), + [RD_KAFKA_OP_CREATEPARTITIONS] = sizeof(rko->rko_u.admin_request), + [RD_KAFKA_OP_ALTERCONFIGS] = sizeof(rko->rko_u.admin_request), + [RD_KAFKA_OP_INCREMENTALALTERCONFIGS] = + sizeof(rko->rko_u.admin_request), [RD_KAFKA_OP_DESCRIBECONFIGS] = sizeof(rko->rko_u.admin_request), [RD_KAFKA_OP_DELETERECORDS] = sizeof(rko->rko_u.admin_request), [RD_KAFKA_OP_LISTCONSUMERGROUPS] = sizeof(rko->rko_u.admin_request), [RD_KAFKA_OP_DESCRIBECONSUMERGROUPS] = sizeof(rko->rko_u.admin_request), - [RD_KAFKA_OP_DELETEGROUPS] = sizeof(rko->rko_u.admin_request), + [RD_KAFKA_OP_DESCRIBETOPICS] = sizeof(rko->rko_u.admin_request), + [RD_KAFKA_OP_DESCRIBECLUSTER] = sizeof(rko->rko_u.admin_request), + [RD_KAFKA_OP_DELETEGROUPS] = sizeof(rko->rko_u.admin_request), [RD_KAFKA_OP_DELETECONSUMERGROUPOFFSETS] = sizeof(rko->rko_u.admin_request), [RD_KAFKA_OP_CREATEACLS] = sizeof(rko->rko_u.admin_request), @@ -253,8 +274,19 @@ rd_kafka_op_t *rd_kafka_op_new0(const char *source, rd_kafka_op_type_t type) { [RD_KAFKA_OP_TXN] = sizeof(rko->rko_u.txn), [RD_KAFKA_OP_GET_REBALANCE_PROTOCOL] = sizeof(rko->rko_u.rebalance_protocol), - [RD_KAFKA_OP_LEADERS] = sizeof(rko->rko_u.leaders), - [RD_KAFKA_OP_BARRIER] = _RD_KAFKA_OP_EMPTY, + [RD_KAFKA_OP_LEADERS] = sizeof(rko->rko_u.leaders), + [RD_KAFKA_OP_BARRIER] = _RD_KAFKA_OP_EMPTY, + [RD_KAFKA_OP_SASL_REAUTH] = _RD_KAFKA_OP_EMPTY, + [RD_KAFKA_OP_ALTERUSERSCRAMCREDENTIALS] = + sizeof(rko->rko_u.admin_request), + [RD_KAFKA_OP_DESCRIBEUSERSCRAMCREDENTIALS] = + sizeof(rko->rko_u.admin_request), + [RD_KAFKA_OP_LISTOFFSETS] = sizeof(rko->rko_u.admin_request), + [RD_KAFKA_OP_METADATA_UPDATE] = sizeof(rko->rko_u.metadata), + [RD_KAFKA_OP_SET_TELEMETRY_BROKER] = + sizeof(rko->rko_u.telemetry_broker), + [RD_KAFKA_OP_TERMINATE_TELEMETRY] = _RD_KAFKA_OP_EMPTY, + [RD_KAFKA_OP_ELECTLEADERS] = sizeof(rko->rko_u.admin_request), }; size_t tsize = op2size[type & ~RD_KAFKA_OP_FLAGMASK]; @@ -366,6 +398,8 @@ void rd_kafka_op_destroy(rd_kafka_op_t *rko) { if (rko->rko_u.dr.rkt) rd_kafka_topic_destroy0(rko->rko_u.dr.rkt); + if (rko->rko_u.dr.presult) + rd_kafka_Produce_result_destroy(rko->rko_u.dr.presult); break; case RD_KAFKA_OP_OFFSET_RESET: @@ -374,6 +408,8 @@ void rd_kafka_op_destroy(rd_kafka_op_t *rko) { case RD_KAFKA_OP_METADATA: RD_IF_FREE(rko->rko_u.metadata.md, rd_kafka_metadata_destroy); + /* It's not needed to free metadata.mdi because they + are the in the same memory allocation. 
*/ break; case RD_KAFKA_OP_LOG: @@ -387,6 +423,7 @@ void rd_kafka_op_destroy(rd_kafka_op_t *rko) { case RD_KAFKA_OP_DELETETOPICS: case RD_KAFKA_OP_CREATEPARTITIONS: case RD_KAFKA_OP_ALTERCONFIGS: + case RD_KAFKA_OP_INCREMENTALALTERCONFIGS: case RD_KAFKA_OP_DESCRIBECONFIGS: case RD_KAFKA_OP_DELETERECORDS: case RD_KAFKA_OP_LISTCONSUMERGROUPS: @@ -397,7 +434,13 @@ void rd_kafka_op_destroy(rd_kafka_op_t *rko) { case RD_KAFKA_OP_DESCRIBEACLS: case RD_KAFKA_OP_DELETEACLS: case RD_KAFKA_OP_ALTERCONSUMERGROUPOFFSETS: + case RD_KAFKA_OP_DESCRIBETOPICS: + case RD_KAFKA_OP_DESCRIBECLUSTER: case RD_KAFKA_OP_LISTCONSUMERGROUPOFFSETS: + case RD_KAFKA_OP_ALTERUSERSCRAMCREDENTIALS: + case RD_KAFKA_OP_DESCRIBEUSERSCRAMCREDENTIALS: + case RD_KAFKA_OP_LISTOFFSETS: + case RD_KAFKA_OP_ELECTLEADERS: rd_kafka_replyq_destroy(&rko->rko_u.admin_request.replyq); rd_list_destroy(&rko->rko_u.admin_request.args); if (rko->rko_u.admin_request.options.match_consumer_group_states @@ -405,6 +448,11 @@ void rd_kafka_op_destroy(rd_kafka_op_t *rko) { rd_list_destroy(rko->rko_u.admin_request.options .match_consumer_group_states.u.PTR); } + if (rko->rko_u.admin_request.options.match_consumer_group_types + .u.PTR) { + rd_list_destroy(rko->rko_u.admin_request.options + .match_consumer_group_types.u.PTR); + } rd_assert(!rko->rko_u.admin_request.fanout_parent); RD_IF_FREE(rko->rko_u.admin_request.coordkey, rd_free); break; @@ -420,6 +468,12 @@ void rd_kafka_op_destroy(rd_kafka_op_t *rko) { case RD_KAFKA_OP_MOCK: RD_IF_FREE(rko->rko_u.mock.name, rd_free); RD_IF_FREE(rko->rko_u.mock.str, rd_free); + if (rko->rko_u.mock.metrics) { + int64_t i; + for (i = 0; i < rko->rko_u.mock.hi; i++) + rd_free(rko->rko_u.mock.metrics[i]); + rd_free(rko->rko_u.mock.metrics); + } break; case RD_KAFKA_OP_BROKER_MONITOR: @@ -442,6 +496,17 @@ void rd_kafka_op_destroy(rd_kafka_op_t *rko) { rd_kafka_topic_partition_list_destroy); break; + case RD_KAFKA_OP_METADATA_UPDATE: + RD_IF_FREE(rko->rko_u.metadata.md, rd_kafka_metadata_destroy); + /* It's not needed to free metadata.mdi because they + are the in the same memory allocation. 
*/ + break; + + case RD_KAFKA_OP_SET_TELEMETRY_BROKER: + RD_IF_FREE(rko->rko_u.telemetry_broker.rkb, + rd_kafka_broker_destroy); + break; + default: break; } @@ -727,11 +792,11 @@ rd_kafka_op_call(rd_kafka_t *rk, rd_kafka_q_t *rkq, rd_kafka_op_t *rko) { rd_kafka_op_t *rd_kafka_op_new_ctrl_msg(rd_kafka_toppar_t *rktp, int32_t version, rd_kafka_buf_t *rkbuf, - int64_t offset) { + rd_kafka_fetch_pos_t pos) { rd_kafka_msg_t *rkm; rd_kafka_op_t *rko; - rko = rd_kafka_op_new_fetch_msg(&rkm, rktp, version, rkbuf, offset, 0, + rko = rd_kafka_op_new_fetch_msg(&rkm, rktp, version, rkbuf, pos, 0, NULL, 0, NULL); rkm->rkm_flags |= RD_KAFKA_MSG_F_CONTROL; @@ -750,7 +815,7 @@ rd_kafka_op_t *rd_kafka_op_new_fetch_msg(rd_kafka_msg_t **rkmp, rd_kafka_toppar_t *rktp, int32_t version, rd_kafka_buf_t *rkbuf, - int64_t offset, + rd_kafka_fetch_pos_t pos, size_t key_len, const void *key, size_t val_len, @@ -772,7 +837,8 @@ rd_kafka_op_t *rd_kafka_op_new_fetch_msg(rd_kafka_msg_t **rkmp, rko->rko_u.fetch.rkbuf = rkbuf; rd_kafka_buf_keep(rkbuf); - rkm->rkm_offset = offset; + rkm->rkm_offset = pos.offset; + rkm->rkm_u.consumer.leader_epoch = pos.leader_epoch; rkm->rkm_key = (void *)key; rkm->rkm_key_len = key_len; @@ -799,8 +865,11 @@ void rd_kafka_op_throttle_time(rd_kafka_broker_t *rkb, int throttle_time) { rd_kafka_op_t *rko; - if (unlikely(throttle_time > 0)) + if (unlikely(throttle_time > 0)) { rd_avg_add(&rkb->rkb_avg_throttle, throttle_time); + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_throttle, + throttle_time); + } /* We send throttle events when: * - throttle_time > 0 @@ -911,7 +980,7 @@ rd_kafka_op_res_t rd_kafka_op_handle(rd_kafka_t *rk, */ void rd_kafka_fetch_op_app_prepare(rd_kafka_t *rk, rd_kafka_op_t *rko) { rd_kafka_toppar_t *rktp; - int64_t offset; + rd_kafka_fetch_pos_t pos; if (unlikely(rko->rko_type != RD_KAFKA_OP_FETCH || rko->rko_err)) return; @@ -921,13 +990,8 @@ void rd_kafka_fetch_op_app_prepare(rd_kafka_t *rk, rd_kafka_op_t *rko) { if (unlikely(!rk)) rk = rktp->rktp_rkt->rkt_rk; - offset = rko->rko_u.fetch.rkm.rkm_rkmessage.offset + 1; + pos.offset = rko->rko_u.fetch.rkm.rkm_rkmessage.offset + 1; + pos.leader_epoch = rko->rko_u.fetch.rkm.rkm_u.consumer.leader_epoch; - rd_kafka_toppar_lock(rktp); - rktp->rktp_app_offset = offset; - if (rk->rk_conf.enable_auto_offset_store) - rd_kafka_offset_store0(rktp, offset, - /* force: ignore assignment state */ - rd_true, RD_DONT_LOCK); - rd_kafka_toppar_unlock(rktp); + rd_kafka_update_app_pos(rk, rktp, pos, RD_DO_LOCK); } diff --git a/src/third_party/librdkafka/dist/src/rdkafka_op.h b/src/third_party/librdkafka/dist/src/rdkafka_op.h index 05b967100a2..e79309aa021 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_op.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_op.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,6 +39,7 @@ typedef struct rd_kafka_q_s rd_kafka_q_t; typedef struct rd_kafka_toppar_s rd_kafka_toppar_t; typedef struct rd_kafka_op_s rd_kafka_op_t; +typedef struct rd_kafka_broker_s rd_kafka_broker_t; /* One-off reply queue + reply version. 
* All APIs that take a rd_kafka_replyq_t makes a copy of the @@ -126,18 +128,28 @@ typedef enum { RD_KAFKA_OP_DELETETOPICS, /**< Admin: DeleteTopics: u.admin_request*/ RD_KAFKA_OP_CREATEPARTITIONS, /**< Admin: CreatePartitions: * u.admin_request*/ - RD_KAFKA_OP_ALTERCONFIGS, /**< Admin: AlterConfigs: u.admin_request*/ - RD_KAFKA_OP_DESCRIBECONFIGS, /**< Admin: DescribeConfigs: - * u.admin_request*/ - RD_KAFKA_OP_DELETERECORDS, /**< Admin: DeleteRecords: - * u.admin_request*/ - RD_KAFKA_OP_LISTCONSUMERGROUPS, /**< Admin: - * ListConsumerGroups - * u.admin_request */ - RD_KAFKA_OP_DESCRIBECONSUMERGROUPS, /**< Admin: - * DescribeConsumerGroups - * u.admin_request */ - RD_KAFKA_OP_DELETEGROUPS, /**< Admin: DeleteGroups: u.admin_request*/ + RD_KAFKA_OP_ALTERCONFIGS, /**< Admin: AlterConfigs: u.admin_request*/ + RD_KAFKA_OP_INCREMENTALALTERCONFIGS, /**< Admin: + * IncrementalAlterConfigs: + * u.admin_request */ + RD_KAFKA_OP_DESCRIBECONFIGS, /**< Admin: DescribeConfigs: + * u.admin_request*/ + RD_KAFKA_OP_DELETERECORDS, /**< Admin: DeleteRecords: + * u.admin_request*/ + RD_KAFKA_OP_LISTCONSUMERGROUPS, /**< Admin: + * ListConsumerGroups + * u.admin_request */ + RD_KAFKA_OP_DESCRIBECONSUMERGROUPS, /**< Admin: + * DescribeConsumerGroups + * u.admin_request */ + RD_KAFKA_OP_DESCRIBECLUSTER, /**< Admin: + * DescribeCluster + * u.admin_request */ + + RD_KAFKA_OP_DESCRIBETOPICS, /**< Admin: + * DescribeTopics + * u.admin_request */ + RD_KAFKA_OP_DELETEGROUPS, /**< Admin: DeleteGroups: u.admin_request*/ RD_KAFKA_OP_DELETECONSUMERGROUPOFFSETS, /**< Admin: * DeleteConsumerGroupOffsets * u.admin_request */ @@ -161,6 +173,22 @@ typedef enum { RD_KAFKA_OP_GET_REBALANCE_PROTOCOL, /**< Get rebalance protocol */ RD_KAFKA_OP_LEADERS, /**< Partition leader query */ RD_KAFKA_OP_BARRIER, /**< Version barrier bump */ + RD_KAFKA_OP_SASL_REAUTH, /**< Sasl reauthentication for broker */ + RD_KAFKA_OP_DESCRIBEUSERSCRAMCREDENTIALS, /* < Admin: + DescribeUserScramCredentials + u.admin_request >*/ + RD_KAFKA_OP_ALTERUSERSCRAMCREDENTIALS, /* < Admin: + AlterUserScramCredentials + u.admin_request >*/ + RD_KAFKA_OP_LISTOFFSETS, /**< Admin: ListOffsets u.admin_request >*/ + RD_KAFKA_OP_METADATA_UPDATE, /**< Metadata update (KIP 951) **/ + RD_KAFKA_OP_SET_TELEMETRY_BROKER, /**< Set preferred broker for + telemetry. */ + RD_KAFKA_OP_TERMINATE_TELEMETRY, /**< Start termination sequence for + telemetry. */ + RD_KAFKA_OP_ELECTLEADERS, /**< Admin: + * ElectLeaders + * u.admin_request */ RD_KAFKA_OP__END } rd_kafka_op_type_t; @@ -253,6 +281,7 @@ struct rd_kafka_admin_fanout_worker_cbs; #define RD_KAFKA_OP_TYPE_ASSERT(rko, type) \ rd_assert(((rko)->rko_type & ~RD_KAFKA_OP_FLAGMASK) == (type)) + struct rd_kafka_op_s { TAILQ_ENTRY(rd_kafka_op_s) rko_link; @@ -370,6 +399,9 @@ struct rd_kafka_op_s { /* RD_KAFKA_OP_METADATA */ struct { rd_kafka_metadata_t *md; + rd_kafka_metadata_internal_t *mdi; + /* subscription version for this call */ + int32_t subscription_version; int force; /* force request regardless of outstanding * metadata requests. 
*/ } metadata; @@ -379,21 +411,21 @@ struct rd_kafka_op_s { rd_kafka_msgq_t msgq; rd_kafka_msgq_t msgq2; int do_purge2; + rd_kafka_Produce_result_t *presult; } dr; struct { - int32_t nodeid; char nodename[RD_KAFKA_NODENAME_SIZE]; } node; struct { - int64_t offset; + rd_kafka_fetch_pos_t pos; int32_t broker_id; /**< Originating broker, or -1 */ char *reason; } offset_reset; struct { - int64_t offset; + rd_kafka_fetch_pos_t pos; struct rd_kafka_cgrp_s *rkcg; } fetch_start; /* reused for SEEK */ @@ -439,13 +471,14 @@ struct rd_kafka_op_s { struct rd_kafka_admin_worker_cbs *cbs; /** Worker state */ - enum { RD_KAFKA_ADMIN_STATE_INIT, - RD_KAFKA_ADMIN_STATE_WAIT_BROKER, - RD_KAFKA_ADMIN_STATE_WAIT_CONTROLLER, - RD_KAFKA_ADMIN_STATE_WAIT_FANOUTS, - RD_KAFKA_ADMIN_STATE_CONSTRUCT_REQUEST, - RD_KAFKA_ADMIN_STATE_WAIT_RESPONSE, - RD_KAFKA_ADMIN_STATE_WAIT_BROKER_LIST, + enum { + RD_KAFKA_ADMIN_STATE_INIT, + RD_KAFKA_ADMIN_STATE_WAIT_BROKER, + RD_KAFKA_ADMIN_STATE_WAIT_CONTROLLER, + RD_KAFKA_ADMIN_STATE_WAIT_FANOUTS, + RD_KAFKA_ADMIN_STATE_CONSTRUCT_REQUEST, + RD_KAFKA_ADMIN_STATE_WAIT_RESPONSE, + RD_KAFKA_ADMIN_STATE_WAIT_BROKER_LIST, } state; int32_t broker_id; /**< Requested broker id to @@ -510,6 +543,14 @@ struct rd_kafka_op_s { char *errstr; /**< Error string, if rko_err * is set, else NULL. */ + /** Result cb for this op */ + void (*result_cb)(rd_kafka_op_t *); + + struct rd_kafka_admin_worker_cbs + *cbs; /**< Worker Callbacks + * Moved from admin request + */ + rd_list_t results; /**< Type depends on request type: * * (rd_kafka_topic_result_t *): @@ -518,6 +559,7 @@ struct rd_kafka_op_s { * * (rd_kafka_ConfigResource_t *): * AlterConfigs, DescribeConfigs + * IncrementalAlterConfigs */ void *opaque; /**< Application's opaque as set by @@ -535,16 +577,22 @@ struct rd_kafka_op_s { /**< Mock cluster command */ struct { - enum { RD_KAFKA_MOCK_CMD_TOPIC_SET_ERROR, - RD_KAFKA_MOCK_CMD_TOPIC_CREATE, - RD_KAFKA_MOCK_CMD_PART_SET_LEADER, - RD_KAFKA_MOCK_CMD_PART_SET_FOLLOWER, - RD_KAFKA_MOCK_CMD_PART_SET_FOLLOWER_WMARKS, - RD_KAFKA_MOCK_CMD_BROKER_SET_UPDOWN, - RD_KAFKA_MOCK_CMD_BROKER_SET_RTT, - RD_KAFKA_MOCK_CMD_BROKER_SET_RACK, - RD_KAFKA_MOCK_CMD_COORD_SET, - RD_KAFKA_MOCK_CMD_APIVERSION_SET, + enum { + RD_KAFKA_MOCK_CMD_TOPIC_SET_ERROR, + RD_KAFKA_MOCK_CMD_TOPIC_CREATE, + RD_KAFKA_MOCK_CMD_PART_SET_LEADER, + RD_KAFKA_MOCK_CMD_PART_SET_FOLLOWER, + RD_KAFKA_MOCK_CMD_PART_SET_FOLLOWER_WMARKS, + RD_KAFKA_MOCK_CMD_PART_PUSH_LEADER_RESPONSE, + RD_KAFKA_MOCK_CMD_BROKER_SET_UPDOWN, + RD_KAFKA_MOCK_CMD_BROKER_SET_RTT, + RD_KAFKA_MOCK_CMD_BROKER_SET_RACK, + RD_KAFKA_MOCK_CMD_BROKER_DECOMMISSION, + RD_KAFKA_MOCK_CMD_BROKER_ADD, + RD_KAFKA_MOCK_CMD_COORD_SET, + RD_KAFKA_MOCK_CMD_APIVERSION_SET, + RD_KAFKA_MOCK_CMD_REQUESTED_METRICS_SET, + RD_KAFKA_MOCK_CMD_TELEMETRY_PUSH_INTERVAL_SET, } cmd; rd_kafka_resp_err_t err; /**< Error for: @@ -555,7 +603,9 @@ struct rd_kafka_op_s { * PART_SET_FOLLOWER * PART_SET_FOLLOWER_WMARKS * BROKER_SET_RACK - * COORD_SET (key_type) */ + * COORD_SET (key_type) + * PART_PUSH_LEADER_RESPONSE + */ char *str; /**< For: * COORD_SET (key) */ int32_t partition; /**< For: @@ -563,12 +613,15 @@ struct rd_kafka_op_s { * PART_SET_FOLLOWER_WMARKS * PART_SET_LEADER * APIVERSION_SET (ApiKey) + * PART_PUSH_LEADER_RESPONSE */ int32_t broker_id; /**< For: * PART_SET_FOLLOWER * PART_SET_LEADER * BROKER_SET_UPDOWN * BROKER_SET_RACK + * BROKER_DECOMMISSION + * BROKER_ADD * COORD_SET */ int64_t lo; /**< Low offset, for: * TOPIC_CREATE (part cnt) @@ -581,7 +634,17 @@ struct rd_kafka_op_s { * 
TOPIC_CREATE (repl fact) * PART_SET_FOLLOWER_WMARKS * APIVERSION_SET (maxver) + * REQUESTED_METRICS_SET (metrics_cnt) + * TELEMETRY_PUSH_INTERVAL_SET (interval) */ + int32_t leader_id; /**< Leader id, for: + * PART_PUSH_LEADER_RESPONSE + */ + int32_t leader_epoch; /**< Leader epoch, for: + * PART_PUSH_LEADER_RESPONSE + */ + char **metrics; /**< Metrics requested, for: + * REQUESTED_METRICS_SET */ } mock; struct { @@ -644,6 +707,23 @@ struct rd_kafka_op_s { } leaders; + struct { + /** Preferred broker for telemetry. */ + rd_kafka_broker_t *rkb; + } telemetry_broker; + + struct { + /** + * Terminated and freed broker pointer, + * can only be used for pointer comparison. + */ + void *rkb; + + /** Termination callback to trigger + * on the op handler's thread. */ + void (*cb)(rd_kafka_t *rk, void *rkb); + } terminated; + } rko_u; }; @@ -712,7 +792,7 @@ rd_kafka_op_t *rd_kafka_op_new_fetch_msg(rd_kafka_msg_t **rkmp, rd_kafka_toppar_t *rktp, int32_t version, rd_kafka_buf_t *rkbuf, - int64_t offset, + rd_kafka_fetch_pos_t pos, size_t key_len, const void *key, size_t val_len, @@ -721,7 +801,7 @@ rd_kafka_op_t *rd_kafka_op_new_fetch_msg(rd_kafka_msg_t **rkmp, rd_kafka_op_t *rd_kafka_op_new_ctrl_msg(rd_kafka_toppar_t *rktp, int32_t version, rd_kafka_buf_t *rkbuf, - int64_t offset); + rd_kafka_fetch_pos_t pos); void rd_kafka_op_throttle_time(struct rd_kafka_broker_s *rkb, rd_kafka_q_t *rkq, @@ -758,4 +838,21 @@ void rd_kafka_fetch_op_app_prepare(rd_kafka_t *rk, rd_kafka_op_t *rko); (rd_kafka_replyq_is_valid(&(RKO)->rko_replyq) && \ !rd_kafka_op_version_outdated((RKO), 0)) + + +/** + * @returns the rko for a consumer message (RD_KAFKA_OP_FETCH). + */ +static RD_UNUSED rd_kafka_op_t * +rd_kafka_message2rko(rd_kafka_message_t *rkmessage) { + rd_kafka_op_t *rko = rkmessage->_private; + + if (!rko || rko->rko_type != RD_KAFKA_OP_FETCH) + return NULL; + + return rko; +} + + + #endif /* _RDKAFKA_OP_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_partition.c b/src/third_party/librdkafka/dist/src/rdkafka_partition.c index 7e3cb2d17b5..a88d77a489b 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_partition.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_partition.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2015 Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill, + * 2023, Confluent Inc. * All rights reserved. 
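/*
 * Editor's note (illustrative sketch, not upstream code): the new
 * rd_kafka_message2rko() helper above recovers the internal op from a
 * public consumer message via its _private back-pointer, returning NULL
 * unless the op is a fetch op. The toy types below model that tagged
 * back-pointer pattern in isolation; every name here is invented for
 * illustration only.
 */
#include <stddef.h>

enum op_type { OP_FETCH, OP_OTHER };

struct op {
        enum op_type type;
        /* ... op payload ... */
};

struct message {
        void *_private; /* back-pointer to the owning struct op */
};

static struct op *message2op(struct message *m) {
        struct op *op = m->_private;

        /* Only fetch ops own consumer messages; reject anything else so
         * callers can't misinterpret an unrelated op's payload. */
        if (!op || op->type != OP_FETCH)
                return NULL;
        return op;
}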
* * Redistribution and use in source and binary forms, with or without @@ -37,8 +38,10 @@ #include "rdunittest.h" -const char *rd_kafka_fetch_states[] = {"none", "stopping", "stopped", - "offset-query", "offset-wait", "active"}; +const char *rd_kafka_fetch_states[] = {"none", "stopping", + "stopped", "offset-query", + "offset-wait", "validate-epoch-wait", + "active"}; static rd_kafka_op_res_t rd_kafka_toppar_op_serve(rd_kafka_t *rk, @@ -121,6 +124,7 @@ static void rd_kafka_toppar_lag_handle_Offset(rd_kafka_t *rk, */ static void rd_kafka_toppar_consumer_lag_req(rd_kafka_toppar_t *rktp) { rd_kafka_topic_partition_list_t *partitions; + rd_kafka_topic_partition_t *rktpar; if (rktp->rktp_wait_consumer_lag_resp) return; /* Previous request not finished yet */ @@ -151,15 +155,19 @@ static void rd_kafka_toppar_consumer_lag_req(rd_kafka_toppar_t *rktp) { rktp->rktp_wait_consumer_lag_resp = 1; partitions = rd_kafka_topic_partition_list_new(1); - rd_kafka_topic_partition_list_add( - partitions, rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition) - ->offset = RD_KAFKA_OFFSET_BEGINNING; + rktpar = rd_kafka_topic_partition_list_add( + partitions, rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition); + rktpar->offset = RD_KAFKA_OFFSET_BEGINNING; + rd_kafka_topic_partition_set_current_leader_epoch( + rktpar, rktp->rktp_leader_epoch); /* Ask for oldest offset. The newest offset is automatically * propagated in FetchResponse.HighwaterMark. */ - rd_kafka_ListOffsetsRequest( - rktp->rktp_broker, partitions, RD_KAFKA_REPLYQ(rktp->rktp_ops, 0), - rd_kafka_toppar_lag_handle_Offset, rd_kafka_toppar_keep(rktp)); + rd_kafka_ListOffsetsRequest(rktp->rktp_broker, partitions, + RD_KAFKA_REPLYQ(rktp->rktp_ops, 0), + rd_kafka_toppar_lag_handle_Offset, + -1, /* don't set an absolute timeout */ + rd_kafka_toppar_keep(rktp)); rd_kafka_toppar_unlock(rktp); @@ -213,10 +221,11 @@ rd_kafka_toppar_t *rd_kafka_toppar_new0(rd_kafka_topic_t *rkt, rktp = rd_calloc(1, sizeof(*rktp)); - rktp->rktp_partition = partition; - rktp->rktp_rkt = rkt; - rktp->rktp_leader_id = -1; - rktp->rktp_broker_id = -1; + rktp->rktp_partition = partition; + rktp->rktp_rkt = rkt; + rktp->rktp_leader_id = -1; + rktp->rktp_broker_id = -1; + rktp->rktp_leader_epoch = -1; rd_interval_init(&rktp->rktp_lease_intvl); rd_interval_init(&rktp->rktp_new_lease_intvl); rd_interval_init(&rktp->rktp_new_lease_log_intvl); @@ -231,22 +240,23 @@ rd_kafka_toppar_t *rd_kafka_toppar_new0(rd_kafka_topic_t *rkt, rktp->rktp_offset_fp = NULL; rd_kafka_offset_stats_reset(&rktp->rktp_offsets); rd_kafka_offset_stats_reset(&rktp->rktp_offsets_fin); - rktp->rktp_ls_offset = RD_KAFKA_OFFSET_INVALID; - rktp->rktp_hi_offset = RD_KAFKA_OFFSET_INVALID; - rktp->rktp_lo_offset = RD_KAFKA_OFFSET_INVALID; - rktp->rktp_query_offset = RD_KAFKA_OFFSET_INVALID; - rktp->rktp_next_offset = RD_KAFKA_OFFSET_INVALID; - rktp->rktp_last_next_offset = RD_KAFKA_OFFSET_INVALID; - rktp->rktp_app_offset = RD_KAFKA_OFFSET_INVALID; - rktp->rktp_stored_offset = RD_KAFKA_OFFSET_INVALID; - rktp->rktp_committing_offset = RD_KAFKA_OFFSET_INVALID; - rktp->rktp_committed_offset = RD_KAFKA_OFFSET_INVALID; + rktp->rktp_ls_offset = RD_KAFKA_OFFSET_INVALID; + rktp->rktp_hi_offset = RD_KAFKA_OFFSET_INVALID; + rktp->rktp_lo_offset = RD_KAFKA_OFFSET_INVALID; + rd_kafka_fetch_pos_init(&rktp->rktp_query_pos); + rd_kafka_fetch_pos_init(&rktp->rktp_next_fetch_start); + rd_kafka_fetch_pos_init(&rktp->rktp_last_next_fetch_start); + rd_kafka_fetch_pos_init(&rktp->rktp_offset_validation_pos); + 
rd_kafka_fetch_pos_init(&rktp->rktp_app_pos); + rd_kafka_fetch_pos_init(&rktp->rktp_stored_pos); + rd_kafka_fetch_pos_init(&rktp->rktp_committing_pos); + rd_kafka_fetch_pos_init(&rktp->rktp_committed_pos); rd_kafka_msgq_init(&rktp->rktp_msgq); rd_kafka_msgq_init(&rktp->rktp_xmit_msgq); mtx_init(&rktp->rktp_lock, mtx_plain); rd_refcnt_init(&rktp->rktp_refcnt, 0); - rktp->rktp_fetchq = rd_kafka_q_new(rkt->rkt_rk); + rktp->rktp_fetchq = rd_kafka_consume_q_new(rkt->rkt_rk); rktp->rktp_ops = rd_kafka_q_new(rkt->rkt_rk); rktp->rktp_ops->rkq_serve = rd_kafka_toppar_op_serve; rktp->rktp_ops->rkq_opaque = rktp; @@ -300,6 +310,8 @@ static void rd_kafka_toppar_remove(rd_kafka_toppar_t *rktp) { rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, rktp); + rd_kafka_timer_stop(&rktp->rktp_rkt->rkt_rk->rk_timers, + &rktp->rktp_validate_tmr, 1 /*lock*/); rd_kafka_timer_stop(&rktp->rktp_rkt->rkt_rk->rk_timers, &rktp->rktp_offset_query_tmr, 1 /*lock*/); rd_kafka_timer_stop(&rktp->rktp_rkt->rkt_rk->rk_timers, @@ -340,6 +352,7 @@ void rd_kafka_toppar_destroy_final(rd_kafka_toppar_t *rktp) { rd_refcnt_destroy(&rktp->rktp_refcnt); + rd_free(rktp->rktp_stored_metadata); rd_free(rktp); } @@ -347,13 +360,10 @@ void rd_kafka_toppar_destroy_final(rd_kafka_toppar_t *rktp) { /** * Set toppar fetching state. * - * Locality: broker thread - * Locks: rd_kafka_toppar_lock() MUST be held. + * @locality any + * @locks_required rd_kafka_toppar_lock() MUST be held. */ void rd_kafka_toppar_set_fetch_state(rd_kafka_toppar_t *rktp, int fetch_state) { - rd_kafka_assert(NULL, - thrd_is_current(rktp->rktp_rkt->rkt_rk->rk_thread)); - if ((int)rktp->rktp_fetch_state == fetch_state) return; @@ -366,15 +376,24 @@ void rd_kafka_toppar_set_fetch_state(rd_kafka_toppar_t *rktp, int fetch_state) { rktp->rktp_fetch_state = fetch_state; - if (fetch_state == RD_KAFKA_TOPPAR_FETCH_ACTIVE) - rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, - CONSUMER | RD_KAFKA_DBG_TOPIC, "FETCH", - "Partition %.*s [%" PRId32 - "] start fetching " - "at offset %s", - RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), - rktp->rktp_partition, - rd_kafka_offset2str(rktp->rktp_next_offset)); + if (fetch_state == RD_KAFKA_TOPPAR_FETCH_ACTIVE) { + rktp->rktp_ts_fetch_backoff = 0; + + /* Wake-up broker thread which might be idling on IO */ + if (rktp->rktp_broker) + rd_kafka_broker_wakeup(rktp->rktp_broker, + "fetch start"); + + rd_kafka_dbg( + rktp->rktp_rkt->rkt_rk, CONSUMER | RD_KAFKA_DBG_TOPIC, + "FETCH", + "Partition %.*s [%" PRId32 "] start fetching at %s", + RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), + rktp->rktp_partition, + rd_kafka_fetch_pos2str( + rd_kafka_toppar_fetch_decide_next_fetch_start_pos( + rktp))); + } } @@ -866,6 +885,11 @@ void rd_kafka_msgq_insert_msgq(rd_kafka_msgq_t *destq, * @param incr_retry Increment retry count for messages. * @param max_retries Maximum retries allowed per message. * @param backoff Absolute retry backoff for retried messages. + * @param exponential_backoff If true the backoff should be exponential with + * 2**(retry_count - 1)*retry_ms with jitter. The + * \p backoff is ignored. + * @param retry_ms The retry ms used for exponential backoff calculation + * @param retry_max_ms The max backoff limit for exponential backoff calculation * * @returns 0 if all messages were retried, or 1 if some messages * could not be retried. 
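/*
 * Editor's note (illustrative sketch, not upstream code): the
 * exponential-backoff parameters documented above combine as
 *   backoff = (jitter_pct / 100) * 2^(retries - 1) * retry_ms,
 * capped at retry_max_ms, where jitter_pct is drawn uniformly from
 * [100 - RD_KAFKA_RETRY_JITTER_PERCENT, 100 + RD_KAFKA_RETRY_JITTER_PERCENT].
 * The standalone function below reproduces that arithmetic in plain
 * milliseconds; rand() stands in for librdkafka's rd_jitter(), and the
 * jitter constant value is an assumption for the sketch.
 */
#include <stdlib.h>

#define JITTER_PERCENT 20 /* stand-in for RD_KAFKA_RETRY_JITTER_PERCENT */

static long long retry_backoff_ms(int retries, int retry_ms,
                                  int retry_max_ms) {
        /* Uniform jitter percentage in [100 - J, 100 + J]. */
        int jitter =
            100 - JITTER_PERCENT + rand() % (2 * JITTER_PERCENT + 1);
        long long backoff;

        /* Some paths (e.g. failed Produce requests) don't increment the
         * retry count; those fall back to the base interval. */
        if (retries > 0)
                backoff = (1LL << (retries - 1)) * retry_ms;
        else
                backoff = retry_ms;

        backoff = backoff * jitter / 100; /* apply jitter */
        if (backoff > retry_max_ms)
                backoff = retry_max_ms;  /* cap the exponential growth */
        return backoff;
}
/* Design note: the patch computes one shared "now" timestamp per batch so
 * all messages of an idempotent batch share the same absolute backoff. */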
@@ -876,16 +900,28 @@ int rd_kafka_retry_msgq(rd_kafka_msgq_t *destq, int max_retries, rd_ts_t backoff, rd_kafka_msg_status_t status, - int (*cmp)(const void *a, const void *b)) { + int (*cmp)(const void *a, const void *b), + rd_bool_t exponential_backoff, + int retry_ms, + int retry_max_ms) { rd_kafka_msgq_t retryable = RD_KAFKA_MSGQ_INITIALIZER(retryable); rd_kafka_msg_t *rkm, *tmp; - + rd_ts_t now; + int64_t jitter = rd_jitter(100 - RD_KAFKA_RETRY_JITTER_PERCENT, + 100 + RD_KAFKA_RETRY_JITTER_PERCENT); /* Scan through messages to see which ones are eligible for retry, * move the retryable ones to temporary queue and * set backoff time for first message and optionally * increase retry count for each message. * Sorted insert is not necessary since the original order - * srcq order is maintained. */ + * srcq order is maintained. + * + * Start timestamp for calculating backoff is common, + * to avoid that messages from the same batch + * have different backoff, as they need to be retried + * by reconstructing the same batch, when idempotency is + * enabled. */ + now = rd_clock(); TAILQ_FOREACH_SAFE(rkm, &srcq->rkmq_msgs, rkm_link, tmp) { if (rkm->rkm_u.producer.retries + incr_retry > max_retries) continue; @@ -893,8 +929,25 @@ int rd_kafka_retry_msgq(rd_kafka_msgq_t *destq, rd_kafka_msgq_deq(srcq, rkm, 1); rd_kafka_msgq_enq(&retryable, rkm); - rkm->rkm_u.producer.ts_backoff = backoff; rkm->rkm_u.producer.retries += incr_retry; + if (exponential_backoff) { + /* In some cases, like failed Produce requests do not + * increment the retry count, see + * rd_kafka_handle_Produce_error. */ + if (rkm->rkm_u.producer.retries > 0) + backoff = + (1 << (rkm->rkm_u.producer.retries - 1)) * + retry_ms; + else + backoff = retry_ms; + /* Multiplied by 10 as backoff should be in nano + * seconds. */ + backoff = jitter * backoff * 10; + if (backoff > retry_max_ms * 1000) + backoff = retry_max_ms * 1000; + backoff = now + backoff; + } + rkm->rkm_u.producer.ts_backoff = backoff; /* Don't downgrade a message from any form of PERSISTED * to NOT_PERSISTED, since the original cause of indicating @@ -933,17 +986,21 @@ int rd_kafka_toppar_retry_msgq(rd_kafka_toppar_t *rktp, rd_kafka_msgq_t *rkmq, int incr_retry, rd_kafka_msg_status_t status) { - rd_kafka_t *rk = rktp->rktp_rkt->rkt_rk; - rd_ts_t backoff = rd_clock() + (rk->rk_conf.retry_backoff_ms * 1000); + rd_kafka_t *rk = rktp->rktp_rkt->rkt_rk; + int retry_ms = rk->rk_conf.retry_backoff_ms; + int retry_max_ms = rk->rk_conf.retry_backoff_max_ms; int r; if (rd_kafka_terminating(rk)) return 1; rd_kafka_toppar_lock(rktp); + /* Exponential backoff applied. */ r = rd_kafka_retry_msgq(&rktp->rktp_msgq, rkmq, incr_retry, - rk->rk_conf.max_retries, backoff, status, - rktp->rktp_rkt->rkt_conf.msg_order_cmp); + rk->rk_conf.max_retries, + 0 /* backoff will be calculated */, status, + rktp->rktp_rkt->rkt_conf.msg_order_cmp, rd_true, + retry_ms, retry_max_ms); rd_kafka_toppar_unlock(rktp); return r; @@ -962,7 +1019,71 @@ void rd_kafka_toppar_insert_msgq(rd_kafka_toppar_t *rktp, rd_kafka_toppar_unlock(rktp); } +/** + * @brief Purge internal fetch queue if toppar is stopped + * (RD_KAFKA_TOPPAR_FETCH_STOPPED) and removed from the cluster + * (RD_KAFKA_TOPPAR_F_REMOVE). Will be called from different places as it's + * removed starting from a metadata response and stopped from a rebalance or a + * consumer close. + * + * @remark Avoids circular dependencies in from `rktp_fetchq` ops to the same + * toppar that stop destroying a consumer. 
+ * + * @locks rd_kafka_toppar_lock() MUST be held + */ +void rd_kafka_toppar_purge_internal_fetch_queue_maybe(rd_kafka_toppar_t *rktp) { + rd_kafka_q_t *rkq; + rkq = rktp->rktp_fetchq; + mtx_lock(&rkq->rkq_lock); + if (rktp->rktp_flags & RD_KAFKA_TOPPAR_F_REMOVE && + !rktp->rktp_fetchq->rkq_fwdq) { + rd_kafka_op_t *rko; + int cnt = 0, barrier_cnt = 0, message_cnt = 0, other_cnt = 0; + /* Partition is being removed from the cluster and it's stopped, + * so rktp->rktp_fetchq->rkq_fwdq is NULL. + * Purge remaining operations in rktp->rktp_fetchq->rkq_q, + * while holding lock, to avoid circular references */ + rko = TAILQ_FIRST(&rkq->rkq_q); + while (rko) { + if (rko->rko_type != RD_KAFKA_OP_BARRIER && + rko->rko_type != RD_KAFKA_OP_FETCH) { + rd_kafka_log( + rktp->rktp_rkt->rkt_rk, LOG_WARNING, + "PARTDEL", + "Purging toppar fetch queue buffer op" + "with unexpected type: %s", + rd_kafka_op2str(rko->rko_type)); + } + + if (rko->rko_type == RD_KAFKA_OP_BARRIER) + barrier_cnt++; + else if (rko->rko_type == RD_KAFKA_OP_FETCH) + message_cnt++; + else + other_cnt++; + + rko = TAILQ_NEXT(rko, rko_link); + cnt++; + } + + if (cnt) { + rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, CGRP, "PARTDEL", + "Purge toppar fetch queue buffer " + "containing %d op(s) " + "(%d barrier(s), %d message(s), %d other)" + " to avoid " + "circular references", + cnt, barrier_cnt, message_cnt, other_cnt); + rd_kafka_q_purge0(rktp->rktp_fetchq, rd_false); + } else { + rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, CGRP, "PARTDEL", + "Not purging toppar fetch queue buffer." + " No ops present in the buffer."); + } + } + mtx_unlock(&rkq->rkq_lock); +} /** * Helper method for purging queues when removing a toppar. @@ -1114,7 +1235,8 @@ void rd_kafka_toppar_broker_delegate(rd_kafka_toppar_t *rktp, /* Undelegated toppars are delgated to the internal * broker for bookkeeping. */ - if (!rkb && !rd_kafka_terminating(rk)) { + if (!rd_kafka_terminating(rk) && + (!rkb || rd_kafka_broker_termination_in_progress(rkb))) { rkb = rd_kafka_broker_internal(rk); internal_fallback = 1; } @@ -1185,7 +1307,8 @@ void rd_kafka_toppar_offset_commit_result( rd_kafka_toppar_lock(rktp); if (!err) - rktp->rktp_committed_offset = offsets->elems[0].offset; + rktp->rktp_committed_pos = + rd_kafka_topic_partition_get_fetch_pos(&offsets->elems[0]); /* When stopping toppars: * Final commit is now done (or failed), propagate. */ @@ -1206,53 +1329,47 @@ void rd_kafka_toppar_offset_commit_result( * Locks: toppar_lock(rktp) must be held */ void rd_kafka_toppar_next_offset_handle(rd_kafka_toppar_t *rktp, - int64_t Offset) { + rd_kafka_fetch_pos_t next_pos) { - if (RD_KAFKA_OFFSET_IS_LOGICAL(Offset)) { + if (RD_KAFKA_OFFSET_IS_LOGICAL(next_pos.offset)) { /* Offset storage returned logical offset (e.g. "end"), * look it up. */ /* Save next offset, even if logical, so that e.g., * assign(BEGINNING) survives a pause+resume, etc. * See issue #2105. 
*/ - rktp->rktp_next_offset = Offset; + rd_kafka_toppar_set_next_fetch_position(rktp, next_pos); - rd_kafka_offset_reset(rktp, RD_KAFKA_NODEID_UA, Offset, + rd_kafka_offset_reset(rktp, RD_KAFKA_NODEID_UA, next_pos, RD_KAFKA_RESP_ERR_NO_ERROR, "update"); return; } /* Adjust by TAIL count if, if wanted */ - if (rktp->rktp_query_offset <= RD_KAFKA_OFFSET_TAIL_BASE) { - int64_t orig_Offset = Offset; - int64_t tail_cnt = - llabs(rktp->rktp_query_offset - RD_KAFKA_OFFSET_TAIL_BASE); + if (rktp->rktp_query_pos.offset <= RD_KAFKA_OFFSET_TAIL_BASE) { + int64_t orig_offset = next_pos.offset; + int64_t tail_cnt = llabs(rktp->rktp_query_pos.offset - + RD_KAFKA_OFFSET_TAIL_BASE); - if (tail_cnt > Offset) - Offset = 0; + if (tail_cnt > next_pos.offset) + next_pos.offset = 0; else - Offset -= tail_cnt; + next_pos.offset -= tail_cnt; rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "OFFSET", "OffsetReply for topic %s [%" PRId32 "]: " "offset %" PRId64 ": adjusting for " - "OFFSET_TAIL(%" PRId64 - "): " - "effective offset %" PRId64, + "OFFSET_TAIL(%" PRId64 "): effective %s", rktp->rktp_rkt->rkt_topic->str, - rktp->rktp_partition, orig_Offset, tail_cnt, - Offset); + rktp->rktp_partition, orig_offset, tail_cnt, + rd_kafka_fetch_pos2str(next_pos)); } - rktp->rktp_next_offset = Offset; + rd_kafka_toppar_set_next_fetch_position(rktp, next_pos); rd_kafka_toppar_set_fetch_state(rktp, RD_KAFKA_TOPPAR_FETCH_ACTIVE); - - /* Wake-up broker thread which might be idling on IO */ - if (rktp->rktp_broker) - rd_kafka_broker_wakeup(rktp->rktp_broker, "ready to fetch"); } @@ -1278,7 +1395,7 @@ void rd_kafka_toppar_offset_fetch(rd_kafka_toppar_t *rktp, part = rd_kafka_topic_partition_list_new(1); rd_kafka_topic_partition_list_add0(__FUNCTION__, __LINE__, part, rktp->rktp_rkt->rkt_topic->str, - rktp->rktp_partition, rktp); + rktp->rktp_partition, rktp, NULL); rko = rd_kafka_op_new(RD_KAFKA_OP_OFFSET_FETCH); rko->rko_rktp = rd_kafka_toppar_keep(rktp); @@ -1309,12 +1426,11 @@ static void rd_kafka_toppar_handle_Offset(rd_kafka_t *rk, rd_kafka_toppar_t *rktp = opaque; rd_kafka_topic_partition_list_t *offsets; rd_kafka_topic_partition_t *rktpar; - int64_t Offset; int actions = 0; rd_kafka_toppar_lock(rktp); /* Drop reply from previous partition leader */ - if (err != RD_KAFKA_RESP_ERR__DESTROY && rktp->rktp_broker != rkb) + if (err != RD_KAFKA_RESP_ERR__DESTROY && rktp->rktp_leader != rkb) err = RD_KAFKA_RESP_ERR__OUTDATED; rd_kafka_toppar_unlock(rktp); @@ -1342,7 +1458,7 @@ static void rd_kafka_toppar_handle_Offset(rd_kafka_t *rk, if (!err && !(rktpar = rd_kafka_topic_partition_list_find( offsets, rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition))) { - /* Request partition not found in response */ + /* Requested partition not found in response */ err = RD_KAFKA_RESP_ERR__UNKNOWN_PARTITION; actions |= RD_KAFKA_ERR_ACTION_PERMANENT; } @@ -1385,19 +1501,19 @@ static void rd_kafka_toppar_handle_Offset(rd_kafka_t *rk, * and signal error back to application. */ rd_kafka_offset_reset(rktp, rkb->rkb_nodeid, - rktp->rktp_query_offset, err, + rktp->rktp_query_pos, err, "failed to query logical offset"); rd_kafka_consumer_err( rktp->rktp_fetchq, rkb->rkb_nodeid, err, 0, NULL, rktp, - (rktp->rktp_query_offset <= + (rktp->rktp_query_pos.offset <= RD_KAFKA_OFFSET_TAIL_BASE - ? rktp->rktp_query_offset - + ? 
rktp->rktp_query_pos.offset - RD_KAFKA_OFFSET_TAIL_BASE - : rktp->rktp_query_offset), + : rktp->rktp_query_pos.offset), "Failed to query logical offset %s: %s", - rd_kafka_offset2str(rktp->rktp_query_offset), + rd_kafka_offset2str(rktp->rktp_query_pos.offset), rd_kafka_err2str(err)); } else { @@ -1407,7 +1523,7 @@ static void rd_kafka_toppar_handle_Offset(rd_kafka_t *rk, rd_snprintf( tmp, sizeof(tmp), "failed to query logical offset %s: %s", - rd_kafka_offset2str(rktp->rktp_query_offset), + rd_kafka_offset2str(rktp->rktp_query_pos.offset), rd_kafka_err2str(err)); rd_kafka_toppar_offset_retry(rktp, 500, tmp); @@ -1419,21 +1535,27 @@ static void rd_kafka_toppar_handle_Offset(rd_kafka_t *rk, return; } - Offset = rktpar->offset; - rd_kafka_topic_partition_list_destroy(offsets); rd_kafka_toppar_lock(rktp); rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "OFFSET", "Offset %s request for %.*s [%" PRId32 "] " - "returned offset %s (%" PRId64 ")", - rd_kafka_offset2str(rktp->rktp_query_offset), + "returned offset %s (%" PRId64 ") leader epoch %" PRId32, + rd_kafka_offset2str(rktp->rktp_query_pos.offset), RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), - rktp->rktp_partition, rd_kafka_offset2str(Offset), Offset); + rktp->rktp_partition, rd_kafka_offset2str(rktpar->offset), + rktpar->offset, + rd_kafka_topic_partition_get_leader_epoch(rktpar)); - rd_kafka_toppar_next_offset_handle(rktp, Offset); + + rd_kafka_toppar_next_offset_handle( + rktp, RD_KAFKA_FETCH_POS( + rktpar->offset, + rd_kafka_topic_partition_get_leader_epoch(rktpar))); rd_kafka_toppar_unlock(rktp); + rd_kafka_topic_partition_list_destroy(offsets); + rd_kafka_toppar_destroy(rktp); /* from request.opaque */ } @@ -1462,12 +1584,12 @@ static void rd_kafka_toppar_offset_retry(rd_kafka_toppar_t *rktp, (tmr_next == -1 || tmr_next > rd_clock() + (backoff_ms * 1000ll)); rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "OFFSET", - "%s [%" PRId32 "]: %s: %s for offset %s", + "%s [%" PRId32 "]: %s: %s for %s", rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, reason, restart_tmr ? "(re)starting offset query timer" : "offset query timer already scheduled", - rd_kafka_offset2str(rktp->rktp_query_offset)); + rd_kafka_fetch_pos2str(rktp->rktp_query_pos)); rd_kafka_toppar_set_fetch_state(rktp, RD_KAFKA_TOPPAR_FETCH_OFFSET_QUERY); @@ -1491,14 +1613,14 @@ static void rd_kafka_toppar_offset_retry(rd_kafka_toppar_t *rktp, * Locks: toppar_lock() must be held */ void rd_kafka_toppar_offset_request(rd_kafka_toppar_t *rktp, - int64_t query_offset, + rd_kafka_fetch_pos_t query_pos, int backoff_ms) { rd_kafka_broker_t *rkb; rd_kafka_assert(NULL, thrd_is_current(rktp->rktp_rkt->rkt_rk->rk_thread)); - rkb = rktp->rktp_broker; + rkb = rktp->rktp_leader; if (!backoff_ms && (!rkb || rkb->rkb_source == RD_KAFKA_INTERNAL)) backoff_ms = 500; @@ -1515,7 +1637,7 @@ void rd_kafka_toppar_offset_request(rd_kafka_toppar_t *rktp, &rktp->rktp_offset_query_tmr, 1 /*lock*/); - if (query_offset == RD_KAFKA_OFFSET_STORED && + if (query_pos.offset == RD_KAFKA_OFFSET_STORED && rktp->rktp_rkt->rkt_conf.offset_store_method == RD_KAFKA_OFFSET_METHOD_BROKER) { /* @@ -1528,6 +1650,7 @@ void rd_kafka_toppar_offset_request(rd_kafka_toppar_t *rktp, } else { rd_kafka_topic_partition_list_t *offsets; + rd_kafka_topic_partition_t *rktpar; /* * Look up logical offset (end,beginning,tail,..) 
@@ -1539,24 +1662,28 @@ void rd_kafka_toppar_offset_request(rd_kafka_toppar_t *rktp, "offset %s (opv %d)", RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), rktp->rktp_partition, - rd_kafka_offset2str(query_offset), + rd_kafka_offset2str(query_pos.offset), rktp->rktp_op_version); rd_kafka_toppar_keep(rktp); /* refcnt for OffsetRequest opaque*/ - if (query_offset <= RD_KAFKA_OFFSET_TAIL_BASE) - query_offset = RD_KAFKA_OFFSET_END; + if (query_pos.offset <= RD_KAFKA_OFFSET_TAIL_BASE) + query_pos.offset = RD_KAFKA_OFFSET_END; offsets = rd_kafka_topic_partition_list_new(1); - rd_kafka_topic_partition_list_add( + rktpar = rd_kafka_topic_partition_list_add( offsets, rktp->rktp_rkt->rkt_topic->str, - rktp->rktp_partition) - ->offset = query_offset; + rktp->rktp_partition); + rd_kafka_topic_partition_set_from_fetch_pos(rktpar, query_pos); + rd_kafka_topic_partition_set_current_leader_epoch( + rktpar, rktp->rktp_leader_epoch); rd_kafka_ListOffsetsRequest( rkb, offsets, RD_KAFKA_REPLYQ(rktp->rktp_ops, rktp->rktp_op_version), - rd_kafka_toppar_handle_Offset, rktp); + rd_kafka_toppar_handle_Offset, + -1, /* don't set an absolute timeout */ + rktp); rd_kafka_topic_partition_list_destroy(offsets); } @@ -1573,7 +1700,7 @@ void rd_kafka_toppar_offset_request(rd_kafka_toppar_t *rktp, * Locks: none */ static void rd_kafka_toppar_fetch_start(rd_kafka_toppar_t *rktp, - int64_t offset, + rd_kafka_fetch_pos_t pos, rd_kafka_op_t *rko_orig) { rd_kafka_cgrp_t *rkcg = rko_orig->rko_u.fetch_start.rkcg; rd_kafka_resp_err_t err = 0; @@ -1584,11 +1711,11 @@ static void rd_kafka_toppar_fetch_start(rd_kafka_toppar_t *rktp, rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "FETCH", "Start fetch for %.*s [%" PRId32 "] in " - "state %s at offset %s (v%" PRId32 ")", + "state %s at %s (v%" PRId32 ")", RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), rktp->rktp_partition, rd_kafka_fetch_states[rktp->rktp_fetch_state], - rd_kafka_offset2str(offset), version); + rd_kafka_fetch_pos2str(pos), version); if (rktp->rktp_fetch_state == RD_KAFKA_TOPPAR_FETCH_STOPPING) { err = RD_KAFKA_RESP_ERR__PREV_IN_PROGRESS; @@ -1607,29 +1734,25 @@ static void rd_kafka_toppar_fetch_start(rd_kafka_toppar_t *rktp, } - if (offset == RD_KAFKA_OFFSET_BEGINNING || - offset == RD_KAFKA_OFFSET_END || - offset <= RD_KAFKA_OFFSET_TAIL_BASE) { - rd_kafka_toppar_next_offset_handle(rktp, offset); + if (pos.offset == RD_KAFKA_OFFSET_BEGINNING || + pos.offset == RD_KAFKA_OFFSET_END || + pos.offset <= RD_KAFKA_OFFSET_TAIL_BASE) { + rd_kafka_toppar_next_offset_handle(rktp, pos); - } else if (offset == RD_KAFKA_OFFSET_STORED) { + } else if (pos.offset == RD_KAFKA_OFFSET_STORED) { rd_kafka_offset_store_init(rktp); - } else if (offset == RD_KAFKA_OFFSET_INVALID) { - rd_kafka_offset_reset(rktp, RD_KAFKA_NODEID_UA, offset, + } else if (pos.offset == RD_KAFKA_OFFSET_INVALID) { + rd_kafka_offset_reset(rktp, RD_KAFKA_NODEID_UA, pos, RD_KAFKA_RESP_ERR__NO_OFFSET, "no previously committed offset " "available"); } else { - rktp->rktp_next_offset = offset; + rd_kafka_toppar_set_next_fetch_position(rktp, pos); + rd_kafka_toppar_set_fetch_state(rktp, RD_KAFKA_TOPPAR_FETCH_ACTIVE); - - /* Wake-up broker thread which might be idling on IO */ - if (rktp->rktp_broker) - rd_kafka_broker_wakeup(rktp->rktp_broker, - "fetch start"); } rktp->rktp_offsets_fin.eof_offset = RD_KAFKA_OFFSET_INVALID; @@ -1665,7 +1788,8 @@ void rd_kafka_toppar_fetch_stopped(rd_kafka_toppar_t *rktp, rd_kafka_toppar_set_fetch_state(rktp, RD_KAFKA_TOPPAR_FETCH_STOPPED); - rktp->rktp_app_offset = RD_KAFKA_OFFSET_INVALID; + 
rktp->rktp_app_pos.offset = RD_KAFKA_OFFSET_INVALID; + rktp->rktp_app_pos.leader_epoch = -1; if (rktp->rktp_cgrp) { /* Detach toppar from cgrp */ @@ -1738,7 +1862,7 @@ void rd_kafka_toppar_fetch_stop(rd_kafka_toppar_t *rktp, * Locality: toppar handler thread */ void rd_kafka_toppar_seek(rd_kafka_toppar_t *rktp, - int64_t offset, + rd_kafka_fetch_pos_t pos, rd_kafka_op_t *rko_orig) { rd_kafka_resp_err_t err = 0; int32_t version = rko_orig->rko_version; @@ -1746,11 +1870,9 @@ void rd_kafka_toppar_seek(rd_kafka_toppar_t *rktp, rd_kafka_toppar_lock(rktp); rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "FETCH", - "Seek %.*s [%" PRId32 - "] to offset %s " - "in state %s (v%" PRId32 ")", + "Seek %.*s [%" PRId32 "] to %s in state %s (v%" PRId32 ")", RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), - rktp->rktp_partition, rd_kafka_offset2str(offset), + rktp->rktp_partition, rd_kafka_fetch_pos2str(pos), rd_kafka_fetch_states[rktp->rktp_fetch_state], version); @@ -1760,7 +1882,7 @@ void rd_kafka_toppar_seek(rd_kafka_toppar_t *rktp, } else if (!RD_KAFKA_TOPPAR_FETCH_IS_STARTED(rktp->rktp_fetch_state)) { err = RD_KAFKA_RESP_ERR__STATE; goto err_reply; - } else if (offset == RD_KAFKA_OFFSET_STORED) { + } else if (pos.offset == RD_KAFKA_OFFSET_STORED) { err = RD_KAFKA_RESP_ERR__INVALID_ARG; goto err_reply; } @@ -1770,23 +1892,22 @@ void rd_kafka_toppar_seek(rd_kafka_toppar_t *rktp, /* Reset app offsets since seek()ing is analogue to a (re)assign(), * and we want to avoid using the current app offset on resume() * following a seek (#3567). */ - rktp->rktp_app_offset = RD_KAFKA_OFFSET_INVALID; + rktp->rktp_app_pos.offset = RD_KAFKA_OFFSET_INVALID; + rktp->rktp_app_pos.leader_epoch = -1; /* Abort pending offset lookups. */ if (rktp->rktp_fetch_state == RD_KAFKA_TOPPAR_FETCH_OFFSET_QUERY) rd_kafka_timer_stop(&rktp->rktp_rkt->rkt_rk->rk_timers, &rktp->rktp_offset_query_tmr, 1 /*lock*/); - if (RD_KAFKA_OFFSET_IS_LOGICAL(offset)) - rd_kafka_toppar_next_offset_handle(rktp, offset); - else { - rktp->rktp_next_offset = offset; - rd_kafka_toppar_set_fetch_state(rktp, - RD_KAFKA_TOPPAR_FETCH_ACTIVE); - - /* Wake-up broker thread which might be idling on IO */ - if (rktp->rktp_broker) - rd_kafka_broker_wakeup(rktp->rktp_broker, "seek done"); + if (pos.offset <= 0 || pos.validated) { + rd_kafka_toppar_next_offset_handle(rktp, pos); + } else { + rd_kafka_toppar_set_fetch_state( + rktp, RD_KAFKA_TOPPAR_FETCH_VALIDATE_EPOCH_WAIT); + rd_kafka_toppar_set_next_fetch_position(rktp, pos); + rd_kafka_toppar_set_offset_validation_position(rktp, pos); + rd_kafka_offset_validate(rktp, "seek"); } /* Signal back to caller thread that seek has commenced, or err */ @@ -1798,10 +1919,9 @@ err_reply: rko = rd_kafka_op_new(RD_KAFKA_OP_SEEK | RD_KAFKA_OP_REPLY); - rko->rko_err = err; - rko->rko_u.fetch_start.offset = - rko_orig->rko_u.fetch_start.offset; - rko->rko_rktp = rd_kafka_toppar_keep(rktp); + rko->rko_err = err; + rko->rko_u.fetch_start.pos = rko_orig->rko_u.fetch_start.pos; + rko->rko_rktp = rd_kafka_toppar_keep(rktp); rd_kafka_replyq_enq(&rko_orig->rko_replyq, rko, 0); } @@ -1848,19 +1968,18 @@ static void rd_kafka_toppar_pause_resume(rd_kafka_toppar_t *rktp, if (rk->rk_type == RD_KAFKA_CONSUMER) { /* Save offset of last consumed message+1 as the * next message to fetch on resume. 
*/ - if (rktp->rktp_app_offset != RD_KAFKA_OFFSET_INVALID) { - rktp->rktp_next_offset = rktp->rktp_app_offset; - } + if (rktp->rktp_app_pos.offset != + RD_KAFKA_OFFSET_INVALID) + rd_kafka_toppar_set_next_fetch_position( + rktp, rktp->rktp_app_pos); rd_kafka_dbg( rk, TOPIC, pause ? "PAUSE" : "RESUME", - "%s %s [%" PRId32 - "]: at offset %s " - "(state %s, v%d)", + "%s %s [%" PRId32 "]: at %s (state %s, v%d)", pause ? "Pause" : "Resume", rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, - rd_kafka_offset2str(rktp->rktp_next_offset), + rd_kafka_fetch_pos2str(rktp->rktp_next_fetch_start), rd_kafka_fetch_states[rktp->rktp_fetch_state], version); } else { @@ -1882,16 +2001,14 @@ static void rd_kafka_toppar_pause_resume(rd_kafka_toppar_t *rktp, if (rk->rk_type == RD_KAFKA_CONSUMER) { rd_kafka_dbg( rk, TOPIC, pause ? "PAUSE" : "RESUME", - "%s %s [%" PRId32 - "]: at offset %s " - "(state %s, v%d)", + "%s %s [%" PRId32 "]: at %s (state %s, v%d)", rktp->rktp_fetch_state == RD_KAFKA_TOPPAR_FETCH_ACTIVE ? "Resuming" : "Not resuming stopped", rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, - rd_kafka_offset2str(rktp->rktp_next_offset), + rd_kafka_fetch_pos2str(rktp->rktp_next_fetch_start), rd_kafka_fetch_states[rktp->rktp_fetch_state], version); @@ -1907,9 +2024,10 @@ static void rd_kafka_toppar_pause_resume(rd_kafka_toppar_t *rktp, RD_KAFKA_TOPPAR_FETCH_ACTIVE || rktp->rktp_fetch_state == RD_KAFKA_TOPPAR_FETCH_OFFSET_WAIT) && - rktp->rktp_next_offset == RD_KAFKA_OFFSET_INVALID) + rktp->rktp_next_fetch_start.offset == + RD_KAFKA_OFFSET_INVALID) rd_kafka_toppar_next_offset_handle( - rktp, rktp->rktp_next_offset); + rktp, rktp->rktp_next_fetch_start); } else rd_kafka_dbg( @@ -1995,7 +2113,7 @@ static rd_kafka_op_res_t rd_kafka_toppar_op_serve(rd_kafka_t *rk, switch ((int)rko->rko_type) { case RD_KAFKA_OP_FETCH_START: - rd_kafka_toppar_fetch_start(rktp, rko->rko_u.fetch_start.offset, + rd_kafka_toppar_fetch_start(rktp, rko->rko_u.fetch_start.pos, rko); break; @@ -2004,7 +2122,7 @@ static rd_kafka_op_res_t rd_kafka_toppar_op_serve(rd_kafka_t *rk, break; case RD_KAFKA_OP_SEEK: - rd_kafka_toppar_seek(rktp, rko->rko_u.fetch_start.offset, rko); + rd_kafka_toppar_seek(rktp, rko->rko_u.fetch_start.pos, rko); break; case RD_KAFKA_OP_PAUSE: @@ -2022,16 +2140,19 @@ static rd_kafka_op_res_t rd_kafka_toppar_op_serve(rd_kafka_t *rk, /* OffsetFetch reply */ rd_kafka_topic_partition_list_t *offsets = rko->rko_u.offset_fetch.partitions; - int64_t offset = RD_KAFKA_OFFSET_INVALID; + rd_kafka_fetch_pos_t pos = {RD_KAFKA_OFFSET_INVALID, -1}; + + rktp = rd_kafka_topic_partition_get_toppar( + rk, &offsets->elems[0], rd_true /*create-on-miss*/); - rktp = offsets->elems[0]._private; if (!rko->rko_err) { /* Request succeeded but per-partition might have failed */ rko->rko_err = offsets->elems[0].err; - offset = offsets->elems[0].offset; + pos = rd_kafka_topic_partition_get_fetch_pos( + &offsets->elems[0]); } - offsets->elems[0]._private = NULL; + rd_kafka_topic_partition_list_destroy(offsets); rko->rko_u.offset_fetch.partitions = NULL; @@ -2067,31 +2188,30 @@ static rd_kafka_op_res_t rd_kafka_toppar_op_serve(rd_kafka_t *rk, "offsets from brokers: %s", rd_kafka_err2str(rko->rko_err)); + /* Refcount from get_toppar() */ rd_kafka_toppar_destroy(rktp); break; } rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "OFFSET", - "%.*s [%" PRId32 - "]: OffsetFetch returned " - "offset %s (%" PRId64 ")", + "%.*s [%" PRId32 "]: OffsetFetch returned %s", RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), - rktp->rktp_partition, 
rd_kafka_offset2str(offset), - offset); + rktp->rktp_partition, rd_kafka_fetch_pos2str(pos)); - if (offset > 0) - rktp->rktp_committed_offset = offset; + if (pos.offset > 0) + rktp->rktp_committed_pos = pos; - if (offset >= 0) - rd_kafka_toppar_next_offset_handle(rktp, offset); + if (pos.offset >= 0) + rd_kafka_toppar_next_offset_handle(rktp, pos); else - rd_kafka_offset_reset(rktp, RD_KAFKA_NODEID_UA, offset, + rd_kafka_offset_reset(rktp, RD_KAFKA_NODEID_UA, pos, RD_KAFKA_RESP_ERR__NO_OFFSET, "no previously committed offset " "available"); rd_kafka_toppar_unlock(rktp); + /* Refcount from get_toppar() */ rd_kafka_toppar_destroy(rktp); } break; @@ -2130,7 +2250,7 @@ static void rd_kafka_toppar_op0(rd_kafka_toppar_t *rktp, static void rd_kafka_toppar_op(rd_kafka_toppar_t *rktp, rd_kafka_op_type_t type, int32_t version, - int64_t offset, + rd_kafka_fetch_pos_t pos, rd_kafka_cgrp_t *rkcg, rd_kafka_replyq_t replyq) { rd_kafka_op_t *rko; @@ -2140,7 +2260,7 @@ static void rd_kafka_toppar_op(rd_kafka_toppar_t *rktp, if (type == RD_KAFKA_OP_FETCH_START || type == RD_KAFKA_OP_SEEK) { if (rkcg) rko->rko_u.fetch_start.rkcg = rkcg; - rko->rko_u.fetch_start.offset = offset; + rko->rko_u.fetch_start.pos = pos; } rd_kafka_toppar_op0(rktp, rko, replyq); @@ -2158,7 +2278,7 @@ static void rd_kafka_toppar_op(rd_kafka_toppar_t *rktp, * This is the thread-safe interface that can be called from any thread. */ rd_kafka_resp_err_t rd_kafka_toppar_op_fetch_start(rd_kafka_toppar_t *rktp, - int64_t offset, + rd_kafka_fetch_pos_t pos, rd_kafka_q_t *fwdq, rd_kafka_replyq_t replyq) { int32_t version; @@ -2173,14 +2293,12 @@ rd_kafka_resp_err_t rd_kafka_toppar_op_fetch_start(rd_kafka_toppar_t *rktp, version = rd_kafka_toppar_version_new_barrier(rktp); rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "CONSUMER", - "Start consuming %.*s [%" PRId32 - "] at " - "offset %s (v%" PRId32 ")", + "Start consuming %.*s [%" PRId32 "] at %s (v%" PRId32 ")", RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), - rktp->rktp_partition, rd_kafka_offset2str(offset), + rktp->rktp_partition, rd_kafka_fetch_pos2str(pos), version); - rd_kafka_toppar_op(rktp, RD_KAFKA_OP_FETCH_START, version, offset, + rd_kafka_toppar_op(rktp, RD_KAFKA_OP_FETCH_START, version, pos, rktp->rktp_rkt->rkt_rk->rk_cgrp, replyq); return RD_KAFKA_RESP_ERR_NO_ERROR; @@ -2205,22 +2323,24 @@ rd_kafka_resp_err_t rd_kafka_toppar_op_fetch_stop(rd_kafka_toppar_t *rktp, RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), rktp->rktp_partition, version); - rd_kafka_toppar_op(rktp, RD_KAFKA_OP_FETCH_STOP, version, 0, NULL, - replyq); + rd_kafka_toppar_op(rktp, RD_KAFKA_OP_FETCH_STOP, version, + RD_KAFKA_FETCH_POS(-1, -1), NULL, replyq); return RD_KAFKA_RESP_ERR_NO_ERROR; } /** - * Set/Seek offset of a consumed partition (async operation). - * 'offset' is the target offset - * 'replyq' is an optional queue for handling the ack. + * @brief Set/Seek offset of a consumed partition (async operation). + * + * @param offset is the target offset. + * @param leader_epoch is the partition leader epoch, or -1. + * @param replyq is an optional queue for handling the ack. * * This is the thread-safe interface that can be called from any thread. 
*/ rd_kafka_resp_err_t rd_kafka_toppar_op_seek(rd_kafka_toppar_t *rktp, - int64_t offset, + rd_kafka_fetch_pos_t pos, rd_kafka_replyq_t replyq) { int32_t version; @@ -2228,15 +2348,12 @@ rd_kafka_resp_err_t rd_kafka_toppar_op_seek(rd_kafka_toppar_t *rktp, version = rd_kafka_toppar_version_new_barrier(rktp); rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "CONSUMER", - "Seek %.*s [%" PRId32 - "] to " - "offset %s (v%" PRId32 ")", + "Seek %.*s [%" PRId32 "] to %s (v%" PRId32 ")", RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), - rktp->rktp_partition, rd_kafka_offset2str(offset), + rktp->rktp_partition, rd_kafka_fetch_pos2str(pos), version); - rd_kafka_toppar_op(rktp, RD_KAFKA_OP_SEEK, version, offset, NULL, - replyq); + rd_kafka_toppar_op(rktp, RD_KAFKA_OP_SEEK, version, pos, NULL, replyq); return RD_KAFKA_RESP_ERR_NO_ERROR; } @@ -2256,7 +2373,22 @@ rd_kafka_resp_err_t rd_kafka_toppar_op_pause_resume(rd_kafka_toppar_t *rktp, int flag, rd_kafka_replyq_t replyq) { int32_t version; - rd_kafka_op_t *rko; + rd_kafka_op_t *rko = rd_kafka_op_new(RD_KAFKA_OP_PAUSE); + + if (!pause) { + /* If partitions isn't paused, avoid bumping its version, + * as it'll result in resuming fetches from a stale + * next_fetch_start */ + rd_bool_t is_paused = rd_false; + rd_kafka_toppar_lock(rktp); + is_paused = RD_KAFKA_TOPPAR_IS_PAUSED(rktp); + rd_kafka_toppar_unlock(rktp); + if (!is_paused) { + rko->rko_replyq = replyq; + rd_kafka_op_reply(rko, RD_KAFKA_RESP_ERR_NO_ERROR); + return RD_KAFKA_RESP_ERR_NO_ERROR; + } + } /* Bump version barrier. */ version = rd_kafka_toppar_version_new_barrier(rktp); @@ -2267,7 +2399,6 @@ rd_kafka_resp_err_t rd_kafka_toppar_op_pause_resume(rd_kafka_toppar_t *rktp, RD_KAFKAP_STR_PR(rktp->rktp_rkt->rkt_topic), rktp->rktp_partition, version); - rko = rd_kafka_op_new(RD_KAFKA_OP_PAUSE); rko->rko_version = version; rko->rko_u.pause.pause = pause; rko->rko_u.pause.flag = flag; @@ -2453,7 +2584,8 @@ void rd_kafka_toppar_leader_unavailable(rd_kafka_toppar_t *rktp, rkt->rkt_flags |= RD_KAFKA_TOPIC_F_LEADER_UNAVAIL; rd_kafka_topic_wrunlock(rkt); - rd_kafka_topic_fast_leader_query(rkt->rkt_rk); + rd_kafka_topic_fast_leader_query(rkt->rkt_rk, + rd_false /* don't force */); } @@ -2478,7 +2610,6 @@ void rd_kafka_topic_partition_get(const rd_kafka_topic_partition_t *rktpar, } - /** * * rd_kafka_topic_partition_t lists @@ -2526,7 +2657,17 @@ rd_kafka_topic_partition_list_t *rd_kafka_topic_partition_list_new(int size) { return rktparlist; } +rd_kafka_topic_partition_t * +rd_kafka_topic_partition_new_with_topic_id(rd_kafka_Uuid_t topic_id, + int32_t partition) { + rd_kafka_topic_partition_private_t *parpriv; + rd_kafka_topic_partition_t *rktpar = rd_calloc(1, sizeof(*rktpar)); + rktpar->partition = partition; + parpriv = rd_kafka_topic_partition_get_private(rktpar); + parpriv->topic_id = topic_id; + return rktpar; +} rd_kafka_topic_partition_t *rd_kafka_topic_partition_new(const char *topic, int32_t partition) { @@ -2538,10 +2679,60 @@ rd_kafka_topic_partition_t *rd_kafka_topic_partition_new(const char *topic, return rktpar; } +/** + * @brief Update \p dst with info from \p src. 
+ */ +static void +rd_kafka_topic_partition_update(rd_kafka_topic_partition_t *dst, + const rd_kafka_topic_partition_t *src) { + const rd_kafka_topic_partition_private_t *srcpriv; + rd_kafka_topic_partition_private_t *dstpriv; + + rd_dassert(!strcmp(dst->topic, src->topic)); + rd_dassert(dst->partition == src->partition); + rd_dassert(dst != src); + + dst->offset = src->offset; + dst->opaque = src->opaque; + dst->err = src->err; + + if (src->metadata_size > 0) { + dst->metadata = rd_malloc(src->metadata_size); + dst->metadata_size = src->metadata_size; + memcpy(dst->metadata, src->metadata, dst->metadata_size); + } + + if ((srcpriv = src->_private)) { + dstpriv = rd_kafka_topic_partition_get_private(dst); + if (srcpriv->rktp && !dstpriv->rktp) + dstpriv->rktp = rd_kafka_toppar_keep(srcpriv->rktp); + + rd_assert(dstpriv->rktp == srcpriv->rktp); + + dstpriv->leader_epoch = srcpriv->leader_epoch; + + dstpriv->current_leader_epoch = srcpriv->current_leader_epoch; + + dstpriv->topic_id = srcpriv->topic_id; + + } else if ((dstpriv = dst->_private)) { + /* No private object in source, reset the fields. */ + dstpriv->leader_epoch = -1; + dstpriv->current_leader_epoch = -1; + dstpriv->topic_id = RD_KAFKA_UUID_ZERO; + } +} + rd_kafka_topic_partition_t * rd_kafka_topic_partition_copy(const rd_kafka_topic_partition_t *src) { - return rd_kafka_topic_partition_new(src->topic, src->partition); + rd_kafka_topic_partition_t *dst = + rd_kafka_topic_partition_new(src->topic, src->partition); + + rd_kafka_topic_partition_update(dst, src); + + return dst; } @@ -2561,7 +2752,15 @@ rd_kafka_topic_partition_new_from_rktp(rd_kafka_toppar_t *rktp) { return rktpar; } - +/** + * @brief Destroy a partition private glue object. + */ +static void rd_kafka_topic_partition_private_destroy( + rd_kafka_topic_partition_private_t *parpriv) { + if (parpriv->rktp) + rd_kafka_toppar_destroy(parpriv->rktp); + rd_free(parpriv); +} static void rd_kafka_topic_partition_destroy0(rd_kafka_topic_partition_t *rktpar, @@ -2571,13 +2770,117 @@ rd_kafka_topic_partition_destroy0(rd_kafka_topic_partition_t *rktpar, if (rktpar->metadata) rd_free(rktpar->metadata); if (rktpar->_private) - rd_kafka_toppar_destroy((rd_kafka_toppar_t *)rktpar->_private); + rd_kafka_topic_partition_private_destroy( + (rd_kafka_topic_partition_private_t *)rktpar->_private); if (do_free) rd_free(rktpar); } +int32_t rd_kafka_topic_partition_get_leader_epoch( + const rd_kafka_topic_partition_t *rktpar) { + const rd_kafka_topic_partition_private_t *parpriv; + + if (!(parpriv = rktpar->_private)) + return -1; + + return parpriv->leader_epoch; +} + +void rd_kafka_topic_partition_set_leader_epoch( + rd_kafka_topic_partition_t *rktpar, + int32_t leader_epoch) { + rd_kafka_topic_partition_private_t *parpriv; + + /* Avoid allocating private_t if clearing the epoch */ + if (leader_epoch == -1 && !rktpar->_private) + return; + + parpriv = rd_kafka_topic_partition_get_private(rktpar); + + parpriv->leader_epoch = leader_epoch; +} + +int32_t rd_kafka_topic_partition_get_current_leader_epoch( + const rd_kafka_topic_partition_t *rktpar) { + const rd_kafka_topic_partition_private_t *parpriv; + + if (!(parpriv = rktpar->_private)) + return -1; + + return parpriv->current_leader_epoch; +} + +/** + * @brief Sets topic id for partition \p rktpar. + * + * @param rktpar Topic partition. + * @param topic_id Topic id to set.
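+ *
+ * @remark Allocates the ._private glue object on demand if \p rktpar does
+ *         not already have one.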
+ */ +void rd_kafka_topic_partition_set_topic_id(rd_kafka_topic_partition_t *rktpar, + rd_kafka_Uuid_t topic_id) { + rd_kafka_topic_partition_private_t *parpriv; + parpriv = rd_kafka_topic_partition_get_private(rktpar); + parpriv->topic_id = topic_id; +} + +/** + * @brief Gets topic id from topic-partition \p rktpar. + * + * @param rktpar Topic partition. + * @return Topic id, or RD_KAFKA_UUID_ZERO. + */ +rd_kafka_Uuid_t rd_kafka_topic_partition_get_topic_id( + const rd_kafka_topic_partition_t *rktpar) { + const rd_kafka_topic_partition_private_t *parpriv; + + if (!(parpriv = rktpar->_private)) + return RD_KAFKA_UUID_ZERO; + + return parpriv->topic_id; +} + +void rd_kafka_topic_partition_set_current_leader_epoch( + rd_kafka_topic_partition_t *rktpar, + int32_t current_leader_epoch) { + rd_kafka_topic_partition_private_t *parpriv; + + /* Avoid allocating private_t if clearing the epoch */ + if (current_leader_epoch == -1 && !rktpar->_private) + return; + + parpriv = rd_kafka_topic_partition_get_private(rktpar); + + parpriv->current_leader_epoch = current_leader_epoch; +} + +/** + * @brief Set offset and leader epoch from a fetchpos. + */ +void rd_kafka_topic_partition_set_from_fetch_pos( + rd_kafka_topic_partition_t *rktpar, + const rd_kafka_fetch_pos_t fetchpos) { + rktpar->offset = fetchpos.offset; + rd_kafka_topic_partition_set_leader_epoch(rktpar, + fetchpos.leader_epoch); +} + +/** + * @brief Set partition metadata from rktp stored one. + */ +void rd_kafka_topic_partition_set_metadata_from_rktp_stored( + rd_kafka_topic_partition_t *rktpar, + const rd_kafka_toppar_t *rktp) { + rktpar->metadata_size = rktp->rktp_stored_metadata_size; + if (rktp->rktp_stored_metadata) { + rktpar->metadata = rd_malloc(rktp->rktp_stored_metadata_size); + memcpy(rktpar->metadata, rktp->rktp_stored_metadata, + rktpar->metadata_size); + } +} + + /** * @brief Destroy all partitions in list. * @@ -2630,22 +2933,23 @@ void rd_kafka_topic_partition_list_destroy_free(void *ptr) { (rd_kafka_topic_partition_list_t *)ptr); } - /** - * Add a partition to an rktpar list. + * @brief Add a partition to an rktpar list. * The list must have enough room to fit it. * - * '_private' must be NULL or a valid 'rd_kafka_toppar_t *'. + * @param rktp Optional partition object that will be stored on the + * ._private object (with refcount increased). * - * Returns a pointer to the added element. + * @returns a pointer to the added element. 
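+ *
+ * @remark The returned pointer is only valid until the list is grown or
+ *         destroyed, since elements live inside the list's elems array.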
*/ -rd_kafka_topic_partition_t * -rd_kafka_topic_partition_list_add0(const char *func, - int line, - rd_kafka_topic_partition_list_t *rktparlist, - const char *topic, - int32_t partition, - rd_kafka_toppar_t *_private) { +rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_add0( + const char *func, + int line, + rd_kafka_topic_partition_list_t *rktparlist, + const char *topic, + int32_t partition, + rd_kafka_toppar_t *rktp, + const rd_kafka_topic_partition_private_t *parpriv) { rd_kafka_topic_partition_t *rktpar; if (rktparlist->cnt == rktparlist->size) rd_kafka_topic_partition_list_grow(rktparlist, 1); @@ -2653,12 +2957,27 @@ rd_kafka_topic_partition_list_add0(const char *func, rktpar = &rktparlist->elems[rktparlist->cnt++]; memset(rktpar, 0, sizeof(*rktpar)); - rktpar->topic = rd_strdup(topic); + if (topic) + rktpar->topic = rd_strdup(topic); rktpar->partition = partition; rktpar->offset = RD_KAFKA_OFFSET_INVALID; - rktpar->_private = _private; - if (_private) - rd_kafka_toppar_keep_fl(func, line, _private); + + if (parpriv) { + rd_kafka_topic_partition_private_t *parpriv_copy = + rd_kafka_topic_partition_get_private(rktpar); + if (parpriv->rktp) { + parpriv_copy->rktp = + rd_kafka_toppar_keep_fl(func, line, parpriv->rktp); + } + parpriv_copy->leader_epoch = parpriv->leader_epoch; + parpriv_copy->current_leader_epoch = + parpriv->current_leader_epoch; + parpriv_copy->topic_id = parpriv->topic_id; + } else if (rktp) { + rd_kafka_topic_partition_private_t *parpriv_copy = + rd_kafka_topic_partition_get_private(rktpar); + parpriv_copy->rktp = rd_kafka_toppar_keep_fl(func, line, rktp); + } return rktpar; } @@ -2669,7 +2988,37 @@ rd_kafka_topic_partition_list_add(rd_kafka_topic_partition_list_t *rktparlist, const char *topic, int32_t partition) { return rd_kafka_topic_partition_list_add0( - __FUNCTION__, __LINE__, rktparlist, topic, partition, NULL); + __FUNCTION__, __LINE__, rktparlist, topic, partition, NULL, NULL); +} + + +rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_add_with_topic_id( + rd_kafka_topic_partition_list_t *rktparlist, + rd_kafka_Uuid_t topic_id, + int32_t partition) { + rd_kafka_topic_partition_t *rktpar; + rktpar = rd_kafka_topic_partition_list_add0( + __FUNCTION__, __LINE__, rktparlist, NULL, partition, NULL, NULL); + rd_kafka_topic_partition_private_t *parpriv = + rd_kafka_topic_partition_get_private(rktpar); + parpriv->topic_id = topic_id; + return rktpar; +} + + +rd_kafka_topic_partition_t * +rd_kafka_topic_partition_list_add_with_topic_name_and_id( + rd_kafka_topic_partition_list_t *rktparlist, + rd_kafka_Uuid_t topic_id, + const char *topic, + int32_t partition) { + rd_kafka_topic_partition_t *rktpar; + rktpar = rd_kafka_topic_partition_list_add0( + __FUNCTION__, __LINE__, rktparlist, topic, partition, NULL, NULL); + rd_kafka_topic_partition_private_t *parpriv = + rd_kafka_topic_partition_get_private(rktpar); + parpriv->topic_id = topic_id; + return rktpar; } @@ -2701,40 +3050,24 @@ rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_upsert( } -/** - * @brief Update \p dst with info from \p src. 
- */ -void rd_kafka_topic_partition_update(rd_kafka_topic_partition_t *dst, - const rd_kafka_topic_partition_t *src) { - rd_dassert(!strcmp(dst->topic, src->topic)); - rd_dassert(dst->partition == src->partition); - rd_dassert(dst != src); - - dst->offset = src->offset; - dst->opaque = src->opaque; - dst->err = src->err; - - if (src->metadata_size > 0) { - dst->metadata = rd_malloc(src->metadata_size); - dst->metadata_size = src->metadata_size; - ; - memcpy(dst->metadata, src->metadata, dst->metadata_size); - } -} /** * @brief Creates a copy of \p rktpar and adds it to \p rktparlist + * + * @return Copy of passed partition that was added to the list + * + * @remark Ownership of returned partition remains with the list. */ -void rd_kafka_topic_partition_list_add_copy( +rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_add_copy( rd_kafka_topic_partition_list_t *rktparlist, const rd_kafka_topic_partition_t *rktpar) { rd_kafka_topic_partition_t *dst; dst = rd_kafka_topic_partition_list_add0( __FUNCTION__, __LINE__, rktparlist, rktpar->topic, - rktpar->partition, rktpar->_private); - + rktpar->partition, NULL, rktpar->_private); rd_kafka_topic_partition_update(dst, rktpar); + return dst; } @@ -2829,30 +3162,16 @@ rd_kafka_toppar_t * rd_kafka_topic_partition_ensure_toppar(rd_kafka_t *rk, rd_kafka_topic_partition_t *rktpar, rd_bool_t create_on_miss) { - if (!rktpar->_private) - rktpar->_private = rd_kafka_toppar_get2( - rk, rktpar->topic, rktpar->partition, 0, create_on_miss); - return rktpar->_private; -} + rd_kafka_topic_partition_private_t *parpriv; + parpriv = rd_kafka_topic_partition_get_private(rktpar); -/** - * @returns (and sets if necessary) the \p rktpar's _private / toppar. - * @remark a new reference is returned. - */ -rd_kafka_toppar_t * -rd_kafka_topic_partition_get_toppar(rd_kafka_t *rk, - rd_kafka_topic_partition_t *rktpar, - rd_bool_t create_on_miss) { - rd_kafka_toppar_t *rktp; + if (!parpriv->rktp) + parpriv->rktp = rd_kafka_toppar_get2( + rk, rktpar->topic, rktpar->partition, + 0 /* not ua on miss */, create_on_miss); - rktp = - rd_kafka_topic_partition_ensure_toppar(rk, rktpar, create_on_miss); - - if (rktp) - rd_kafka_toppar_keep(rktp); - - return rktp; + return parpriv->rktp; } @@ -2866,6 +3185,25 @@ int rd_kafka_topic_partition_cmp(const void *_a, const void *_b) { return RD_CMP(a->partition, b->partition); } +/** + * @brief Compare topic partitions \p a and \p b by topic id first + * and then by partition.
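+ *
+ * @remark When the topic ids differ, the boolean short-circuit below
+ *         collapses the rd_kafka_Uuid_cmp() result to 1, so the return
+ *         value signals inequality across topics rather than an ordering.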
+ */ +int rd_kafka_topic_partition_by_id_cmp(const void *_a, const void *_b) { + const rd_kafka_topic_partition_t *a = _a; + const rd_kafka_topic_partition_t *b = _b; + rd_kafka_Uuid_t topic_id_a = rd_kafka_topic_partition_get_topic_id(a); + rd_kafka_Uuid_t topic_id_b = rd_kafka_topic_partition_get_topic_id(b); + int are_topic_ids_different = rd_kafka_Uuid_cmp(topic_id_a, topic_id_b); + return are_topic_ids_different || RD_CMP(a->partition, b->partition); +} + +static int rd_kafka_topic_partition_by_id_cmp_opaque(const void *_a, + const void *_b, + void *opaque) { + return rd_kafka_topic_partition_by_id_cmp(_a, _b); +} + /** @brief Compare only the topic */ int rd_kafka_topic_partition_cmp_topic(const void *_a, const void *_b) { const rd_kafka_topic_partition_t *a = _a; @@ -2873,19 +3211,36 @@ int rd_kafka_topic_partition_cmp_topic(const void *_a, const void *_b) { return strcmp(a->topic, b->topic); } +/** @brief Compare only the topic id */ +int rd_kafka_topic_partition_cmp_topic_id(const void *_a, const void *_b) { + const rd_kafka_topic_partition_t *a = _a; + const rd_kafka_topic_partition_t *b = _b; + return rd_kafka_Uuid_cmp(rd_kafka_topic_partition_get_topic_id(a), + rd_kafka_topic_partition_get_topic_id(b)); +} + static int rd_kafka_topic_partition_cmp_opaque(const void *_a, const void *_b, void *opaque) { return rd_kafka_topic_partition_cmp(_a, _b); } -/** @returns a hash of the topic and partition */ +/** @returns a hash of the topic name and partition */ unsigned int rd_kafka_topic_partition_hash(const void *_a) { const rd_kafka_topic_partition_t *a = _a; int r = 31 * 17 + a->partition; return 31 * r + rd_string_hash(a->topic, -1); } +/** @returns a hash of the topic id and partition */ +unsigned int rd_kafka_topic_partition_hash_by_id(const void *_a) { + const rd_kafka_topic_partition_t *a = _a; + const rd_kafka_Uuid_t topic_id = + rd_kafka_topic_partition_get_topic_id(a); + int r = 31 * 17 + a->partition; + return 31 * r + rd_kafka_Uuid_hash(&topic_id); +} + /** @@ -2911,6 +3266,31 @@ static int rd_kafka_topic_partition_list_find0( return -1; } +/** + * @brief Search 'rktparlist' for \p topic_id and \p partition with comparator + * \p cmp. + * @returns the elems[] index or -1 on miss. + */ +static int rd_kafka_topic_partition_list_find_by_id0( + const rd_kafka_topic_partition_list_t *rktparlist, + rd_kafka_Uuid_t topic_id, + int32_t partition, + int (*cmp)(const void *, const void *)) { + int i, ret = -1; + rd_kafka_topic_partition_t *rktpar = + rd_kafka_topic_partition_new_with_topic_id(topic_id, partition); + + for (i = 0; i < rktparlist->cnt; i++) { + if (!cmp(rktpar, &rktparlist->elems[i])) { + ret = i; + break; + } + } + + rd_kafka_topic_partition_destroy(rktpar); + return ret; +} + rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_find( const rd_kafka_topic_partition_list_t *rktparlist, const char *topic, @@ -2923,6 +3303,22 @@ rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_find( return &rktparlist->elems[i]; } +/** + * @brief Search 'rktparlist' for 'topic_id' and 'partition'. + * @returns Found topic partition or NULL. 
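+ *
+ * @remark Linear scan: a temporary key partition is allocated for the
+ *         comparison and destroyed again by the search helper.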
+ */ +rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_find_by_id( + const rd_kafka_topic_partition_list_t *rktparlist, + rd_kafka_Uuid_t topic_id, + int32_t partition) { + int i = rd_kafka_topic_partition_list_find_by_id0( + rktparlist, topic_id, partition, + rd_kafka_topic_partition_by_id_cmp); + if (i == -1) + return NULL; + else + return &rktparlist->elems[i]; +} int rd_kafka_topic_partition_list_find_idx( const rd_kafka_topic_partition_list_t *rktparlist, @@ -2932,11 +3328,24 @@ int rd_kafka_topic_partition_list_find_idx( rktparlist, topic, partition, rd_kafka_topic_partition_cmp); } +/** + * @brief Search 'rktparlist' for \p topic_id and \p partition. + * @returns the elems[] index or -1 on miss. + */ +int rd_kafka_topic_partition_list_find_idx_by_id( + const rd_kafka_topic_partition_list_t *rktparlist, + rd_kafka_Uuid_t topic_id, + int32_t partition) { + return rd_kafka_topic_partition_list_find_by_id0( + rktparlist, topic_id, partition, + rd_kafka_topic_partition_by_id_cmp); +} + /** * @returns the first element that matches \p topic, regardless of partition. */ -rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_find_topic( +rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_find_topic_by_name( const rd_kafka_topic_partition_list_t *rktparlist, const char *topic) { int i = rd_kafka_topic_partition_list_find0( @@ -2948,6 +3357,21 @@ rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_find_topic( return &rktparlist->elems[i]; } +/** + * @returns the first element that matches \p topic_id, regardless of partition. + */ +rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_find_topic_by_id( + const rd_kafka_topic_partition_list_t *rktparlist, + const rd_kafka_Uuid_t topic_id) { + int i = rd_kafka_topic_partition_list_find_by_id0( + rktparlist, topic_id, RD_KAFKA_PARTITION_UA, + rd_kafka_topic_partition_cmp_topic_id); + if (i == -1) + return NULL; + else + return &rktparlist->elems[i]; +} + int rd_kafka_topic_partition_list_del_by_idx( rd_kafka_topic_partition_list_t *rktparlist, @@ -3038,6 +3462,12 @@ void rd_kafka_topic_partition_list_sort_by_topic( rktparlist, rd_kafka_topic_partition_cmp_opaque, NULL); } +void rd_kafka_topic_partition_list_sort_by_topic_id( + rd_kafka_topic_partition_list_t *rktparlist) { + rd_kafka_topic_partition_list_sort( + rktparlist, rd_kafka_topic_partition_by_id_cmp_opaque, NULL); +} + rd_kafka_resp_err_t rd_kafka_topic_partition_list_set_offset( rd_kafka_topic_partition_list_t *rktparlist, const char *topic, @@ -3089,26 +3519,33 @@ int rd_kafka_topic_partition_list_set_offsets( for (i = 0; i < rktparlist->cnt; i++) { rd_kafka_topic_partition_t *rktpar = &rktparlist->elems[i]; const char *verb = "setting"; - char preamble[80]; + char preamble[128]; *preamble = '\0'; /* Avoid warning */ if (from_rktp) { - rd_kafka_toppar_t *rktp = rktpar->_private; + rd_kafka_toppar_t *rktp = + rd_kafka_topic_partition_ensure_toppar(rk, rktpar, + rd_true); rd_kafka_toppar_lock(rktp); if (rk->rk_conf.debug & (RD_KAFKA_DBG_CGRP | RD_KAFKA_DBG_TOPIC)) rd_snprintf(preamble, sizeof(preamble), - "stored offset %" PRId64 - ", committed offset %" PRId64 ": ", - rktp->rktp_stored_offset, - rktp->rktp_committed_offset); + "stored %s, committed %s: ", + rd_kafka_fetch_pos2str( + rktp->rktp_stored_pos), + rd_kafka_fetch_pos2str( + rktp->rktp_committed_pos)); - if (rktp->rktp_stored_offset > - rktp->rktp_committed_offset) { - verb = "setting stored"; - rktpar->offset = rktp->rktp_stored_offset; + if (rd_kafka_fetch_pos_cmp(&rktp->rktp_stored_pos, + 
&rktp->rktp_committed_pos) > + 0) { + verb = "setting stored"; + rd_kafka_topic_partition_set_from_fetch_pos( + rktpar, rktp->rktp_stored_pos); + rd_kafka_topic_partition_set_metadata_from_rktp_stored( + rktpar, rktp); } else { rktpar->offset = RD_KAFKA_OFFSET_INVALID; } @@ -3117,6 +3554,8 @@ int rd_kafka_topic_partition_list_set_offsets( if (RD_KAFKA_OFFSET_IS_LOGICAL(rktpar->offset)) { verb = "setting default"; rktpar->offset = def_value; + rd_kafka_topic_partition_set_leader_epoch( + rktpar, -1); } else verb = "keeping"; } @@ -3129,13 +3568,15 @@ int rd_kafka_topic_partition_list_set_offsets( rktpar->topic, rktpar->partition, preamble); else - rd_kafka_dbg(rk, CGRP | RD_KAFKA_DBG_TOPIC, "OFFSET", - "Topic %s [%" PRId32 - "]: " - "%s%s offset %s%s", - rktpar->topic, rktpar->partition, preamble, - verb, rd_kafka_offset2str(rktpar->offset), - is_commit ? " for commit" : ""); + rd_kafka_dbg( + rk, CGRP | RD_KAFKA_DBG_TOPIC, "OFFSET", + "Topic %s [%" PRId32 + "]: " + "%s%s offset %s (leader epoch %" PRId32 ") %s", + rktpar->topic, rktpar->partition, preamble, verb, + rd_kafka_offset2str(rktpar->offset), + rd_kafka_topic_partition_get_leader_epoch(rktpar), + is_commit ? " for commit" : ""); if (!RD_KAFKA_OFFSET_IS_LOGICAL(rktpar->offset)) valid_cnt++; @@ -3175,10 +3616,8 @@ void rd_kafka_topic_partition_list_update_toppars( for (i = 0; i < rktparlist->cnt; i++) { rd_kafka_topic_partition_t *rktpar = &rktparlist->elems[i]; - if (!rktpar->_private) - rktpar->_private = rd_kafka_toppar_get2( - rk, rktpar->topic, rktpar->partition, - 0 /*not ua-on-miss*/, create_on_miss); + rd_kafka_topic_partition_ensure_toppar(rk, rktpar, + create_on_miss); } } @@ -3238,11 +3677,12 @@ static rd_bool_t rd_kafka_topic_partition_list_get_leaders( struct rd_kafka_partition_leader *leader; const rd_kafka_metadata_topic_t *mtopic; const rd_kafka_metadata_partition_t *mpart; + const rd_kafka_metadata_partition_internal_t *mdpi; rd_bool_t topic_wait_cache; rd_kafka_metadata_cache_topic_partition_get( - rk, &mtopic, &mpart, rktpar->topic, rktpar->partition, - 0 /*negative entries too*/); + rk, &mtopic, &mpart, &mdpi, rktpar->topic, + rktpar->partition, 0 /*negative entries too*/); topic_wait_cache = !mtopic || @@ -3310,9 +3750,11 @@ static rd_bool_t rd_kafka_topic_partition_list_get_leaders( rd_kafka_topic_partition_update(rktpar2, rktpar); } else { /* Make a copy of rktpar and add to partitions list */ - rd_kafka_topic_partition_list_add_copy( + rktpar2 = rd_kafka_topic_partition_list_add_copy( leader->partitions, rktpar); } + rd_kafka_topic_partition_set_current_leader_epoch( + rktpar2, mdpi->leader_epoch); rktpar->err = RD_KAFKA_RESP_ERR_NO_ERROR; @@ -3436,7 +3878,7 @@ rd_kafka_topic_partition_list_query_leaders_async_worker(rd_kafka_op_t *rko) { rd_kafka_metadata_refresh_topics( rk, NULL, &query_topics, rd_true /*force*/, rd_false /*!allow_auto_create*/, rd_false /*!cgrp_update*/, - "query partition leaders"); + -1, "query partition leaders"); } rd_list_destroy(leaders); @@ -3625,7 +4067,7 @@ rd_kafka_resp_err_t rd_kafka_topic_partition_list_query_leaders( rd_kafka_metadata_refresh_topics( rk, NULL, &query_topics, rd_true /*force*/, rd_false /*!allow_auto_create*/, - rd_false /*!cgrp_update*/, + rd_false /*!cgrp_update*/, -1, "query partition leaders"); ts_query = now; query_cnt++; @@ -3784,11 +4226,16 @@ const char *rd_kafka_topic_partition_list_str( int i; size_t of = 0; + if (!rktparlist->cnt) + dest[0] = '\0'; for (i = 0; i < rktparlist->cnt; i++) { const rd_kafka_topic_partition_t *rktpar = 
&rktparlist->elems[i]; char errstr[128]; char offsetstr[32]; + const char *topic_id_str = NULL; + const rd_kafka_Uuid_t topic_id = + rd_kafka_topic_partition_get_topic_id(rktpar); int r; if (!rktpar->err && (fmt_flags & RD_KAFKA_FMT_F_ONLY_ERR)) @@ -3806,14 +4253,19 @@ const char *rd_kafka_topic_partition_list_str( else offsetstr[0] = '\0'; + + if (!RD_KAFKA_UUID_IS_ZERO(topic_id)) + topic_id_str = rd_kafka_Uuid_base64str(&topic_id); + r = rd_snprintf(&dest[of], dest_size - of, "%s" - "%s[%" PRId32 + "%s(%s)[%" PRId32 "]" "%s" "%s", of == 0 ? "" : ", ", rktpar->topic, - rktpar->partition, offsetstr, errstr); + topic_id_str, rktpar->partition, offsetstr, + errstr); if ((size_t)r >= dest_size - of) { rd_snprintf(&dest[dest_size - 4], 4, "..."); @@ -3835,6 +4287,7 @@ const char *rd_kafka_topic_partition_list_str( * - metadata * - metadata_size * - offset + * - offset leader epoch * - err * * Will only update partitions that are in both dst and src, other partitions @@ -3848,6 +4301,7 @@ void rd_kafka_topic_partition_list_update( for (i = 0; i < dst->cnt; i++) { rd_kafka_topic_partition_t *d = &dst->elems[i]; rd_kafka_topic_partition_t *s; + rd_kafka_topic_partition_private_t *s_priv, *d_priv; if (!(s = rd_kafka_topic_partition_list_find( (rd_kafka_topic_partition_list_t *)src, d->topic, @@ -3867,6 +4321,12 @@ void rd_kafka_topic_partition_list_update( memcpy((void *)d->metadata, s->metadata, s->metadata_size); } + + s_priv = rd_kafka_topic_partition_get_private(s); + d_priv = rd_kafka_topic_partition_get_private(d); + d_priv->leader_epoch = s_priv->leader_epoch; + d_priv->current_leader_epoch = s_priv->current_leader_epoch; + d_priv->topic_id = s_priv->topic_id; } } @@ -3967,6 +4427,89 @@ int rd_kafka_topic_partition_list_regex_cnt( } +/** + * @brief Match function that returns true if topic is not a regex. + */ +static int rd_kafka_topic_partition_not_regex(const void *elem, + const void *opaque) { + const rd_kafka_topic_partition_t *rktpar = elem; + return *rktpar->topic != '^'; +} + +/** + * @brief Return a new list with all regex topics removed. + * + * @remark The caller is responsible for freeing the returned list. + */ +rd_kafka_topic_partition_list_t *rd_kafka_topic_partition_list_remove_regexes( + const rd_kafka_topic_partition_list_t *rktparlist) { + return rd_kafka_topic_partition_list_match( + rktparlist, rd_kafka_topic_partition_not_regex, NULL); +} + + +/** + * @brief Combine regexes present in the list into a single regex. 
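+ *
+ * Plain (non-regex) topic entries are skipped; each regex is wrapped in
+ * parentheses and the groups are joined with '|'. For example (editor's
+ * illustration), the list {"^foo.*", "bar", "^baz[0-9]+"} yields the
+ * combined pattern "(^foo.*)|(^baz[0-9]+)".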
+ */ +rd_kafkap_str_t *rd_kafka_topic_partition_list_combine_regexes( + const rd_kafka_topic_partition_list_t *rktparlist) { + int i; + int combined_regex_len = 1; /* 1 for null-terminator */ + int regex_cnt = 0; + int j = 1; + rd_bool_t is_first_regex = rd_true; + char *combined_regex_str; + rd_kafkap_str_t *combined_regex_kstr; + + // Count the number of characters needed for the combined regex string + for (i = 0; i < rktparlist->cnt; i++) { + const rd_kafka_topic_partition_t *rktpar = + &(rktparlist->elems[i]); + if (*rktpar->topic == '^') { + combined_regex_len += strlen(rktpar->topic); + regex_cnt++; + } + } + + if (regex_cnt == 0) + return rd_kafkap_str_new("", 0); + + combined_regex_len += + 3 * (regex_cnt - 1); /* 3 chars for each ')|(' separator */ + combined_regex_len += 2; /* 2 for the enclosing parentheses */ + + // memory allocation for the combined regex string + combined_regex_str = rd_malloc(combined_regex_len); + + // Construct the combined regex string + combined_regex_str[0] = '('; + for (i = 0; i < rktparlist->cnt; i++) { + const rd_kafka_topic_partition_t *rktpar = + &(rktparlist->elems[i]); + char *topic = rktpar->topic; + if (*topic == '^') { + if (!is_first_regex) { + combined_regex_str[j++] = ')'; + combined_regex_str[j++] = '|'; + combined_regex_str[j++] = '('; + } + while (*topic) { + combined_regex_str[j++] = *topic; + topic++; + } + is_first_regex = rd_false; + } + } + combined_regex_str[j++] = ')'; + combined_regex_str[j] = '\0'; + + combined_regex_kstr = + rd_kafkap_str_new(combined_regex_str, combined_regex_len - 1); + rd_free(combined_regex_str); + return combined_regex_kstr; +} + + /** * @brief Reset base sequence for this toppar. * @@ -4173,3 +4716,175 @@ void rd_kafka_partition_leader_destroy_free(void *ptr) { struct rd_kafka_partition_leader *leader = ptr; rd_kafka_partition_leader_destroy(leader); } + + +const char *rd_kafka_fetch_pos2str(const rd_kafka_fetch_pos_t fetchpos) { + static RD_TLS char ret[2][64]; + static int idx; + + idx = (idx + 1) % 2; + + rd_snprintf( + ret[idx], sizeof(ret[idx]), "offset %s (leader epoch %" PRId32 ")", + rd_kafka_offset2str(fetchpos.offset), fetchpos.leader_epoch); + + return ret[idx]; +} + +typedef RD_MAP_TYPE(const rd_kafka_topic_partition_t *, + void *) map_toppar_void_t; + +/** + * @brief Calculates \p a ∩ \p b using \p cmp and \p hash . + * Ordered following \p a order. Elements are copied from \p a. + */ +static rd_kafka_topic_partition_list_t * +rd_kafka_topic_partition_list_intersection0( + rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b, + int(cmp)(const void *_a, const void *_b), + unsigned int(hash)(const void *_a)) { + rd_kafka_topic_partition_t *rktpar; + rd_kafka_topic_partition_list_t *ret = + rd_kafka_topic_partition_list_new(a->cnt < b->cnt ? a->cnt + : b->cnt); + map_toppar_void_t b_map = + RD_MAP_INITIALIZER(b->cnt, cmp, hash, NULL, NULL); + RD_KAFKA_TPLIST_FOREACH(rktpar, b) { + RD_MAP_SET(&b_map, rktpar, rktpar); + } + RD_KAFKA_TPLIST_FOREACH(rktpar, a) { + if ((RD_MAP_GET(&b_map, rktpar) != NULL) == 1) { + rd_kafka_topic_partition_list_add_copy(ret, rktpar); + } + } + RD_MAP_DESTROY(&b_map); + return ret; +} + +/** + * @brief Calculates \p a - \p b using \p cmp and \p hash . + * Ordered following \p a order. Elements are copied from \p a.
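+ *
+ * Builds a hash map over \p b (O(|b|)) and then scans \p a once (O(|a|)),
+ * copying every element of \p a that has no match in \p b.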
+ */ +static rd_kafka_topic_partition_list_t * +rd_kafka_topic_partition_list_difference0(rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b, + int(cmp)(const void *_a, + const void *_b), + unsigned int(hash)(const void *_a)) { + rd_kafka_topic_partition_t *rktpar; + rd_kafka_topic_partition_list_t *ret = + rd_kafka_topic_partition_list_new(a->cnt); + map_toppar_void_t b_map = + RD_MAP_INITIALIZER(b->cnt, cmp, hash, NULL, NULL); + RD_KAFKA_TPLIST_FOREACH(rktpar, b) { + RD_MAP_SET(&b_map, rktpar, rktpar); + } + RD_KAFKA_TPLIST_FOREACH(rktpar, a) { + if ((RD_MAP_GET(&b_map, rktpar) != NULL) == 0) { + rd_kafka_topic_partition_list_add_copy(ret, rktpar); + } + } + RD_MAP_DESTROY(&b_map); + return ret; +} + +/** + * @brief Calculates \p a ∪ \p b using \p cmp and \p hash . + * Ordered following \p a order for elements in \p a + * and \p b order for elements only in \p b. + * Elements are copied the same way. + */ +static rd_kafka_topic_partition_list_t * +rd_kafka_topic_partition_list_union0(rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b, + int(cmp)(const void *_a, const void *_b), + unsigned int(hash)(const void *_a)) { + + rd_kafka_topic_partition_list_t *b_minus_a = + rd_kafka_topic_partition_list_difference0(b, a, cmp, hash); + rd_kafka_topic_partition_list_t *ret = + rd_kafka_topic_partition_list_new(a->cnt + b_minus_a->cnt); + + rd_kafka_topic_partition_list_add_list(ret, a); + rd_kafka_topic_partition_list_add_list(ret, b_minus_a); + + rd_kafka_topic_partition_list_destroy(b_minus_a); + return ret; +} + +/** + * @brief Calculates \p a ∩ \p b using topic name and partition id. + * Ordered following \p a order. Elements are copied from \p a. + */ +rd_kafka_topic_partition_list_t * +rd_kafka_topic_partition_list_intersection_by_name( + rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b) { + return rd_kafka_topic_partition_list_intersection0( + a, b, rd_kafka_topic_partition_cmp, rd_kafka_topic_partition_hash); +} + +/** + * @brief Calculates \p a - \p b using topic name and partition id. + * Ordered following \p a order. Elements are copied from \p a. + */ +rd_kafka_topic_partition_list_t * +rd_kafka_topic_partition_list_difference_by_name( + rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b) { + return rd_kafka_topic_partition_list_difference0( + a, b, rd_kafka_topic_partition_cmp, rd_kafka_topic_partition_hash); +} + +/** + * @brief Calculates \p a ∪ \p b using topic name and partition id. + * Ordered following \p a order for elements in \p a + * and \p b order for elements only in \p b. + * Elements are copied the same way. + */ +rd_kafka_topic_partition_list_t *rd_kafka_topic_partition_list_union_by_name( + rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b) { + return rd_kafka_topic_partition_list_union0( + a, b, rd_kafka_topic_partition_cmp, rd_kafka_topic_partition_hash); +} + +/** + * @brief Calculates \p a ∩ \p b using topic id and partition id. + * Ordered following \p a order. Elements are copied from \p a. + */ +rd_kafka_topic_partition_list_t * +rd_kafka_topic_partition_list_intersection_by_id( + rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b) { + return rd_kafka_topic_partition_list_intersection0( + a, b, rd_kafka_topic_partition_by_id_cmp, + rd_kafka_topic_partition_hash_by_id); +} + +/** + * @brief Calculates \p a - \p b using topic id and partition id. + * Ordered following \p a order. Elements are copied from \p a. 
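+ *
+ * @remark Elements without a topic id compare as RD_KAFKA_UUID_ZERO, so
+ *         two such elements with the same partition are considered equal.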
+ */ +rd_kafka_topic_partition_list_t *rd_kafka_topic_partition_list_difference_by_id( + rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b) { + return rd_kafka_topic_partition_list_difference0( + a, b, rd_kafka_topic_partition_by_id_cmp, + rd_kafka_topic_partition_hash_by_id); +} + +/** + * @brief Calculates \p a ∪ \p b using topic id and partition id. + * Ordered following \p a order for elements in \p a + * and \p b order for elements only in \p b. + * Elements are copied the same way. + */ +rd_kafka_topic_partition_list_t * +rd_kafka_topic_partition_list_union_by_id(rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b) { + return rd_kafka_topic_partition_list_union0( + a, b, rd_kafka_topic_partition_by_id_cmp, + rd_kafka_topic_partition_hash_by_id); +} diff --git a/src/third_party/librdkafka/dist/src/rdkafka_partition.h b/src/third_party/librdkafka/dist/src/rdkafka_partition.h index e869820ef8f..97a704f03b9 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_partition.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_partition.h @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2015 Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill, + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -39,16 +40,17 @@ extern const char *rd_kafka_fetch_states[]; * @brief Offset statistics */ struct offset_stats { - int64_t fetch_offset; /**< Next offset to fetch */ - int64_t eof_offset; /**< Last offset we reported EOF for */ + rd_kafka_fetch_pos_t fetch_pos; /**< Next offset to fetch */ + int64_t eof_offset; /**< Last offset we reported EOF for */ }; /** * @brief Reset offset_stats struct to default values */ static RD_UNUSED void rd_kafka_offset_stats_reset(struct offset_stats *offs) { - offs->fetch_offset = 0; - offs->eof_offset = RD_KAFKA_OFFSET_INVALID; + offs->fetch_pos.offset = 0; + offs->fetch_pos.leader_epoch = -1; + offs->eof_offset = RD_KAFKA_OFFSET_INVALID; } @@ -66,6 +68,65 @@ struct rd_kafka_toppar_err { * last msg sequence */ }; +/** + * @brief Fetchpos comparator, only offset is compared. + */ +static RD_UNUSED RD_INLINE int +rd_kafka_fetch_pos_cmp_offset(const rd_kafka_fetch_pos_t *a, + const rd_kafka_fetch_pos_t *b) { + return (RD_CMP(a->offset, b->offset)); +} + +/** + * @brief Fetchpos comparator, leader epoch has precedence + * iff both values are not -1.
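+ *
+ * Example (editor's illustration): {offset=5, epoch=2} sorts after
+ * {offset=9, epoch=1} because the epoch takes precedence, while
+ * {offset=5, epoch=-1} vs {offset=9, epoch=3} falls back to comparing
+ * offsets only.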
+ */ +static RD_UNUSED RD_INLINE int +rd_kafka_fetch_pos_cmp(const rd_kafka_fetch_pos_t *a, + const rd_kafka_fetch_pos_t *b) { + if (a->leader_epoch == -1 || b->leader_epoch == -1) + return rd_kafka_fetch_pos_cmp_offset(a, b); + if (a->leader_epoch < b->leader_epoch) + return -1; + else if (a->leader_epoch > b->leader_epoch) + return 1; + else + return rd_kafka_fetch_pos_cmp_offset(a, b); +} + + +static RD_UNUSED RD_INLINE void +rd_kafka_fetch_pos_init(rd_kafka_fetch_pos_t *fetchpos) { + fetchpos->offset = RD_KAFKA_OFFSET_INVALID; + fetchpos->leader_epoch = -1; +} + +const char *rd_kafka_fetch_pos2str(const rd_kafka_fetch_pos_t fetchpos); + +static RD_UNUSED RD_INLINE rd_kafka_fetch_pos_t +rd_kafka_fetch_pos_make(int64_t offset, + int32_t leader_epoch, + rd_bool_t validated) { + rd_kafka_fetch_pos_t fetchpos = {offset, leader_epoch, validated}; + return fetchpos; +} + +#ifdef RD_HAS_STATEMENT_EXPRESSIONS +#define RD_KAFKA_FETCH_POS0(offset, leader_epoch, validated) \ + ({ \ + rd_kafka_fetch_pos_t _fetchpos = {offset, leader_epoch, \ + validated}; \ + _fetchpos; \ + }) +#else +#define RD_KAFKA_FETCH_POS0(offset, leader_epoch, validated) \ + rd_kafka_fetch_pos_make(offset, leader_epoch, validated) +#endif + +#define RD_KAFKA_FETCH_POS(offset, leader_epoch) \ + RD_KAFKA_FETCH_POS0(offset, leader_epoch, rd_false) + + typedef TAILQ_HEAD(rd_kafka_toppar_tqhead_s, rd_kafka_toppar_s) rd_kafka_toppar_tqhead_t; @@ -231,17 +292,22 @@ struct rd_kafka_toppar_s { /* rd_kafka_toppar_t */ int32_t rktp_fetch_version; /* Op version of curr fetch. (broker thread) */ - enum { RD_KAFKA_TOPPAR_FETCH_NONE = 0, - RD_KAFKA_TOPPAR_FETCH_STOPPING, - RD_KAFKA_TOPPAR_FETCH_STOPPED, - RD_KAFKA_TOPPAR_FETCH_OFFSET_QUERY, - RD_KAFKA_TOPPAR_FETCH_OFFSET_WAIT, - RD_KAFKA_TOPPAR_FETCH_ACTIVE, + enum { + RD_KAFKA_TOPPAR_FETCH_NONE = 0, + RD_KAFKA_TOPPAR_FETCH_STOPPING, + RD_KAFKA_TOPPAR_FETCH_STOPPED, + RD_KAFKA_TOPPAR_FETCH_OFFSET_QUERY, + RD_KAFKA_TOPPAR_FETCH_OFFSET_WAIT, + RD_KAFKA_TOPPAR_FETCH_VALIDATE_EPOCH_WAIT, + RD_KAFKA_TOPPAR_FETCH_ACTIVE, } rktp_fetch_state; /* Broker thread's state */ #define RD_KAFKA_TOPPAR_FETCH_IS_STARTED(fetch_state) \ ((fetch_state) >= RD_KAFKA_TOPPAR_FETCH_OFFSET_QUERY) + int32_t rktp_leader_epoch; /**< Last known partition leader epoch, + * or -1. */ + int32_t rktp_fetch_msg_max_bytes; /* Max number of bytes to * fetch. * Locality: broker thread @@ -252,25 +318,46 @@ struct rd_kafka_toppar_s { /* rd_kafka_toppar_t */ * absolute timestamp * expires. */ - int64_t rktp_query_offset; /* Offset to query broker for*/ - int64_t rktp_next_offset; /* Next offset to start - * fetching from. - * Locality: toppar thread */ - int64_t rktp_last_next_offset; /* Last next_offset handled - * by fetch_decide(). - * Locality: broker thread */ - int64_t rktp_app_offset; /* Last offset delivered to - * application + 1. - * Is reset to INVALID_OFFSET - * when partition is - * unassigned/stopped/seeked. */ - int64_t rktp_stored_offset; /* Last stored offset, but - * maybe not committed yet. */ - int64_t rktp_committing_offset; /* Offset currently being - * committed */ - int64_t rktp_committed_offset; /* Last committed offset */ - rd_ts_t rktp_ts_committed_offset; /* Timestamp of last - * commit */ + /** Offset to query broker for. */ + rd_kafka_fetch_pos_t rktp_query_pos; + + /** Next fetch start position. + * This is set at start, seek, resume, etc., to tell + * the fetcher where to start fetching. + * It is not updated for each fetch, see + * rktp_offsets.fetch_pos for that.
+ * @locality toppar thread */ + rd_kafka_fetch_pos_t rktp_next_fetch_start; + + /** The previous next fetch position. + * @locality toppar thread */ + rd_kafka_fetch_pos_t rktp_last_next_fetch_start; + + /** The offset to verify. + * @locality toppar thread */ + rd_kafka_fetch_pos_t rktp_offset_validation_pos; + + /** Application's position. + * This is the latest offset delivered to application + 1. + * It is reset to INVALID_OFFSET when partition is + * unassigned/stopped/seeked. */ + rd_kafka_fetch_pos_t rktp_app_pos; + + /** Last stored offset, but maybe not yet committed. */ + rd_kafka_fetch_pos_t rktp_stored_pos; + + /* Last stored metadata, but + * maybe not committed yet. */ + void *rktp_stored_metadata; + size_t rktp_stored_metadata_size; + + /** Offset currently being committed */ + rd_kafka_fetch_pos_t rktp_committing_pos; + + /** Last (known) committed offset */ + rd_kafka_fetch_pos_t rktp_committed_pos; + + rd_ts_t rktp_ts_committed_offset; /**< Timestamp of last commit */ struct offset_stats rktp_offsets; /* Current offsets. * Locality: broker thread*/ @@ -347,6 +434,8 @@ struct rd_kafka_toppar_s { /* rd_kafka_toppar_t */ #define RD_KAFKA_TOPPAR_F_ASSIGNED \ 0x2000 /**< Toppar is part of the consumer \ * assignment. */ +#define RD_KAFKA_TOPPAR_F_VALIDATING \ + 0x4000 /**< Toppar is currently requesting validation. */ /* * Timers @@ -356,6 +445,8 @@ struct rd_kafka_toppar_s { /* rd_kafka_toppar_t */ rd_kafka_timer_t rktp_offset_sync_tmr; /* Offset file sync timer */ rd_kafka_timer_t rktp_consumer_lag_tmr; /* Consumer lag monitoring * timer */ + rd_kafka_timer_t rktp_validate_tmr; /**< Offset and epoch + * validation retry timer */ rd_interval_t rktp_lease_intvl; /**< Preferred replica lease * period */ @@ -389,6 +480,28 @@ struct rd_kafka_toppar_s { /* rd_kafka_toppar_t */ } rktp_c; }; +/** + * @struct This is a separately allocated glue object used in + * rd_kafka_topic_partition_t._private to allow referencing both + * an rktp and/or a leader epoch. Both are optional. + * The rktp, if non-NULL, owns a refcount. + * + * This glue object is not always set in ._private, but allocated on demand + * as necessary. + */ +typedef struct rd_kafka_topic_partition_private_s { + /** Reference to a toppar. Optional, may be NULL. */ + rd_kafka_toppar_t *rktp; + /** Current Leader epoch, if known, else -1. + * this is set when the API needs to send the last epoch known + * by the client. */ + int32_t current_leader_epoch; + /** Leader epoch if known, else -1. */ + int32_t leader_epoch; + /** Topic id. */ + rd_kafka_Uuid_t topic_id; +} rd_kafka_topic_partition_private_t; + /** * Check if toppar is paused (consumer). 
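/* Editor's note: the rd_kafka_topic_partition_private_t glue above is
 * allocated on demand and initialized with "unknown" sentinels (-1 epochs,
 * zero UUID). A minimal standalone sketch of that lazy-allocation pattern,
 * using hypothetical example_* stand-in types rather than the real
 * librdkafka ones, looks like this: */

#include <stdint.h>
#include <stdlib.h>

typedef struct example_uuid_s { /* stand-in for rd_kafka_Uuid_t */
        int64_t most_significant_bits;
        int64_t least_significant_bits;
} example_uuid_t;

typedef struct example_private_s { /* stand-in for the ._private glue */
        int32_t leader_epoch;         /* -1 = unknown */
        int32_t current_leader_epoch; /* -1 = unknown */
        example_uuid_t topic_id;      /* all-zero = unknown */
} example_private_t;

typedef struct example_partition_s {
        int32_t partition;
        void *_private; /* lazily allocated example_private_t */
} example_partition_t;

/* Mirrors rd_kafka_topic_partition_get_private(): allocate the glue on
 * first use so cheap partition objects pay nothing until an epoch, toppar
 * reference or topic id actually needs to be attached. */
static example_private_t *example_get_private(example_partition_t *p) {
        example_private_t *priv = p->_private;
        if (!priv) {
                if (!(priv = calloc(1, sizeof(*priv))))
                        abort(); /* the real rd_calloc() aborts on OOM too */
                priv->leader_epoch = -1;
                priv->current_leader_epoch = -1;
                p->_private = priv;
        }
        return priv;
}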
@@ -458,7 +571,10 @@ int rd_kafka_retry_msgq(rd_kafka_msgq_t *destq, int max_retries, rd_ts_t backoff, rd_kafka_msg_status_t status, - int (*cmp)(const void *a, const void *b)); + int (*cmp)(const void *a, const void *b), + rd_bool_t exponential_backoff, + int retry_ms, + int retry_max_ms); void rd_kafka_msgq_insert_msgq(rd_kafka_msgq_t *destq, rd_kafka_msgq_t *srcq, int (*cmp)(const void *a, const void *b)); @@ -498,14 +614,14 @@ void rd_kafka_toppar_desired_unlink(rd_kafka_toppar_t *rktp); void rd_kafka_toppar_desired_del(rd_kafka_toppar_t *rktp); void rd_kafka_toppar_next_offset_handle(rd_kafka_toppar_t *rktp, - int64_t Offset); + rd_kafka_fetch_pos_t next_pos); void rd_kafka_toppar_broker_delegate(rd_kafka_toppar_t *rktp, rd_kafka_broker_t *rkb); rd_kafka_resp_err_t rd_kafka_toppar_op_fetch_start(rd_kafka_toppar_t *rktp, - int64_t offset, + rd_kafka_fetch_pos_t pos, rd_kafka_q_t *fwdq, rd_kafka_replyq_t replyq); @@ -513,7 +629,7 @@ rd_kafka_resp_err_t rd_kafka_toppar_op_fetch_stop(rd_kafka_toppar_t *rktp, rd_kafka_replyq_t replyq); rd_kafka_resp_err_t rd_kafka_toppar_op_seek(rd_kafka_toppar_t *rktp, - int64_t offset, + rd_kafka_fetch_pos_t pos, rd_kafka_replyq_t replyq); rd_kafka_resp_err_t @@ -532,9 +648,11 @@ void rd_kafka_toppar_offset_fetch(rd_kafka_toppar_t *rktp, rd_kafka_replyq_t replyq); void rd_kafka_toppar_offset_request(rd_kafka_toppar_t *rktp, - int64_t query_offset, + rd_kafka_fetch_pos_t query_pos, int backoff_ms); +void rd_kafka_toppar_purge_internal_fetch_queue_maybe(rd_kafka_toppar_t *rktp); + int rd_kafka_toppar_purge_queues(rd_kafka_toppar_t *rktp, int purge_flags, rd_bool_t include_xmit_msgq); @@ -569,6 +687,13 @@ void *rd_kafka_topic_partition_copy_void(const void *src); void rd_kafka_topic_partition_destroy_free(void *ptr); rd_kafka_topic_partition_t * rd_kafka_topic_partition_new_from_rktp(rd_kafka_toppar_t *rktp); +rd_kafka_topic_partition_t * +rd_kafka_topic_partition_new_with_topic_id(rd_kafka_Uuid_t topic_id, + int32_t partition); +void rd_kafka_topic_partition_set_topic_id(rd_kafka_topic_partition_t *rktpar, + rd_kafka_Uuid_t topic_id); +rd_kafka_Uuid_t +rd_kafka_topic_partition_get_topic_id(const rd_kafka_topic_partition_t *rktpar); void rd_kafka_topic_partition_list_init( rd_kafka_topic_partition_list_t *rktparlist, @@ -578,20 +703,33 @@ void rd_kafka_topic_partition_list_destroy_free(void *ptr); void rd_kafka_topic_partition_list_clear( rd_kafka_topic_partition_list_t *rktparlist); +rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_add0( + const char *func, + int line, + rd_kafka_topic_partition_list_t *rktparlist, + const char *topic, + int32_t partition, + rd_kafka_toppar_t *rktp, + const rd_kafka_topic_partition_private_t *parpriv); + +rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_add_with_topic_id( + rd_kafka_topic_partition_list_t *rktparlist, + rd_kafka_Uuid_t topic_id, + int32_t partition); + rd_kafka_topic_partition_t * -rd_kafka_topic_partition_list_add0(const char *func, - int line, - rd_kafka_topic_partition_list_t *rktparlist, - const char *topic, - int32_t partition, - rd_kafka_toppar_t *_private); +rd_kafka_topic_partition_list_add_with_topic_name_and_id( + rd_kafka_topic_partition_list_t *rktparlist, + rd_kafka_Uuid_t topic_id, + const char *topic, + int32_t partition); rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_upsert( rd_kafka_topic_partition_list_t *rktparlist, const char *topic, int32_t partition); -void rd_kafka_topic_partition_list_add_copy( +rd_kafka_topic_partition_t 
*rd_kafka_topic_partition_list_add_copy( rd_kafka_topic_partition_list_t *rktparlist, const rd_kafka_topic_partition_t *rktpar); @@ -627,19 +765,38 @@ int rd_kafka_topic_partition_match(rd_kafka_t *rk, int rd_kafka_topic_partition_cmp(const void *_a, const void *_b); +int rd_kafka_topic_partition_by_id_cmp(const void *_a, const void *_b); unsigned int rd_kafka_topic_partition_hash(const void *a); int rd_kafka_topic_partition_list_find_idx( const rd_kafka_topic_partition_list_t *rktparlist, const char *topic, int32_t partition); -rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_find_topic( + +rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_find_by_id( + const rd_kafka_topic_partition_list_t *rktparlist, + rd_kafka_Uuid_t topic_id, + int32_t partition); + +int rd_kafka_topic_partition_list_find_idx_by_id( + const rd_kafka_topic_partition_list_t *rktparlist, + rd_kafka_Uuid_t topic_id, + int32_t partition); + +rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_find_topic_by_name( const rd_kafka_topic_partition_list_t *rktparlist, const char *topic); +rd_kafka_topic_partition_t *rd_kafka_topic_partition_list_find_topic_by_id( + const rd_kafka_topic_partition_list_t *rktparlist, + rd_kafka_Uuid_t topic_id); + void rd_kafka_topic_partition_list_sort_by_topic( rd_kafka_topic_partition_list_t *rktparlist); +void rd_kafka_topic_partition_list_sort_by_topic_id( + rd_kafka_topic_partition_list_t *rktparlist); + void rd_kafka_topic_partition_list_reset_offsets( rd_kafka_topic_partition_list_t *rktparlist, int64_t offset); @@ -658,15 +815,99 @@ int rd_kafka_topic_partition_list_cmp(const void *_a, const void *_b, int (*cmp)(const void *, const void *)); +/** + * Creates a new empty topic partition private. + * + * @remark This struct is dynamically allocated and hence should be freed. + */ +static RD_UNUSED RD_INLINE rd_kafka_topic_partition_private_t * +rd_kafka_topic_partition_private_new() { + rd_kafka_topic_partition_private_t *parpriv; + parpriv = rd_calloc(1, sizeof(*parpriv)); + parpriv->leader_epoch = -1; + parpriv->current_leader_epoch = -1; + return parpriv; +} + +/** + * @returns (and creates if necessary) the ._private glue object. + */ +static RD_UNUSED RD_INLINE rd_kafka_topic_partition_private_t * +rd_kafka_topic_partition_get_private(rd_kafka_topic_partition_t *rktpar) { + rd_kafka_topic_partition_private_t *parpriv; + + if (!(parpriv = rktpar->_private)) { + parpriv = rd_kafka_topic_partition_private_new(); + rktpar->_private = parpriv; + } + + return parpriv; +} + + +/** + * @returns the partition leader current epoch, if relevant and known, + * else -1. + * + * @param rktpar Partition object. + * + * @remark See KIP-320 for more information. + */ +int32_t rd_kafka_topic_partition_get_current_leader_epoch( + const rd_kafka_topic_partition_t *rktpar); + + +/** + * @brief Sets the partition leader current epoch (use -1 to clear). + * + * @param rktpar Partition object. + * @param leader_epoch Partition leader current epoch, use -1 to reset. + * + * @remark See KIP-320 for more information. + */ +void rd_kafka_topic_partition_set_current_leader_epoch( + rd_kafka_topic_partition_t *rktpar, + int32_t leader_epoch); + +/** + * @returns the partition's rktp if set (no refcnt increase), else NULL. 
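+ *
+ * @remark Unlike rd_kafka_topic_partition_get_toppar() below, no new
+ *         reference is taken, so the caller must not destroy the
+ *         returned toppar.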
+ */ +static RD_INLINE RD_UNUSED rd_kafka_toppar_t * +rd_kafka_topic_partition_toppar(rd_kafka_t *rk, + const rd_kafka_topic_partition_t *rktpar) { + const rd_kafka_topic_partition_private_t *parpriv; + + if ((parpriv = rktpar->_private)) + return parpriv->rktp; + + return NULL; +} + rd_kafka_toppar_t * rd_kafka_topic_partition_ensure_toppar(rd_kafka_t *rk, rd_kafka_topic_partition_t *rktpar, rd_bool_t create_on_miss); -rd_kafka_toppar_t *rd_kafka_topic_partition_get_toppar( - rd_kafka_t *rk, - rd_kafka_topic_partition_t *rktpar, - rd_bool_t create_on_miss) RD_WARN_UNUSED_RESULT; +/** + * @returns (and sets if necessary) the \p rktpar's ._private. + * @remark a new reference is returned. + */ +static RD_INLINE RD_UNUSED rd_kafka_toppar_t * +rd_kafka_topic_partition_get_toppar(rd_kafka_t *rk, + rd_kafka_topic_partition_t *rktpar, + rd_bool_t create_on_miss) { + rd_kafka_toppar_t *rktp; + + rktp = + rd_kafka_topic_partition_ensure_toppar(rk, rktpar, create_on_miss); + + if (rktp) + rd_kafka_toppar_keep(rktp); + + return rktp; +} + + void rd_kafka_topic_partition_list_update_toppars( rd_kafka_t *rk, @@ -719,6 +960,23 @@ void rd_kafka_topic_partition_list_update( int rd_kafka_topic_partition_leader_cmp(const void *_a, const void *_b); +void rd_kafka_topic_partition_set_from_fetch_pos( + rd_kafka_topic_partition_t *rktpar, + const rd_kafka_fetch_pos_t fetchpos); + +void rd_kafka_topic_partition_set_metadata_from_rktp_stored( + rd_kafka_topic_partition_t *rktpar, + const rd_kafka_toppar_t *rktp); + +static RD_UNUSED rd_kafka_fetch_pos_t rd_kafka_topic_partition_get_fetch_pos( + const rd_kafka_topic_partition_t *rktpar) { + rd_kafka_fetch_pos_t fetchpos = { + rktpar->offset, rd_kafka_topic_partition_get_leader_epoch(rktpar)}; + + return fetchpos; +} + + /** * @brief Match function that returns true if partition has a valid offset. */ @@ -753,6 +1011,12 @@ rd_kafka_resp_err_t rd_kafka_topic_partition_list_get_err( int rd_kafka_topic_partition_list_regex_cnt( const rd_kafka_topic_partition_list_t *rktparlist); +rd_kafka_topic_partition_list_t *rd_kafka_topic_partition_list_remove_regexes( + const rd_kafka_topic_partition_list_t *rktparlist); + +rd_kafkap_str_t *rd_kafka_topic_partition_list_combine_regexes( + const rd_kafka_topic_partition_list_t *rktparlist); + void *rd_kafka_topic_partition_list_copy_opaque(const void *src, void *opaque); /** @@ -865,4 +1129,54 @@ static RD_UNUSED int rd_kafka_toppar_topic_cmp(const void *_a, const void *_b) { } + +/** + * @brief Sets the partition's next fetch position, i.e., the next offset + * to start fetching from. + * + * @locks rd_kafka_toppar_lock(rktp) MUST be held. + */ +static RD_UNUSED RD_INLINE void +rd_kafka_toppar_set_next_fetch_position(rd_kafka_toppar_t *rktp, + rd_kafka_fetch_pos_t next_pos) { + rktp->rktp_next_fetch_start = next_pos; +} + +/** + * @brief Sets the offset validation position. + * + * @locks rd_kafka_toppar_lock(rktp) MUST be held.
+ */ +static RD_UNUSED RD_INLINE void rd_kafka_toppar_set_offset_validation_position( + rd_kafka_toppar_t *rktp, + rd_kafka_fetch_pos_t offset_validation_pos) { + rktp->rktp_offset_validation_pos = offset_validation_pos; +} + +rd_kafka_topic_partition_list_t * +rd_kafka_topic_partition_list_intersection_by_name( + rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b); + +rd_kafka_topic_partition_list_t * +rd_kafka_topic_partition_list_difference_by_name( + rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b); + +rd_kafka_topic_partition_list_t * +rd_kafka_topic_partition_list_union_by_name(rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b); + +rd_kafka_topic_partition_list_t * +rd_kafka_topic_partition_list_intersection_by_id( + rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b); + +rd_kafka_topic_partition_list_t *rd_kafka_topic_partition_list_difference_by_id( + rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b); + +rd_kafka_topic_partition_list_t * +rd_kafka_topic_partition_list_union_by_id(rd_kafka_topic_partition_list_t *a, + rd_kafka_topic_partition_list_t *b); + #endif /* _RDKAFKA_PARTITION_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_pattern.c b/src/third_party/librdkafka/dist/src/rdkafka_pattern.c index dfe3ef03e60..425f8201a52 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_pattern.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_pattern.c @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2015 Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_pattern.h b/src/third_party/librdkafka/dist/src/rdkafka_pattern.h index 88d183cd32c..5ef6a3464c1 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_pattern.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_pattern.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2015 Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_plugin.c b/src/third_party/librdkafka/dist/src/rdkafka_plugin.c index f58bc5060c6..f084eff7a76 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_plugin.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_plugin.c @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_plugin.h b/src/third_party/librdkafka/dist/src/rdkafka_plugin.h index 1783d5f53cd..cb50a8647ad 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_plugin.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_plugin.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_proto.h b/src/third_party/librdkafka/dist/src/rdkafka_proto.h index f5ae9ed753a..02565ecb3ba 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_proto.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_proto.h @@ -1,7 +1,9 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012,2013 Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. + * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -30,8 +32,10 @@ #define _RDKAFKA_PROTO_H_ +#include "rdstring.h" #include "rdendian.h" #include "rdvarint.h" +#include "rdbase64.h" /* Protocol defines */ #include "rdkafka_protocol.h" @@ -152,13 +156,26 @@ static RD_UNUSED const char *rd_kafka_ApiKey2str(int16_t ApiKey) { "DescribeUserScramCredentialsRequest", [RD_KAFKAP_AlterUserScramCredentials] = "AlterUserScramCredentialsRequest", - [RD_KAFKAP_Vote] = "VoteRequest", - [RD_KAFKAP_BeginQuorumEpoch] = "BeginQuorumEpochRequest", - [RD_KAFKAP_EndQuorumEpoch] = "EndQuorumEpochRequest", - [RD_KAFKAP_DescribeQuorum] = "DescribeQuorumRequest", - [RD_KAFKAP_AlterIsr] = "AlterIsrRequest", - [RD_KAFKAP_UpdateFeatures] = "UpdateFeaturesRequest", - [RD_KAFKAP_Envelope] = "EnvelopeRequest", + [RD_KAFKAP_Vote] = "VoteRequest", + [RD_KAFKAP_BeginQuorumEpoch] = "BeginQuorumEpochRequest", + [RD_KAFKAP_EndQuorumEpoch] = "EndQuorumEpochRequest", + [RD_KAFKAP_DescribeQuorum] = "DescribeQuorumRequest", + [RD_KAFKAP_AlterIsr] = "AlterIsrRequest", + [RD_KAFKAP_UpdateFeatures] = "UpdateFeaturesRequest", + [RD_KAFKAP_Envelope] = "EnvelopeRequest", + [RD_KAFKAP_FetchSnapshot] = "FetchSnapshot", + [RD_KAFKAP_DescribeCluster] = "DescribeCluster", + [RD_KAFKAP_DescribeProducers] = "DescribeProducers", + [RD_KAFKAP_BrokerHeartbeat] = "BrokerHeartbeat", + [RD_KAFKAP_UnregisterBroker] = "UnregisterBroker", + [RD_KAFKAP_DescribeTransactions] = "DescribeTransactions", + [RD_KAFKAP_ListTransactions] = "ListTransactions", + [RD_KAFKAP_AllocateProducerIds] = "AllocateProducerIds", + [RD_KAFKAP_ConsumerGroupHeartbeat] = "ConsumerGroupHeartbeat", + [RD_KAFKAP_ConsumerGroupDescribe] = "ConsumerGroupDescribe", + [RD_KAFKAP_GetTelemetrySubscriptions] = "GetTelemetrySubscriptions", + [RD_KAFKAP_PushTelemetry] = "PushTelemetry", + }; static RD_TLS char ret[64]; @@ -267,6 +284,8 @@ typedef struct rd_kafkap_str_s { #define RD_KAFKAP_STR_INITIALIZER \ { .len = RD_KAFKAP_STR_LEN_NULL, .str = NULL } +#define RD_KAFKAP_STR_INITIALIZER_EMPTY \ + { .len = 0, .str = "" } /** * Frees a Kafka string previously allocated with `rd_kafkap_str_new()` */ @@ -370,7 +389,7 @@ typedef struct rd_kafkap_bytes_s { int32_t len; /* Kafka bytes length (-1=NULL, 0=empty, >0=data) */ const void *data; /* points just past the struct, or other memory, * not NULL-terminated */ - const char _data[1]; /* Bytes following struct when new()ed */ + const unsigned char _data[1]; /* Bytes following struct when new()ed */ } rd_kafkap_bytes_t; @@ -415,7 +434,7 @@ static RD_UNUSED void rd_kafkap_bytes_destroy(rd_kafkap_bytes_t *kbytes) { * - No-copy, just alloc (bytes==NULL,len>0) */ static RD_INLINE RD_UNUSED rd_kafkap_bytes_t * -rd_kafkap_bytes_new(const char *bytes, int32_t len) { +rd_kafkap_bytes_new(const unsigned char *bytes, int32_t len) { rd_kafkap_bytes_t *kbytes; int32_t klen; @@ -432,7 +451,7 @@ rd_kafkap_bytes_new(const char *bytes, int32_t len) { if (len == RD_KAFKAP_BYTES_LEN_NULL) kbytes->data = NULL; 
else { - kbytes->data = ((const char *)(kbytes + 1)) + 4; + kbytes->data = ((const unsigned char *)(kbytes + 1)) + 4; if (bytes) memcpy((void *)kbytes->data, bytes, len); } @@ -447,7 +466,7 @@ rd_kafkap_bytes_new(const char *bytes, int32_t len) { */ static RD_INLINE RD_UNUSED rd_kafkap_bytes_t * rd_kafkap_bytes_copy(const rd_kafkap_bytes_t *src) { - return rd_kafkap_bytes_new((const char *)src->data, src->len); + return rd_kafkap_bytes_new((const unsigned char *)src->data, src->len); } @@ -557,6 +576,76 @@ typedef struct rd_kafka_buf_s rd_kafka_buf_t; (8 + 4 + 4 + 1 + 4 + 2 + 4 + 8 + 8 + 8 + 2 + 4) +/** + * @struct Struct representing UUID protocol primitive type. + */ +typedef struct rd_kafka_Uuid_s { + int64_t + most_significant_bits; /**< Most significant 64 bits for the UUID */ + int64_t least_significant_bits; /**< Least significant 64 bits for the + UUID */ + char base64str[23]; /**< base64 encoding for the uuid. By default, it is + lazy loaded. Use function + `rd_kafka_Uuid_base64str()` as a getter for this + field. */ +} rd_kafka_Uuid_t; + +#define RD_KAFKA_UUID_ZERO \ + (rd_kafka_Uuid_t) { \ + 0, 0, "" \ + } + +#define RD_KAFKA_UUID_IS_ZERO(uuid) \ + (!rd_kafka_Uuid_cmp(uuid, RD_KAFKA_UUID_ZERO)) + +#define RD_KAFKA_UUID_METADATA_TOPIC_ID \ + (rd_kafka_Uuid_t) { \ + 0, 1, "" \ + } + +static RD_INLINE RD_UNUSED int rd_kafka_Uuid_cmp(rd_kafka_Uuid_t a, + rd_kafka_Uuid_t b) { + if (a.most_significant_bits < b.most_significant_bits) + return -1; + if (a.most_significant_bits > b.most_significant_bits) + return 1; + if (a.least_significant_bits < b.least_significant_bits) + return -1; + if (a.least_significant_bits > b.least_significant_bits) + return 1; + return 0; +} + +static RD_INLINE RD_UNUSED int rd_kafka_Uuid_ptr_cmp(void *a, void *b) { + rd_kafka_Uuid_t *a_uuid = a, *b_uuid = b; + return rd_kafka_Uuid_cmp(*a_uuid, *b_uuid); +} + +rd_kafka_Uuid_t rd_kafka_Uuid_random(); + +char *rd_kafka_Uuid_str(const rd_kafka_Uuid_t *uuid); + +unsigned int rd_kafka_Uuid_hash(const rd_kafka_Uuid_t *uuid); + +unsigned int rd_kafka_Uuid_map_hash(const void *key); + +/** + * @brief UUID copier for rd_list_copy() + */ +static RD_UNUSED void *rd_list_Uuid_copy(const void *elem, void *opaque) { + return (void *)rd_kafka_Uuid_copy((rd_kafka_Uuid_t *)elem); +} + +static RD_INLINE RD_UNUSED void rd_list_Uuid_destroy(void *uuid) { + rd_kafka_Uuid_destroy((rd_kafka_Uuid_t *)uuid); +} + +static RD_INLINE RD_UNUSED int rd_list_Uuid_cmp(const void *uuid1, + const void *uuid2) { + return rd_kafka_Uuid_cmp(*((rd_kafka_Uuid_t *)uuid1), + *((rd_kafka_Uuid_t *)uuid2)); +} + /** * @name Producer ID and Epoch for the Idempotent Producer diff --git a/src/third_party/librdkafka/dist/src/rdkafka_protocol.h b/src/third_party/librdkafka/dist/src/rdkafka_protocol.h index aa9db5392bd..4755494d0b0 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_protocol.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_protocol.h @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2020 Magnus Edenhill + * Copyright (c) 2020-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without
@@ -105,7 +106,23 @@
 #define RD_KAFKAP_AlterIsr                  56
 #define RD_KAFKAP_UpdateFeatures            57
 #define RD_KAFKAP_Envelope                  58
-#define RD_KAFKAP__NUM                      59
+#define RD_KAFKAP_FetchSnapshot             59
+#define RD_KAFKAP_DescribeCluster           60
+#define RD_KAFKAP_DescribeProducers         61
+#define RD_KAFKAP_BrokerRegistration        62
+#define RD_KAFKAP_BrokerHeartbeat           63
+#define RD_KAFKAP_UnregisterBroker          64
+#define RD_KAFKAP_DescribeTransactions      65
+#define RD_KAFKAP_ListTransactions          66
+#define RD_KAFKAP_AllocateProducerIds       67
+#define RD_KAFKAP_ConsumerGroupHeartbeat    68
+#define RD_KAFKAP_ConsumerGroupDescribe     69
+#define RD_KAFKAP_ControllerRegistration    70
+#define RD_KAFKAP_GetTelemetrySubscriptions 71
+#define RD_KAFKAP_PushTelemetry             72
+#define RD_KAFKAP_AssignReplicasToDirs      73
+
+#define RD_KAFKAP__NUM                      74
 
 #endif /* _RDKAFKA_PROTOCOL_H_ */
diff --git a/src/third_party/librdkafka/dist/src/rdkafka_queue.c b/src/third_party/librdkafka/dist/src/rdkafka_queue.c
index 6a829c45154..92eddccae9c 100644
--- a/src/third_party/librdkafka/dist/src/rdkafka_queue.c
+++ b/src/third_party/librdkafka/dist/src/rdkafka_queue.c
@@ -1,7 +1,8 @@
 /*
  * librdkafka - The Apache Kafka C/C++ library
  *
- * Copyright (c) 2016 Magnus Edenhill
+ * Copyright (c) 2016-2022, Magnus Edenhill,
+ *               2023, Confluent Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -83,16 +84,21 @@ void rd_kafka_q_destroy_final(rd_kafka_q_t *rkq) {
  */
 void rd_kafka_q_init0(rd_kafka_q_t *rkq,
                       rd_kafka_t *rk,
+                      rd_bool_t for_consume,
                       const char *func,
                       int line) {
         rd_kafka_q_reset(rkq);
         rkq->rkq_fwdq   = NULL;
         rkq->rkq_refcnt = 1;
         rkq->rkq_flags  = RD_KAFKA_Q_F_READY;
-        rkq->rkq_rk     = rk;
-        rkq->rkq_qio    = NULL;
-        rkq->rkq_serve  = NULL;
-        rkq->rkq_opaque = NULL;
+        if (for_consume)
+                rkq->rkq_flags |= RD_KAFKA_Q_F_CONSUMER;
+        rkq->rkq_rk                 = rk;
+        rkq->rkq_qio                = NULL;
+        rkq->rkq_serve              = NULL;
+        rkq->rkq_opaque             = NULL;
+        rkq->rkq_ts_last_poll_start = 0;
+        rkq->rkq_ts_last_poll_end   = 0;
         mtx_init(&rkq->rkq_lock, mtx_plain);
         cnd_init(&rkq->rkq_cond);
 #if ENABLE_DEVEL
@@ -106,9 +112,15 @@ void rd_kafka_q_init0(rd_kafka_q_t *rkq,
 /**
  * Allocate a new queue and initialize it.
  */
-rd_kafka_q_t *rd_kafka_q_new0(rd_kafka_t *rk, const char *func, int line) {
+rd_kafka_q_t *rd_kafka_q_new0(rd_kafka_t *rk,
+                              rd_bool_t for_consume,
+                              const char *func,
+                              int line) {
         rd_kafka_q_t *rkq = rd_malloc(sizeof(*rkq));
-        rd_kafka_q_init(rkq, rk);
+        if (!for_consume)
+                rd_kafka_q_init(rkq, rk);
+        else
+                rd_kafka_consume_q_init(rkq, rk);
         rkq->rkq_flags |= RD_KAFKA_Q_F_ALLOCATED;
 #if ENABLE_DEVEL
         rd_snprintf(rkq->rkq_name, sizeof(rkq->rkq_name), "%s:%d", func, line);
@@ -118,6 +130,33 @@ rd_kafka_q_t *rd_kafka_q_new0(rd_kafka_t *rk, const char *func, int line) {
         return rkq;
 }
 
+/*
+ * Sets the flag RD_KAFKA_Q_F_CONSUMER for rkq and for any queues it is being
+ * forwarded to, recursively.
+ * Setting this flag indicates that polling this queue is equivalent to calling
+ * consumer poll, and will reset the max.poll.interval.ms timer. Only used
+ * internally when forwarding queues.
+ * @locks rd_kafka_q_lock(rkq)
+ */
+static void rd_kafka_q_consumer_propagate(rd_kafka_q_t *rkq) {
+        mtx_lock(&rkq->rkq_lock);
+        rkq->rkq_flags |= RD_KAFKA_Q_F_CONSUMER;
+
+        if (!rkq->rkq_fwdq) {
+                mtx_unlock(&rkq->rkq_lock);
+                return;
+        }
+
+        /* Recursively propagate the flag to any queues rkq is already
+         * forwarding to. There will be a deadlock here if the queues are being
+         * forwarded circularly, but that is a user error.
We can't resolve this + * deadlock by unlocking before the recursive call, because that leads + * to incorrectness if the rkq_fwdq is forwarded elsewhere and the old + * one destroyed between recursive calls. */ + rd_kafka_q_consumer_propagate(rkq->rkq_fwdq); + mtx_unlock(&rkq->rkq_lock); +} + /** * Set/clear forward queue. * Queue forwarding enables message routing inside rdkafka. @@ -152,6 +191,9 @@ void rd_kafka_q_fwd_set0(rd_kafka_q_t *srcq, } srcq->rkq_fwdq = destq; + + if (srcq->rkq_flags & RD_KAFKA_Q_F_CONSUMER) + rd_kafka_q_consumer_propagate(destq); } if (do_lock) mtx_unlock(&srcq->rkq_lock); @@ -340,16 +382,26 @@ rd_kafka_op_filter(rd_kafka_q_t *rkq, rd_kafka_op_t *rko, int version) { * Serve q like rd_kafka_q_serve() until an op is found that can be returned * as an event to the application. * + * @param rkq Queue to pop from. + * @param timeout_us Maximum time to wait for an op, in microseconds. + * @param version Fetch version to filter out outdated ops. + * @param cb_type Callback type to use for the op. + * @param callback Callback to use for the op, if any. + * @param opaque Opaque pointer to pass to the callback. + * @param is_consume_call If `rd_true` and it could be a consumer call it + * checks if this queue can contain fetched messages. + * * @returns the first event:able op, or NULL on timeout. * - * Locality: any thread + * @locality any thread */ -rd_kafka_op_t *rd_kafka_q_pop_serve(rd_kafka_q_t *rkq, - rd_ts_t timeout_us, - int32_t version, - rd_kafka_q_cb_type_t cb_type, - rd_kafka_q_serve_cb_t *callback, - void *opaque) { +static rd_kafka_op_t *rd_kafka_q_pop_serve0(rd_kafka_q_t *rkq, + rd_ts_t timeout_us, + int32_t version, + rd_kafka_q_cb_type_t cb_type, + rd_kafka_q_serve_cb_t *callback, + void *opaque, + rd_bool_t is_consume_call) { rd_kafka_op_t *rko; rd_kafka_q_t *fwdq; @@ -359,9 +411,15 @@ rd_kafka_op_t *rd_kafka_q_pop_serve(rd_kafka_q_t *rkq, rd_kafka_yield_thread = 0; if (!(fwdq = rd_kafka_q_fwd_get(rkq, 0))) { - struct timespec timeout_tspec; + const rd_bool_t can_q_contain_fetched_msgs = + is_consume_call && + rd_kafka_q_can_contain_fetched_msgs(rkq, RD_DONT_LOCK); - rd_timeout_init_timespec_us(&timeout_tspec, timeout_us); + rd_ts_t abs_timeout = rd_timeout_init_us(timeout_us); + + if (can_q_contain_fetched_msgs) + rd_kafka_app_poll_start(rkq->rkq_rk, rkq, 0, + timeout_us); while (1) { rd_kafka_op_res_t res; @@ -400,15 +458,24 @@ rd_kafka_op_t *rd_kafka_q_pop_serve(rd_kafka_q_t *rkq, goto retry; /* Next op */ } else if (unlikely(res == RD_KAFKA_OP_RES_YIELD)) { + if (can_q_contain_fetched_msgs) + rd_kafka_app_polled(rkq->rkq_rk, + rkq); /* Callback yielded, unroll */ return NULL; - } else + } else { + if (can_q_contain_fetched_msgs) + rd_kafka_app_polled(rkq->rkq_rk, + rkq); break; /* Proper op, handle below. */ + } } if (unlikely(rd_kafka_q_check_yield(rkq))) { if (is_locked) mtx_unlock(&rkq->rkq_lock); + if (can_q_contain_fetched_msgs) + rd_kafka_app_polled(rkq->rkq_rk, rkq); return NULL; } @@ -416,8 +483,10 @@ rd_kafka_op_t *rd_kafka_q_pop_serve(rd_kafka_q_t *rkq, mtx_lock(&rkq->rkq_lock); if (cnd_timedwait_abs(&rkq->rkq_cond, &rkq->rkq_lock, - &timeout_tspec) != thrd_success) { + abs_timeout) != thrd_success) { mtx_unlock(&rkq->rkq_lock); + if (can_q_contain_fetched_msgs) + rd_kafka_app_polled(rkq->rkq_rk, rkq); return NULL; } } @@ -426,8 +495,8 @@ rd_kafka_op_t *rd_kafka_q_pop_serve(rd_kafka_q_t *rkq, /* Since the q_pop may block we need to release the parent * queue's lock. 
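+         * The fwdq reference obtained from rd_kafka_q_fwd_get() above keeps
+         * the forwarded queue alive while this queue is unlocked; it is
+         * released again via the rd_kafka_q_destroy(fwdq) just below.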
*/ mtx_unlock(&rkq->rkq_lock); - rko = rd_kafka_q_pop_serve(fwdq, timeout_us, version, cb_type, - callback, opaque); + rko = rd_kafka_q_pop_serve0(fwdq, timeout_us, version, cb_type, + callback, opaque, is_consume_call); rd_kafka_q_destroy(fwdq); } @@ -435,34 +504,73 @@ rd_kafka_op_t *rd_kafka_q_pop_serve(rd_kafka_q_t *rkq, return rko; } +rd_kafka_op_t *rd_kafka_q_pop_serve(rd_kafka_q_t *rkq, + rd_ts_t timeout_us, + int32_t version, + rd_kafka_q_cb_type_t cb_type, + rd_kafka_q_serve_cb_t *callback, + void *opaque) { + return rd_kafka_q_pop_serve0(rkq, timeout_us, version, cb_type, + callback, opaque, rd_false); +} + +/** + * @brief Same as `rd_kafka_q_pop_serve`, use this call when the queue + * could be a fetch queue, use the other one when it + * can never be. + */ +rd_kafka_op_t * +rd_kafka_q_pop_serve_maybe_consume(rd_kafka_q_t *rkq, + rd_ts_t timeout_us, + int32_t version, + rd_kafka_q_cb_type_t cb_type, + rd_kafka_q_serve_cb_t *callback, + void *opaque) { + return rd_kafka_q_pop_serve0(rkq, timeout_us, version, cb_type, + callback, opaque, + /* Only check if to call app_polled when + * this is a consumer. */ + rkq->rkq_rk->rk_type == RD_KAFKA_CONSUMER); +} + rd_kafka_op_t * rd_kafka_q_pop(rd_kafka_q_t *rkq, rd_ts_t timeout_us, int32_t version) { return rd_kafka_q_pop_serve(rkq, timeout_us, version, RD_KAFKA_Q_CB_RETURN, NULL, NULL); } - /** * Pop all available ops from a queue and call the provided * callback for each op. - * `max_cnt` limits the number of ops served, 0 = no limit. * - * Returns the number of ops served. + * @param rkq Queue to serve. + * @param max_cnt Limits the number of ops served, 0 = no limit. + * @param cb_type Callback type to use. + * @param callback Callback to call for each op. + * @param opaque Opaque pointer to pass to the callback. + * @param is_consume_call If `rd_true` and it could be a consumer call it + * checks if this queue can contain fetched messages. * - * Locality: any thread. + * @return The number of ops served. + * + * @locality any thread. */ -int rd_kafka_q_serve(rd_kafka_q_t *rkq, - int timeout_ms, - int max_cnt, - rd_kafka_q_cb_type_t cb_type, - rd_kafka_q_serve_cb_t *callback, - void *opaque) { +int rd_kafka_q_serve0(rd_kafka_q_t *rkq, + int timeout_ms, + int max_cnt, + rd_kafka_q_cb_type_t cb_type, + rd_kafka_q_serve_cb_t *callback, + void *opaque, + rd_bool_t is_consume_call) { rd_kafka_t *rk = rkq->rkq_rk; rd_kafka_op_t *rko; rd_kafka_q_t localq; rd_kafka_q_t *fwdq; int cnt = 0; - struct timespec timeout_tspec; + rd_ts_t abs_timeout; + const rd_bool_t can_q_contain_fetched_msgs = + is_consume_call && + rd_kafka_q_can_contain_fetched_msgs(rkq, RD_DONT_LOCK); rd_dassert(cb_type); @@ -474,25 +582,31 @@ int rd_kafka_q_serve(rd_kafka_q_t *rkq, /* Since the q_pop may block we need to release the parent * queue's lock. 
*/ mtx_unlock(&rkq->rkq_lock); - ret = rd_kafka_q_serve(fwdq, timeout_ms, max_cnt, cb_type, - callback, opaque); + ret = rd_kafka_q_serve0(fwdq, timeout_ms, max_cnt, cb_type, + callback, opaque, is_consume_call); rd_kafka_q_destroy(fwdq); return ret; } - rd_timeout_init_timespec(&timeout_tspec, timeout_ms); + + abs_timeout = rd_timeout_init(timeout_ms); + + if (can_q_contain_fetched_msgs) + rd_kafka_app_poll_start(rk, rkq, 0, timeout_ms); /* Wait for op */ while (!(rko = TAILQ_FIRST(&rkq->rkq_q)) && !rd_kafka_q_check_yield(rkq) && - cnd_timedwait_abs(&rkq->rkq_cond, &rkq->rkq_lock, - &timeout_tspec) == thrd_success) + cnd_timedwait_abs(&rkq->rkq_cond, &rkq->rkq_lock, abs_timeout) == + thrd_success) ; rd_kafka_q_mark_served(rkq); if (!rko) { mtx_unlock(&rkq->rkq_lock); + if (can_q_contain_fetched_msgs) + rd_kafka_app_polled(rk, rkq); return 0; } @@ -527,11 +641,42 @@ int rd_kafka_q_serve(rd_kafka_q_t *rkq, } } + if (can_q_contain_fetched_msgs) + rd_kafka_app_polled(rk, rkq); + rd_kafka_q_destroy_owner(&localq); return cnt; } +int rd_kafka_q_serve(rd_kafka_q_t *rkq, + int timeout_ms, + int max_cnt, + rd_kafka_q_cb_type_t cb_type, + rd_kafka_q_serve_cb_t *callback, + void *opaque) { + return rd_kafka_q_serve0(rkq, timeout_ms, max_cnt, cb_type, callback, + opaque, rd_false); +} + +/** + * @brief Same as `rd_kafka_q_serve`, use this call when the queue + * could be a fetch queue, use the other one when it + * can never be. + */ +int rd_kafka_q_serve_maybe_consume(rd_kafka_q_t *rkq, + int timeout_ms, + int max_cnt, + rd_kafka_q_cb_type_t cb_type, + rd_kafka_q_serve_cb_t *callback, + void *opaque) { + return rd_kafka_q_serve0(rkq, timeout_ms, max_cnt, cb_type, callback, + opaque, + /* Only check if to call app_polled when + * this is a consumer. */ + rkq->rkq_rk->rk_type == RD_KAFKA_CONSUMER); +} + /** * @brief Filter out and destroy outdated messages. * @@ -539,15 +684,17 @@ int rd_kafka_q_serve(rd_kafka_q_t *rkq, * * @locality Any thread. 
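+ *
+ * @remark Called from rd_kafka_q_serve_rkmessages() when a version-barrier
+ *         op (RD_KAFKA_OP_BARRIER) is encountered, so that messages fetched
+ *         for an outdated toppar version are dropped before being returned
+ *         to the application.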
 */
-static size_t rd_kafka_purge_outdated_messages(rd_kafka_toppar_t *rktp,
-                                               int32_t version,
-                                               rd_kafka_message_t **rkmessages,
-                                               size_t cnt) {
+static size_t
+rd_kafka_purge_outdated_messages(rd_kafka_toppar_t *rktp,
+                                 int32_t version,
+                                 rd_kafka_message_t **rkmessages,
+                                 size_t cnt,
+                                 struct rd_kafka_op_tailq *ctrl_msg_q) {
         size_t valid_count = 0;
         size_t i;
+        rd_kafka_op_t *rko, *next;
 
         for (i = 0; i < cnt; i++) {
-                rd_kafka_op_t *rko;
                 rko = rkmessages[i]->_private;
                 if (rko->rko_rktp == rktp &&
                     rd_kafka_op_version_outdated(rko, version)) {
@@ -559,6 +706,19 @@ static size_t rd_kafka_purge_outdated_messages(rd_kafka_toppar_t *rktp,
                         valid_count++;
                 }
         }
+
+        /* Discard outdated control message ops */
+        next = TAILQ_FIRST(ctrl_msg_q);
+        while (next) {
+                rko  = next;
+                next = TAILQ_NEXT(rko, rko_link);
+                if (rko->rko_rktp == rktp &&
+                    rd_kafka_op_version_outdated(rko, version)) {
+                        TAILQ_REMOVE(ctrl_msg_q, rko, rko_link);
+                        rd_kafka_op_destroy(rko);
+                }
+        }
+
         return valid_count;
 }
 
@@ -577,10 +737,13 @@ int rd_kafka_q_serve_rkmessages(rd_kafka_q_t *rkq,
                                 size_t rkmessages_size) {
         unsigned int cnt = 0;
         TAILQ_HEAD(, rd_kafka_op_s) tmpq = TAILQ_HEAD_INITIALIZER(tmpq);
+        struct rd_kafka_op_tailq ctrl_msg_q =
+            TAILQ_HEAD_INITIALIZER(ctrl_msg_q);
         rd_kafka_op_t *rko, *next;
         rd_kafka_t *rk = rkq->rkq_rk;
         rd_kafka_q_t *fwdq;
-        struct timespec timeout_tspec;
+        rd_ts_t abs_timeout;
+        int i;
 
         mtx_lock(&rkq->rkq_lock);
         if ((fwdq = rd_kafka_q_fwd_get(rkq, 0))) {
@@ -592,12 +755,12 @@ int rd_kafka_q_serve_rkmessages(rd_kafka_q_t *rkq,
                 rd_kafka_q_destroy(fwdq);
                 return cnt;
         }
+
         mtx_unlock(&rkq->rkq_lock);
 
-        if (timeout_ms)
-                rd_kafka_app_poll_blocking(rk);
+        abs_timeout = rd_timeout_init(timeout_ms);
 
-        rd_timeout_init_timespec(&timeout_tspec, timeout_ms);
+        rd_kafka_app_poll_start(rk, rkq, 0, timeout_ms);
 
         rd_kafka_yield_thread = 0;
         while (cnt < rkmessages_size) {
@@ -608,7 +771,7 @@ int rd_kafka_q_serve_rkmessages(rd_kafka_q_t *rkq,
                 while (!(rko = TAILQ_FIRST(&rkq->rkq_q)) &&
                        !rd_kafka_q_check_yield(rkq) &&
                        cnd_timedwait_abs(&rkq->rkq_cond, &rkq->rkq_lock,
-                                         &timeout_tspec) == thrd_success)
+                                         abs_timeout) == thrd_success)
                         ;
 
                 rd_kafka_q_mark_served(rkq);
@@ -624,7 +787,8 @@ int rd_kafka_q_serve_rkmessages(rd_kafka_q_t *rkq,
 
                 if (unlikely(rko->rko_type == RD_KAFKA_OP_BARRIER)) {
                         cnt = (unsigned int)rd_kafka_purge_outdated_messages(
-                            rko->rko_rktp, rko->rko_version, rkmessages, cnt);
+                            rko->rko_rktp, rko->rko_version, rkmessages, cnt,
+                            &ctrl_msg_q);
                         rd_kafka_op_destroy(rko);
                         continue;
                 }
@@ -649,22 +813,31 @@ int rd_kafka_q_serve_rkmessages(rd_kafka_q_t *rkq,
                 }
 
                 rd_dassert(res == RD_KAFKA_OP_RES_PASS);
 
-                if (!rko->rko_err && rko->rko_type == RD_KAFKA_OP_FETCH) {
-                        /* Store offset, etc. */
-                        rd_kafka_fetch_op_app_prepare(rk, rko);
-
-                        /* If this is a control messages, don't return
-                         * message to application, only store the offset */
-                        if (unlikely(rd_kafka_op_is_ctrl_msg(rko))) {
-                                rd_kafka_op_destroy(rko);
-                                continue;
-                        }
+                /* If this is a control message, don't return the message to
+                 * the application. Add it to a tmp queue from which we can
+                 * store the offset and destroy the op */
+                if (unlikely(rd_kafka_op_is_ctrl_msg(rko))) {
+                        TAILQ_INSERT_TAIL(&ctrl_msg_q, rko, rko_link);
+                        continue;
                 }
 
                 /* Get rkmessage from rko and append to array.
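+                 * The op remains attached to the returned rkmessage via its
+                 * _private field (the same field the barrier purge above
+                 * inspects) and is destroyed together with the message.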
*/ rkmessages[cnt++] = rd_kafka_message_get(rko); } + for (i = cnt - 1; i >= 0; i--) { + rko = (rd_kafka_op_t *)rkmessages[i]->_private; + rd_kafka_toppar_t *rktp = rko->rko_rktp; + int64_t offset = rkmessages[i]->offset + 1; + if (unlikely(rktp && (rktp->rktp_app_pos.offset < offset))) + rd_kafka_update_app_pos( + rk, rktp, + RD_KAFKA_FETCH_POS( + offset, + rd_kafka_message_leader_epoch(rkmessages[i])), + RD_DO_LOCK); + } + /* Discard non-desired and already handled ops */ next = TAILQ_FIRST(&tmpq); while (next) { @@ -673,7 +846,25 @@ int rd_kafka_q_serve_rkmessages(rd_kafka_q_t *rkq, rd_kafka_op_destroy(rko); } - rd_kafka_app_polled(rk); + /* Discard ctrl msgs */ + next = TAILQ_FIRST(&ctrl_msg_q); + while (next) { + rko = next; + next = TAILQ_NEXT(next, rko_link); + rd_kafka_toppar_t *rktp = rko->rko_rktp; + int64_t offset = rko->rko_u.fetch.rkm.rkm_rkmessage.offset + 1; + if (rktp && (rktp->rktp_app_pos.offset < offset)) + rd_kafka_update_app_pos( + rk, rktp, + RD_KAFKA_FETCH_POS( + offset, + rd_kafka_message_leader_epoch( + &rko->rko_u.fetch.rkm.rkm_rkmessage)), + RD_DO_LOCK); + rd_kafka_op_destroy(rko); + } + + rd_kafka_app_polled(rk, rkq); return cnt; } diff --git a/src/third_party/librdkafka/dist/src/rdkafka_queue.h b/src/third_party/librdkafka/dist/src/rdkafka_queue.h index 0d50f58703f..ff1a4657703 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_queue.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_queue.h @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2016 Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill, + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -75,6 +76,11 @@ struct rd_kafka_q_s { * by triggering the cond-var \ * but without having to enqueue \ * an op. */ +#define RD_KAFKA_Q_F_CONSUMER \ + 0x10 /* If this flag is set, this queue might contain fetched messages \ + from partitions. Polling this queue will reset the \ + max.poll.interval.ms timer. Once set, this flag is never \ + reset. */ rd_kafka_t *rkq_rk; struct rd_kafka_q_io *rkq_qio; /* FD-based application signalling */ @@ -85,6 +91,12 @@ struct rd_kafka_q_s { * Shall return 1 if op was handled, else 0. */ rd_kafka_q_serve_cb_t *rkq_serve; void *rkq_opaque; + rd_ts_t rkq_ts_last_poll_start; /**< Timestamp of last queue + * poll() call start + * Only relevant for a consumer. */ + rd_ts_t rkq_ts_last_poll_end; /**< Timestamp of last queue + * poll() call end + * Only relevant for a consumer. 
*/
 #if ENABLE_DEVEL
         char rkq_name[64]; /* Debugging: queue name (FUNC:LINE) */
@@ -123,12 +135,20 @@ static RD_INLINE RD_UNUSED int rd_kafka_q_ready(rd_kafka_q_t *rkq) {
 
 void rd_kafka_q_init0(rd_kafka_q_t *rkq,
                       rd_kafka_t *rk,
+                      rd_bool_t for_consume,
                       const char *func,
                       int line);
 #define rd_kafka_q_init(rkq, rk)                                               \
-        rd_kafka_q_init0(rkq, rk, __FUNCTION__, __LINE__)
-rd_kafka_q_t *rd_kafka_q_new0(rd_kafka_t *rk, const char *func, int line);
-#define rd_kafka_q_new(rk) rd_kafka_q_new0(rk, __FUNCTION__, __LINE__)
+        rd_kafka_q_init0(rkq, rk, rd_false, __FUNCTION__, __LINE__)
+#define rd_kafka_consume_q_init(rkq, rk)                                       \
+        rd_kafka_q_init0(rkq, rk, rd_true, __FUNCTION__, __LINE__)
+rd_kafka_q_t *rd_kafka_q_new0(rd_kafka_t *rk,
+                              rd_bool_t for_consume,
+                              const char *func,
+                              int line);
+#define rd_kafka_q_new(rk) rd_kafka_q_new0(rk, rd_false, __FUNCTION__, __LINE__)
+#define rd_kafka_consume_q_new(rk)                                             \
+        rd_kafka_q_new0(rk, rd_true, __FUNCTION__, __LINE__)
 void rd_kafka_q_destroy_final(rd_kafka_q_t *rkq);
 
 #define rd_kafka_q_lock(rkqu) mtx_lock(&(rkqu)->rkq_lock)
@@ -827,6 +847,13 @@ rd_kafka_op_t *rd_kafka_q_pop_serve(rd_kafka_q_t *rkq,
                                     rd_kafka_q_serve_cb_t *callback,
                                     void *opaque);
 rd_kafka_op_t *
+rd_kafka_q_pop_serve_maybe_consume(rd_kafka_q_t *rkq,
+                                   rd_ts_t timeout_us,
+                                   int32_t version,
+                                   rd_kafka_q_cb_type_t cb_type,
+                                   rd_kafka_q_serve_cb_t *callback,
+                                   void *opaque);
+rd_kafka_op_t *
 rd_kafka_q_pop(rd_kafka_q_t *rkq, rd_ts_t timeout_us, int32_t version);
 int rd_kafka_q_serve(rd_kafka_q_t *rkq,
                      int timeout_ms,
@@ -834,6 +861,12 @@ int rd_kafka_q_serve(rd_kafka_q_t *rkq,
                      rd_kafka_q_cb_type_t cb_type,
                      rd_kafka_q_serve_cb_t *callback,
                      void *opaque);
+int rd_kafka_q_serve_maybe_consume(rd_kafka_q_t *rkq,
+                                   int timeout_ms,
+                                   int max_cnt,
+                                   rd_kafka_q_cb_type_t cb_type,
+                                   rd_kafka_q_serve_cb_t *callback,
+                                   void *opaque);
 
 
 int rd_kafka_q_move_cnt(rd_kafka_q_t *dstq,
@@ -1164,6 +1197,22 @@ rd_kafka_enq_once_disable(rd_kafka_enq_once_t *eonce) {
         return rko;
 }
 
+/**
+ * @brief Returns true if the queue can contain fetched messages.
+ *
+ * @locks rd_kafka_q_lock(rkq) if do_lock is set.
+ */
+static RD_INLINE RD_UNUSED rd_bool_t
+rd_kafka_q_can_contain_fetched_msgs(rd_kafka_q_t *rkq, rd_bool_t do_lock) {
+        rd_bool_t val;
+        if (do_lock)
+                mtx_lock(&rkq->rkq_lock);
+        val = rkq->rkq_flags & RD_KAFKA_Q_F_CONSUMER;
+        if (do_lock)
+                mtx_unlock(&rkq->rkq_lock);
+        return val;
+}
+
 /**@}*/
diff --git a/src/third_party/librdkafka/dist/src/rdkafka_range_assignor.c b/src/third_party/librdkafka/dist/src/rdkafka_range_assignor.c
index c83f1f1a44f..4664de069cd 100644
--- a/src/third_party/librdkafka/dist/src/rdkafka_range_assignor.c
+++ b/src/third_party/librdkafka/dist/src/rdkafka_range_assignor.c
@@ -1,7 +1,8 @@
 /*
  * librdkafka - The Apache Kafka C/C++ library
  *
- * Copyright (c) 2015 Magnus Edenhill
+ * Copyright (c) 2015-2022, Magnus Edenhill
+ *               2023, Confluent Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -27,7 +28,7 @@
  */
 #include "rdkafka_int.h"
 #include "rdkafka_assignor.h"
-
+#include "rdunittest.h"
 
 /**
@@ -50,6 +51,445 @@
  *       C1: [t0p2, t1p2]
  */
 
+typedef struct {
+        rd_kafkap_str_t *member_id;
+        rd_list_t *assigned_partitions; /* Contained Type: int* */
+} rd_kafka_member_assigned_partitions_pair_t;
+
+/**
+ * @brief Initializes a rd_kafka_member_assigned_partitions_pair_t* with
+ *        assigned_partitions = [].
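+ *
+ * A minimal usage sketch (illustrative only; `partition_id` stands in for an
+ * int32_t whose storage outlives the pair, since the list only borrows the
+ * pointer):
+ * @code
+ *   rd_kafka_member_assigned_partitions_pair_t *pair =
+ *       rd_kafka_member_assigned_partitions_pair_new(member->rkgm_member_id);
+ *   rd_list_add(pair->assigned_partitions, &partition_id);
+ *   rd_kafka_member_assigned_partitions_pair_destroy(pair);
+ * @endcode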
+ * + * @param member_id + * + * The member_id isn't copied, so the returned value can be used only for the + * lifetime of this function's arguments. + * @return rd_kafka_member_assigned_partitions_pair_t* + */ +static rd_kafka_member_assigned_partitions_pair_t * +rd_kafka_member_assigned_partitions_pair_new(rd_kafkap_str_t *member_id) { + rd_kafka_member_assigned_partitions_pair_t *pair = + rd_calloc(1, sizeof(rd_kafka_member_assigned_partitions_pair_t)); + + pair->member_id = member_id; + pair->assigned_partitions = rd_list_new(0, NULL); + return pair; +} + +static void rd_kafka_member_assigned_partitions_pair_destroy(void *_pair) { + rd_kafka_member_assigned_partitions_pair_t *pair = + (rd_kafka_member_assigned_partitions_pair_t *)_pair; + + /* Do not destroy the member_id, we don't take ownership. */ + RD_IF_FREE(pair->assigned_partitions, rd_list_destroy); + RD_IF_FREE(pair, rd_free); +} + +static int rd_kafka_member_assigned_partitions_pair_cmp(const void *_a, + const void *_b) { + rd_kafka_member_assigned_partitions_pair_t *a = + (rd_kafka_member_assigned_partitions_pair_t *)_a; + rd_kafka_member_assigned_partitions_pair_t *b = + (rd_kafka_member_assigned_partitions_pair_t *)_b; + return rd_kafkap_str_cmp(a->member_id, b->member_id); +} + +static rd_kafka_member_assigned_partitions_pair_t * +rd_kafka_find_member_assigned_partitions_pair_by_member_id( + rd_kafkap_str_t *member_id, + rd_list_t *rd_kafka_member_assigned_partitions_pair_list) { + rd_kafka_member_assigned_partitions_pair_t search_pair = {member_id, + NULL}; + return rd_list_find(rd_kafka_member_assigned_partitions_pair_list, + &search_pair, + rd_kafka_member_assigned_partitions_pair_cmp); +} + +typedef struct { + /* Contains topic and list of members - sorted by group instance id and + * member id. Also contains partitions, along with partition replicas, + * which will help us with the racks. The members also contain their + * rack id and the partitions they have already been assigned. + */ + rd_kafka_assignor_topic_t *topic; + /* unassigned_partitions[i] is true if the ith partition of this topic + * is not assigned. We prefer using an array rather than using an + * rd_list and removing elements, because that involves a memmove on + * each remove. */ + rd_bool_t *unassigned_partitions; + /* Number of partitions still to be assigned.*/ + size_t unassigned_partitions_left; + /* An array of char** arrays. The ith element of this array is a sorted + * char** array, denoting the racks for the ith partition of this topic. + * The size of this array is equal to the partition_cnt. */ + char ***partition_racks; + /* The ith element of this array is the size of partition_racks[i]. */ + size_t *racks_cnt; + /* Contains a pair denoting the partitions assigned to every subscribed + * consumer (member, [rd_list_t* of int*]). Sorted by member_id. + * Contained Type: rd_kafka_member_assigned_partitions_pair_t* */ + rd_list_t *member_to_assigned_partitions; + /* Contains the number of partitions that should be ideally assigned to + * every subscribing consumer. */ + int num_partitions_per_consumer; + /* Contains the number of consumers with extra partitions in case number + * of partitions isn't perfectly divisible by number of consumers. */ + int remaining_consumers_with_extra_partition; + /* True if we need to perform rack aware assignment. */ + rd_bool_t needs_rack_aware_assignment; +} rd_kafka_topic_assignment_state_t; + + +/** + * @brief Initialize an rd_kafka_topic_assignment_state_t. 
+ *
+ * @param topic The assignor topic, including its sorted member list and its
+ *              topic metadata.
+ * @param mdi   The internal metadata, used for partition rack information.
+ *
+ * The struct rd_kafka_topic_assignment_state_t is mostly for convenience and
+ * easy grouping, so we avoid copying values as much as possible. Hence, the
+ * returned rd_kafka_topic_assignment_state_t does not own all its values, and
+ * should not be used beyond the lifetime of this function's arguments. This
+ * function also computes the value of needs_rack_aware_assignment given the
+ * other information.
+ *
+ * @return rd_kafka_topic_assignment_state_t*
+ */
+
+static rd_kafka_topic_assignment_state_t *
+rd_kafka_topic_assignment_state_new(rd_kafka_assignor_topic_t *topic,
+                                    const rd_kafka_metadata_internal_t *mdi) {
+        int i;
+        rd_kafka_group_member_t *member;
+        rd_kafka_topic_assignment_state_t *rktas;
+        const int partition_cnt = topic->metadata->partition_cnt;
+
+        rktas        = rd_calloc(1, sizeof(rd_kafka_topic_assignment_state_t));
+        rktas->topic = topic; /* don't copy. */
+
+        rktas->unassigned_partitions =
+            rd_malloc(sizeof(rd_bool_t) * partition_cnt);
+        rktas->unassigned_partitions_left = partition_cnt;
+        for (i = 0; i < partition_cnt; i++) {
+                rktas->unassigned_partitions[i] = rd_true;
+        }
+
+        rktas->num_partitions_per_consumer              = 0;
+        rktas->remaining_consumers_with_extra_partition = 0;
+        if (rd_list_cnt(&topic->members)) {
+                rktas->num_partitions_per_consumer =
+                    partition_cnt / rd_list_cnt(&topic->members);
+                rktas->remaining_consumers_with_extra_partition =
+                    partition_cnt % rd_list_cnt(&topic->members);
+        }
+
+        rktas->member_to_assigned_partitions =
+            rd_list_new(0, rd_kafka_member_assigned_partitions_pair_destroy);
+
+        RD_LIST_FOREACH(member, &topic->members, i) {
+                rd_list_add(rktas->member_to_assigned_partitions,
+                            rd_kafka_member_assigned_partitions_pair_new(
+                                member->rkgm_member_id));
+        }
+
+        rd_list_sort(rktas->member_to_assigned_partitions,
+                     rd_kafka_member_assigned_partitions_pair_cmp);
+
+        rktas->partition_racks = rd_calloc(partition_cnt, sizeof(char **));
+        rktas->racks_cnt       = rd_calloc(partition_cnt, sizeof(size_t));
+        for (i = 0; topic->metadata_internal->partitions && i < partition_cnt;
+             i++) {
+                rktas->racks_cnt[i] =
+                    topic->metadata_internal->partitions[i].racks_cnt;
+                rktas->partition_racks[i] =
+                    topic->metadata_internal->partitions[i].racks;
+        }
+
+        rktas->needs_rack_aware_assignment =
+            rd_kafka_use_rack_aware_assignment(&topic, 1, mdi);
+
+        return rktas;
+}
+
+/* Destroy a rd_kafka_topic_assignment_state_t. */
+static void rd_kafka_topic_assignment_state_destroy(void *_rktas) {
+        rd_kafka_topic_assignment_state_t *rktas =
+            (rd_kafka_topic_assignment_state_t *)_rktas;
+
+        rd_free(rktas->unassigned_partitions);
+        rd_list_destroy(rktas->member_to_assigned_partitions);
+        rd_free(rktas->partition_racks);
+        rd_free(rktas->racks_cnt);
+        rd_free(rktas);
+}
+
+/**
+ * Compare two topic_assignment_states, first on the sorted list of consumers
+ * (each consumer from the list of consumers is matched till the first point of
+ * difference), and if that's equal, compare on the number of partitions.
+ *
+ * A list sorted with this comparator will group the topic_assignment_states
+ * having the same consumers and the same number of partitions together - this
+ * is the criterion for co-partitioned topics.
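+ *
+ * For example (illustrative): topics t1 and t2, each with three partitions
+ * and both subscribed to by exactly {consumer1, consumer2}, compare equal and
+ * end up adjacent in the sorted list, forming one co-partitioned bucket; a
+ * topic t3 with two partitions compares differently and forms its own bucket.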
+ */
+static int rd_kafka_topic_assignment_state_cmp(const void *_a, const void *_b) {
+        int i;
+        rd_kafka_topic_assignment_state_t *a =
+            (rd_kafka_topic_assignment_state_t *)_a;
+        rd_kafka_topic_assignment_state_t *b =
+            (rd_kafka_topic_assignment_state_t *)_b;
+
+        /* This guarantee comes from rd_kafka_range_assignor_assign_cb. */
+        rd_assert(a->topic->members.rl_flags & RD_LIST_F_SORTED);
+        rd_assert(b->topic->members.rl_flags & RD_LIST_F_SORTED);
+
+        /* Based on consumers */
+        for (i = 0; i < rd_list_cnt(&a->topic->members) &&
+                    i < rd_list_cnt(&b->topic->members);
+             i++) {
+                rd_kafka_group_member_t *am =
+                    rd_list_elem(&a->topic->members, i);
+                rd_kafka_group_member_t *bm =
+                    rd_list_elem(&b->topic->members, i);
+                int cmp_res =
+                    rd_kafkap_str_cmp(am->rkgm_member_id, bm->rkgm_member_id);
+                if (cmp_res != 0)
+                        return cmp_res;
+        }
+
+        if (rd_list_cnt(&a->topic->members) !=
+            rd_list_cnt(&b->topic->members)) {
+                return RD_CMP(rd_list_cnt(&a->topic->members),
+                              rd_list_cnt(&b->topic->members));
+        }
+
+        /* Based on number of partitions */
+        return RD_CMP(a->topic->metadata->partition_cnt,
+                      b->topic->metadata->partition_cnt);
+}
+
+
+/* Helper function to wrap a bsearch on the partition's racks. */
+static char *rd_kafka_topic_assignment_state_rack_search(
+    rd_kafka_topic_assignment_state_t *rktas,
+    int partition,
+    const char *rack) {
+        char **partition_racks = rktas->partition_racks[partition];
+        size_t cnt             = rktas->racks_cnt[partition];
+        void *res              = NULL;
+
+        if (!partition_racks)
+                return NULL;
+
+        res = bsearch(&rack, partition_racks, cnt, sizeof(char *), rd_strcmp3);
+        if (!res)
+                return NULL;
+
+        return *(char **)res;
+}
+
+/*
+ * Assigns a partition to a member, and updates fields in rktas for accounting.
+ * It's assumed that the partitions assigned to this member don't exceed the
+ * allowed number.
+ */
+static void rd_kafka_assign_partition(rd_kafka_group_member_t *member,
+                                      rd_kafka_topic_assignment_state_t *rktas,
+                                      int32_t partition) {
+        rd_kafka_member_assigned_partitions_pair_t *member_assignment =
+            rd_kafka_find_member_assigned_partitions_pair_by_member_id(
+                member->rkgm_member_id, rktas->member_to_assigned_partitions);
+        rd_assert(member_assignment);
+
+        /* We can't use &partition, since that's a copy on the stack. */
+        rd_list_add(member_assignment->assigned_partitions,
+                    (void *)&rktas->topic->metadata->partitions[partition].id);
+        rd_kafka_topic_partition_list_add_range(member->rkgm_assignment,
+                                                rktas->topic->metadata->topic,
+                                                partition, partition);
+
+        rd_assert(rktas->unassigned_partitions[partition]);
+        rktas->unassigned_partitions[partition] = rd_false;
+        rktas->unassigned_partitions_left--;
+
+        if (rd_list_cnt(member_assignment->assigned_partitions) >
+            rktas->num_partitions_per_consumer) {
+                rktas->remaining_consumers_with_extra_partition -= 1;
+        }
+}
+
+
+/* Implementation of may_assign for rd_kafka_assign_ranges. True if the
+ * consumer rack is empty, or if it exists within the partition racks. */
+static rd_bool_t rd_kafka_racks_match(rd_kafka_group_member_t *member,
+                                      rd_kafka_topic_assignment_state_t *rktas,
+                                      int32_t partition) {
+        rd_kafkap_str_t *consumer_rack = member->rkgm_rack_id;
+
+        if (!consumer_rack || RD_KAFKAP_STR_LEN(consumer_rack) == 0) {
+                return rd_true;
+        }
+
+        return rd_kafka_topic_assignment_state_rack_search(
+                   rktas, partition, consumer_rack->str) != NULL;
+}
+
+
+/* Implementation of may_assign for rd_kafka_assign_ranges. Always true, used
+ * to assign remaining partitions after rack-aware assignment is complete.
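+ *
+ * Both predicates are passed to rd_kafka_assign_ranges() below:
+ * rd_kafka_racks_match for the rack-aware pass, then rd_kafka_always to
+ * place any partitions left unassigned by it.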
*/ +static rd_bool_t rd_kafka_always(rd_kafka_group_member_t *member, + rd_kafka_topic_assignment_state_t *rktas, + int32_t partition) { + return rd_true; +} + +/* Assigns as many partitions as possible for a topic to subscribing members, + * such that no subscribing member exceeds their limit of allowed partitions, + * and may_assign(member, rktas, partition) is true for each member and + * partition. + */ +static void rd_kafka_assign_ranges( + rd_kafka_topic_assignment_state_t *rktas, + rd_bool_t (*may_assign)(rd_kafka_group_member_t *member, + rd_kafka_topic_assignment_state_t *rktas, + int32_t partition)) { + int i; + rd_kafka_group_member_t *member; + int32_t *partitions_to_assign = + rd_alloca(rktas->unassigned_partitions_left * sizeof(int32_t)); + + RD_LIST_FOREACH(member, &rktas->topic->members, i) { + int j; + rd_kafka_member_assigned_partitions_pair_t *member_assignment; + int maximum_assignable_to_consumer; + int partitions_to_assign_cnt; + + if (rktas->unassigned_partitions_left == 0) + break; + + member_assignment = + rd_kafka_find_member_assigned_partitions_pair_by_member_id( + member->rkgm_member_id, + rktas->member_to_assigned_partitions); + + maximum_assignable_to_consumer = + rktas->num_partitions_per_consumer + + (rktas->remaining_consumers_with_extra_partition > 0) - + rd_list_cnt(member_assignment->assigned_partitions); + + if (maximum_assignable_to_consumer <= 0) + continue; + + partitions_to_assign_cnt = 0; + for (j = 0; j < rktas->topic->metadata->partition_cnt; j++) { + if (!rktas->unassigned_partitions[j]) { + continue; + } + + if (maximum_assignable_to_consumer <= 0) + break; + + if (!may_assign(member, rktas, j)) + continue; + + partitions_to_assign[partitions_to_assign_cnt] = j; + partitions_to_assign_cnt++; + maximum_assignable_to_consumer--; + } + + for (j = 0; j < partitions_to_assign_cnt; j++) + rd_kafka_assign_partition(member, rktas, + partitions_to_assign[j]); + } +} + +/* + * Assigns partitions for co-partitioned topics in a rack-aware manner on a best + * effort basis. All partitions may not be assigned to consumers in case a rack + * aware assignment does not exist. + */ +static void rd_kafka_assign_co_partitioned( + rd_list_t * + rktas_bucket /* Contained Type: rd_kafka_topic_assignment_state_t* */) { + rd_kafka_topic_assignment_state_t *first_rktas = + rd_list_elem(rktas_bucket, 0); + rd_kafka_topic_assignment_state_t *rktas; + rd_kafka_group_member_t *member; + int i; + + /* Since a "bucket" is a group of topic_assignment_states with the same + * consumers and number of partitions, we can just fetch them from the + * first member of the bucket. */ + const int partition_cnt = first_rktas->topic->metadata->partition_cnt; + const rd_list_t *consumers = &first_rktas->topic->members; + + for (i = 0; i < partition_cnt; i++) { + /* + * To assign the ith partition of all the co partitioned topics, + * we need to find a consumerX that fulfils the criteria: + * for all topic_assignment_states in the bucket: + * 1. rack(consumerX) is contained inside racks(partition i) + * 2. partitions assigned to consumerX does not exceed limits. + */ + int j; + RD_LIST_FOREACH(member, consumers, j) { + int m; + RD_LIST_FOREACH(rktas, rktas_bucket, m) { + int maximum_assignable; + rd_kafka_member_assigned_partitions_pair_t + *member_assignment; + + /* Check (1.) */ + if (!member->rkgm_rack_id || + RD_KAFKAP_STR_LEN(member->rkgm_rack_id) == + 0 || + rd_kafka_topic_assignment_state_rack_search( + rktas, i, member->rkgm_rack_id->str) == + NULL) { + break; + } + + /* Check (2.) 
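+                                 * (the candidate consumer must still
+                                 * have spare capacity for this topic
+                                 * of the bucket)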
*/ + member_assignment = + rd_kafka_find_member_assigned_partitions_pair_by_member_id( + member->rkgm_member_id, + rktas->member_to_assigned_partitions); + maximum_assignable = + rktas->num_partitions_per_consumer + + (rktas + ->remaining_consumers_with_extra_partition > + 0) - + rd_list_cnt( + member_assignment->assigned_partitions); + + if (maximum_assignable <= 0) { + break; + } + } + if (m == rd_list_cnt(rktas_bucket)) { + /* Break early - this consumer can be assigned + * this partition. */ + break; + } + } + if (j == rd_list_cnt(&first_rktas->topic->members)) { + continue; /* We didn't find a suitable consumer. */ + } + + rd_assert(member); + + RD_LIST_FOREACH(rktas, rktas_bucket, j) { + rd_kafka_assign_partition(member, rktas, i); + } + + /* FIXME: A possible optimization: early break here if no + * consumer remains with maximum_assignable_to_consumer > 0 + * across all topics. */ + } +} + + rd_kafka_resp_err_t rd_kafka_range_assignor_assign_cb(rd_kafka_t *rk, const rd_kafka_assignor_t *rkas, @@ -64,67 +504,1236 @@ rd_kafka_range_assignor_assign_cb(rd_kafka_t *rk, void *opaque) { unsigned int ti; int i; + rd_list_t *rktas_list = rd_list_new( + eligible_topic_cnt, rd_kafka_topic_assignment_state_destroy); + rd_list_t *rktas_buckets = rd_list_new(0, rd_list_destroy_free); + rd_list_t + *rktas_current_bucket; /* Contained Type: + rd_kafka_topic_assignment_state_t* */ + rd_kafka_topic_assignment_state_t *rktas; + rd_kafka_topic_assignment_state_t *prev_rktas; + const rd_kafka_metadata_internal_t *mdi = + rd_kafka_metadata_get_internal(metadata); /* The range assignor works on a per-topic basis. */ for (ti = 0; ti < eligible_topic_cnt; ti++) { rd_kafka_assignor_topic_t *eligible_topic = eligible_topics[ti]; - int numPartitionsPerConsumer; - int consumersWithExtraPartition; - /* For each topic, we lay out the available partitions in - * numeric order and the consumers in lexicographic order. */ + /* For each topic, we sort the consumers in lexicographic order, + * and create a topic_assignment_state. */ rd_list_sort(&eligible_topic->members, rd_kafka_group_member_cmp); + rd_list_add(rktas_list, rd_kafka_topic_assignment_state_new( + eligible_topic, mdi)); + } - /* We then divide the number of partitions by the total number - * of consumers to determine the number of partitions to assign - * to each consumer. */ - numPartitionsPerConsumer = - eligible_topic->metadata->partition_cnt / - rd_list_cnt(&eligible_topic->members); + /* Sort the topic_assignment_states to group the topics which need to be + * co-partitioned. */ + rd_list_sort(rktas_list, rd_kafka_topic_assignment_state_cmp); - /* If it does not evenly divide, then the first few consumers - * will have one extra partition. */ - consumersWithExtraPartition = - eligible_topic->metadata->partition_cnt % - rd_list_cnt(&eligible_topic->members); + /* Use the sorted list of topic_assignment_states and separate them into + * "buckets". Each bucket contains topics which can be co-partitioned, + * ie with the same consumers and number of partitions. 
*/
+        prev_rktas           = NULL;
+        rktas_current_bucket = NULL;
+        RD_LIST_FOREACH(rktas, rktas_list, i) {
+                if (prev_rktas && rd_kafka_topic_assignment_state_cmp(
+                                      rktas, prev_rktas) == 0) {
+                        rd_list_add(rktas_current_bucket, rktas);
+                        continue;
+                }
-
+                /* The free function is set to NULL, as we don't copy any of the
+                 * topic_assignment_states. */
+                rktas_current_bucket = rd_list_new(0, NULL);
+                rd_list_add(rktas_buckets, rktas_current_bucket);
+                prev_rktas = rktas;
+                rd_list_add(rktas_current_bucket, rktas);
+        }
 
-        for (i = 0; i < rd_list_cnt(&eligible_topic->members); i++) {
-                rd_kafka_group_member_t *rkgm =
-                    rd_list_elem(&eligible_topic->members, i);
-                int start = numPartitionsPerConsumer * i +
-                            RD_MIN(i, consumersWithExtraPartition);
-                int length =
-                    numPartitionsPerConsumer +
-                    (i + 1 > consumersWithExtraPartition ? 0 : 1);
+        /* Iterate through each bucket. In case there's more than one element
+         * in the bucket, we prefer co-partitioning over rack awareness.
+         * Otherwise, assign with rack-awareness. */
+        rktas                = NULL;
+        rktas_current_bucket = NULL;
+        RD_LIST_FOREACH(rktas_current_bucket, rktas_buckets, i) {
+                rd_assert(rd_list_cnt(rktas_current_bucket) > 0);
 
-                if (length == 0)
+                if (rd_list_cnt(rktas_current_bucket) == 1) {
+                        rktas = rd_list_elem(rktas_current_bucket, 0);
+                        if (!rktas->needs_rack_aware_assignment)
                                 continue;
+
                         rd_kafka_dbg(rk, CGRP, "ASSIGN",
-                                     "range: Member \"%s\": "
-                                     "assigned topic %s partitions %d..%d",
-                                     rkgm->rkgm_member_id->str,
-                                     eligible_topic->metadata->topic, start,
-                                     start + length - 1);
-                rd_kafka_topic_partition_list_add_range(
-                    rkgm->rkgm_assignment,
-                    eligible_topic->metadata->topic, start,
-                    start + length - 1);
+                                     "range: Topic %s with %d partition(s) and "
+                                     "%d subscribing member(s), single-topic "
+                                     "rack-aware assignment",
+                                     rktas->topic->metadata->topic,
+                                     rktas->topic->metadata->partition_cnt,
+                                     rd_list_cnt(&rktas->topic->members));
+
+                        rd_kafka_assign_ranges(rktas, rd_kafka_racks_match);
+                } else {
+                        rktas = rd_list_elem(rktas_current_bucket, 0);
+                        rd_kafka_dbg(
+                            rk, CGRP, "ASSIGN",
+                            "range: %d topics with %d partition(s) and "
+                            "%d subscribing member(s), co-partitioned "
+                            "rack-aware assignment",
+                            rd_list_cnt(rktas_current_bucket),
+                            rktas->topic->metadata->partition_cnt,
+                            rd_list_cnt(&rktas->topic->members));
+
+                        rd_kafka_assign_co_partitioned(rktas_current_bucket);
+                }
         }
 
+        /* Iterate through each rktas, doing normal assignment for any
+         * partitions that might not have gotten a rack-aware assignment. */
+        RD_LIST_FOREACH(rktas, rktas_list, i) {
+                rd_kafka_dbg(rk, CGRP, "ASSIGN",
+                             "range: Topic %s with %d partition(s) and "
+                             "%d subscribing member(s), single-topic "
+                             "non-rack-aware assignment for %" PRIusz
+                             " leftover partitions",
+                             rktas->topic->metadata->topic,
+                             rktas->topic->metadata->partition_cnt,
+                             rd_list_cnt(&rktas->topic->members),
+                             rktas->unassigned_partitions_left);
+                rd_kafka_assign_ranges(rktas, rd_kafka_always);
+        }
+
+        rd_list_destroy(rktas_list);
+        rd_list_destroy(rktas_buckets);
+
         return 0;
 }
 
+/**
+ * @name Range assignor unit tests
+ *
+ *
+ * These are based on RangeAssignorTest.java
+ *
+ *
+ *
+ */
+
+
+/* All possible racks used in tests, as well as several common rack configs
+ * used by consumers */
+static rd_kafkap_str_t
+    *ALL_RACKS[7]; /* initialized before starting the unit
tests. */ +static int RACKS_INITIAL[] = {0, 1, 2}; +static int RACKS_NULL[] = {6, 6, 6}; +static int RACKS_FINAL[] = {4, 5, 6}; +static int RACKS_ONE_NULL[] = {6, 4, 5}; + +static int +ut_testOneConsumerNoTopic(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[1]; + + + if (parametrization == RD_KAFKA_RANGE_ASSIGNOR_UT_NO_BROKER_RACK) { + RD_UT_PASS(); + } + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 0); + + ut_initMemberConditionalRack(&members[0], "consumer1", ALL_RACKS[0], + parametrization, "t1", NULL); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + RD_ARRAYSIZE(members), errstr, + sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyAssignment(&members[0], NULL); + + rd_kafka_group_member_clear(&members[0]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int ut_testOneConsumerNonexistentTopic( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[1]; + + + if (parametrization == RD_KAFKA_RANGE_ASSIGNOR_UT_NO_BROKER_RACK) { + RD_UT_PASS(); + } + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "t1", 0); + + ut_initMemberConditionalRack(&members[0], "consumer1", ALL_RACKS[0], + parametrization, "t1", NULL); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + RD_ARRAYSIZE(members), errstr, + sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyAssignment(&members[0], NULL); + + rd_kafka_group_member_clear(&members[0]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + + +static int +ut_testOneConsumerOneTopic(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[1]; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "t1", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", ALL_RACKS[0], + parametrization, "t1", NULL); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + RD_ARRAYSIZE(members), errstr, + sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + RD_UT_ASSERT(members[0].rkgm_assignment->cnt == 3, + "expected assignment of 3 partitions, got %d partition(s)", + members[0].rkgm_assignment->cnt); + + verifyAssignment(&members[0], "t1", 0, "t1", 1, "t1", 2, NULL); + + rd_kafka_group_member_clear(&members[0]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + + +static int ut_testOnlyAssignsPartitionsFromSubscribedTopics( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[1]; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "t1", 3, "t2", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", ALL_RACKS[0], + parametrization, "t1", NULL); + + err = rd_kafka_assignor_run(rk->rk_cgrp, 
rkas, metadata, members, + RD_ARRAYSIZE(members), errstr, + sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyAssignment(&members[0], "t1", 0, "t1", 1, "t1", 2, NULL); + + rd_kafka_group_member_clear(&members[0]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int ut_testOneConsumerMultipleTopics( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[1]; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "t1", 1, "t2", 2); + + ut_initMemberConditionalRack(&members[0], "consumer1", ALL_RACKS[0], + parametrization, "t1", "t2", NULL); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + RD_ARRAYSIZE(members), errstr, + sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyAssignment(&members[0], "t1", 0, "t2", 0, "t2", 1, NULL); + + rd_kafka_group_member_clear(&members[0]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int ut_testTwoConsumersOneTopicOnePartition( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[2]; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "t1", 1); + + ut_initMemberConditionalRack(&members[0], "consumer1", ALL_RACKS[0], + parametrization, "t1", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", ALL_RACKS[1], + parametrization, "t1", NULL); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + RD_ARRAYSIZE(members), errstr, + sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyAssignment(&members[0], "t1", 0, NULL); + verifyAssignment(&members[1], NULL); + + rd_kafka_group_member_clear(&members[0]); + rd_kafka_group_member_clear(&members[1]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int ut_testTwoConsumersOneTopicTwoPartitions( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[2]; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "t1", 2); + + ut_initMemberConditionalRack(&members[0], "consumer1", ALL_RACKS[0], + parametrization, "t1", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", ALL_RACKS[1], + parametrization, "t1", NULL); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + RD_ARRAYSIZE(members), errstr, + sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyAssignment(&members[0], "t1", 0, NULL); + verifyAssignment(&members[1], "t1", 1, NULL); + + rd_kafka_group_member_clear(&members[0]); + rd_kafka_group_member_clear(&members[1]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int ut_testMultipleConsumersMixedTopicSubscriptions( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[3]; + + 
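+        /* Mixed subscriptions: consumer1 and consumer3 subscribe only to t1,
+         * while consumer2 subscribes to both t1 and t2 (see the
+         * ut_initMemberConditionalRack() calls below). */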
ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "t1", 3, "t2", 2); + + ut_initMemberConditionalRack(&members[0], "consumer1", ALL_RACKS[0], + parametrization, "t1", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", ALL_RACKS[1], + parametrization, "t1", "t2", NULL); + ut_initMemberConditionalRack(&members[2], "consumer3", ALL_RACKS[2], + parametrization, "t1", NULL); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + RD_ARRAYSIZE(members), errstr, + sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyAssignment(&members[0], "t1", 0, NULL); + verifyAssignment(&members[1], "t1", 1, "t2", 0, "t2", 1, NULL); + verifyAssignment(&members[2], "t1", 2, NULL); + + rd_kafka_group_member_clear(&members[0]); + rd_kafka_group_member_clear(&members[1]); + rd_kafka_group_member_clear(&members[2]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int ut_testTwoConsumersTwoTopicsSixPartitions( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[2]; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "t1", 3, "t2", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", ALL_RACKS[0], + parametrization, "t1", "t2", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", ALL_RACKS[1], + parametrization, "t1", "t2", NULL); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + RD_ARRAYSIZE(members), errstr, + sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyAssignment(&members[0], "t1", 0, "t1", 1, "t2", 0, "t2", 1, NULL); + verifyAssignment(&members[1], "t1", 2, "t2", 2, NULL); + + rd_kafka_group_member_clear(&members[0]); + rd_kafka_group_member_clear(&members[1]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + + +/* Helper for setting up metadata and members, and running the assignor. Does + * not check the results of the assignment. */ +static int setupRackAwareAssignment0(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_group_member_t *members, + size_t member_cnt, + int replication_factor, + int num_broker_racks, + size_t topic_cnt, + char *topics[], + int *partitions, + int *subscriptions_count, + char **subscriptions[], + int *consumer_racks, + rd_kafka_metadata_t **metadata) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata_local = NULL; + if (!metadata) + metadata = &metadata_local; + + size_t i = 0; + const int num_brokers = num_broker_racks > 0 + ? replication_factor * num_broker_racks + : replication_factor; + + /* The member naming for tests is consumerN where N is a single + * character. 
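+         * (member_cnt is asserted to be at most 9 just below, i.e. N is a
+         * single digit, so each generated id fits the 10-byte member_id
+         * buffer passed to snprintf().)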
*/ + rd_assert(member_cnt <= 9); + + *metadata = rd_kafka_metadata_new_topic_with_partition_replicas_mock( + replication_factor, num_brokers, topics, partitions, topic_cnt); + ut_populate_internal_broker_metadata( + rd_kafka_metadata_get_internal(*metadata), num_broker_racks, + ALL_RACKS, RD_ARRAYSIZE(ALL_RACKS)); + ut_populate_internal_topic_metadata( + rd_kafka_metadata_get_internal(*metadata)); + + for (i = 0; i < member_cnt; i++) { + char member_id[10]; + snprintf(member_id, 10, "consumer%d", (int)(i + 1)); + ut_init_member_with_rack( + &members[i], member_id, ALL_RACKS[consumer_racks[i]], + subscriptions[i], subscriptions_count[i]); + } + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, *metadata, members, + member_cnt, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + if (metadata_local) + ut_destroy_metadata(metadata_local); + return 0; +} + +static int setupRackAwareAssignment(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_group_member_t *members, + size_t member_cnt, + int replication_factor, + int num_broker_racks, + size_t topic_cnt, + char *topics[], + int *partitions, + int *subscriptions_count, + char **subscriptions[], + int *consumer_racks) { + return setupRackAwareAssignment0( + rk, rkas, members, member_cnt, replication_factor, num_broker_racks, + topic_cnt, topics, partitions, subscriptions_count, subscriptions, + consumer_racks, NULL); +} + +/* Helper for testing cases where rack-aware assignment should not be triggered, + * and assignment should be the same as the pre-rack-aware assignor. */ +#define verifyNonRackAwareAssignment(rk, rkas, members, member_cnt, topic_cnt, \ + topics, partitions, subscriptions_count, \ + subscriptions, ...) \ + do { \ + size_t idx = 0; \ + rd_kafka_metadata_t *metadata = NULL; \ + \ + /* num_broker_racks = 0, implies that brokers have no \ + * configured racks. */ \ + setupRackAwareAssignment(rk, rkas, members, member_cnt, 3, 0, \ + topic_cnt, topics, partitions, \ + subscriptions_count, subscriptions, \ + RACKS_INITIAL); \ + verifyMultipleAssignment(members, member_cnt, __VA_ARGS__); \ + for (idx = 0; idx < member_cnt; idx++) \ + rd_kafka_group_member_clear(&members[idx]); \ + /* consumer_racks = RACKS_NULL implies that consumers have no \ + * racks. */ \ + setupRackAwareAssignment(rk, rkas, members, member_cnt, 3, 3, \ + topic_cnt, topics, partitions, \ + subscriptions_count, subscriptions, \ + RACKS_NULL); \ + verifyMultipleAssignment(members, member_cnt, __VA_ARGS__); \ + for (idx = 0; idx < member_cnt; idx++) \ + rd_kafka_group_member_clear(&members[idx]); \ + /* replication_factor = 3 and num_broker_racks = 3 means that \ + * all partitions are replicated on all racks.*/ \ + setupRackAwareAssignment0(rk, rkas, members, member_cnt, 3, 3, \ + topic_cnt, topics, partitions, \ + subscriptions_count, subscriptions, \ + RACKS_INITIAL, &metadata); \ + verifyMultipleAssignment(members, member_cnt, __VA_ARGS__); \ + verifyNumPartitionsWithRackMismatch(metadata, members, \ + RD_ARRAYSIZE(members), 0); \ + \ + for (idx = 0; idx < member_cnt; idx++) \ + rd_kafka_group_member_clear(&members[idx]); \ + ut_destroy_metadata(metadata); \ + /* replication_factor = 4 and num_broker_racks = 4 means that \ + * all partitions are replicated on all racks. 
*/                                                                     \
+                setupRackAwareAssignment0(rk, rkas, members, member_cnt, 4, 4, \
+                                          topic_cnt, topics, partitions,      \
+                                          subscriptions_count, subscriptions, \
+                                          RACKS_INITIAL, &metadata);          \
+                verifyMultipleAssignment(members, member_cnt, __VA_ARGS__);   \
+                verifyNumPartitionsWithRackMismatch(metadata, members,        \
+                                                    RD_ARRAYSIZE(members), 0); \
+                                                                              \
+                for (idx = 0; idx < member_cnt; idx++)                        \
+                        rd_kafka_group_member_clear(&members[idx]);           \
+                ut_destroy_metadata(metadata);                                \
+                /* There's no overlap between broker racks and consumer racks, \
+                 * since num_broker_racks = 3, they'll be picked from a,b,c   \
+                 * and consumer racks are d,e,f. */                           \
+                setupRackAwareAssignment(rk, rkas, members, member_cnt, 3, 3, \
+                                         topic_cnt, topics, partitions,       \
+                                         subscriptions_count, subscriptions,  \
+                                         RACKS_FINAL);                        \
+                verifyMultipleAssignment(members, member_cnt, __VA_ARGS__);   \
+                for (idx = 0; idx < member_cnt; idx++)                        \
+                        rd_kafka_group_member_clear(&members[idx]);           \
+                /* There's no overlap between broker racks and consumer racks, \
+                 * since num_broker_racks = 3, they'll be picked from a,b,c   \
+                 * and consumer racks are d,e,NULL. */                        \
+                setupRackAwareAssignment(rk, rkas, members, member_cnt, 3, 3, \
+                                         topic_cnt, topics, partitions,       \
+                                         subscriptions_count, subscriptions,  \
+                                         RACKS_ONE_NULL);                     \
+                verifyMultipleAssignment(members, member_cnt, __VA_ARGS__);   \
+                for (idx = 0; idx < member_cnt; idx++)                        \
+                        rd_kafka_group_member_clear(&members[idx]);           \
+        } while (0)
+
+static int ut_testRackAwareAssignmentWithUniformSubscription(
+    rd_kafka_t *rk,
+    const rd_kafka_assignor_t *rkas,
+    rd_kafka_assignor_ut_rack_config_t parametrization) {
+        char *topics[]   = {"t1", "t2", "t3"};
+        int partitions[] = {6, 7, 2};
+        rd_kafka_metadata_t *metadata;
+        rd_kafka_group_member_t members[3];
+        size_t i                  = 0;
+        int subscriptions_count[] = {3, 3, 3};
+        char **subscriptions[]    = {topics, topics, topics};
+
+        if (parametrization !=
+            RD_KAFKA_RANGE_ASSIGNOR_UT_BROKER_AND_CONSUMER_RACK) {
+                RD_UT_PASS();
+        }
+
+        verifyNonRackAwareAssignment(
+            rk, rkas, members, RD_ARRAYSIZE(members), RD_ARRAYSIZE(topics),
+            topics, partitions, subscriptions_count, subscriptions,
+            /* consumer1*/
+            "t1", 0, "t1", 1, "t2", 0, "t2", 1, "t2", 2, "t3", 0, NULL,
+            /* consumer2 */
+            "t1", 2, "t1", 3, "t2", 3, "t2", 4, "t3", 1, NULL,
+            /* consumer3 */
+            "t1", 4, "t1", 5, "t2", 5, "t2", 6, NULL);
+
+        /* Verify best-effort rack-aware assignment for lower replication
+         * factor where racks have a subset of partitions. */
+        setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 1,
+                                  3, RD_ARRAYSIZE(topics), topics, partitions,
+                                  subscriptions_count, subscriptions,
+                                  RACKS_INITIAL, &metadata);
+        verifyMultipleAssignment(
+            members, RD_ARRAYSIZE(members),
+            /* consumer1 */
+            "t1", 0, "t1", 3, "t2", 0, "t2", 3, "t2", 6, NULL,
+            /* consumer2 */
+            "t1", 1, "t1", 4, "t2", 1, "t2", 4, "t3", 0, NULL,
+            /* consumer3 */
+            "t1", 2, "t1", 5, "t2", 2, "t2", 5, "t3", 1, NULL);
+        verifyNumPartitionsWithRackMismatch(metadata, members,
+                                            RD_ARRAYSIZE(members), 0);
+
+        for (i = 0; i < RD_ARRAYSIZE(members); i++)
+                rd_kafka_group_member_clear(&members[i]);
+        ut_destroy_metadata(metadata);
+
+        setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 2,
+                                  3, RD_ARRAYSIZE(topics), topics, partitions,
+                                  subscriptions_count, subscriptions,
+                                  RACKS_INITIAL, &metadata);
+        verifyMultipleAssignment(
+            members, RD_ARRAYSIZE(members),
+            /*consumer1*/
+            "t1", 0, "t1", 2, "t2", 0, "t2", 2, "t2", 3, "t3", 1, NULL,
+            /* consumer2 */
+            "t1", 1, "t1", 3, "t2", 1, "t2", 4, "t3", 0, NULL,
+            /*
consumer 3*/ + "t1", 4, "t1", 5, "t2", 5, "t2", 6, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 1); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + + /* One consumer on a rack with no partitions. */ + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 3, + 2, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, + RACKS_INITIAL, &metadata); + verifyMultipleAssignment(members, RD_ARRAYSIZE(members), + /* consumer1 */ "t1", 0, "t1", 1, "t2", 0, + "t2", 1, "t2", 2, "t3", 0, NULL, + /* consumer2 */ + "t1", 2, "t1", 3, "t2", 3, "t2", 4, "t3", 1, + NULL, + /* consumer3 */ + "t1", 4, "t1", 5, "t2", 5, "t2", 6, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 4); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int ut_testRackAwareAssignmentWithNonEqualSubscription( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_metadata_t *metadata; + char *topics[] = {"t1", "t2", "t3"}; + int partitions[] = {6, 7, 2}; + rd_kafka_group_member_t members[3]; + size_t i = 0; + int subscriptions_count[] = {3, 3, 2}; + char *subscription13[] = {"t1", "t3"}; + char **subscriptions[] = {topics, topics, subscription13}; + + if (parametrization != + RD_KAFKA_RANGE_ASSIGNOR_UT_BROKER_AND_CONSUMER_RACK) { + RD_UT_PASS(); + } + + verifyNonRackAwareAssignment( + rk, rkas, members, RD_ARRAYSIZE(members), RD_ARRAYSIZE(topics), + topics, partitions, subscriptions_count, subscriptions, + /* consumer1*/ + "t1", 0, "t1", 1, "t2", 0, "t2", 1, "t2", 2, "t2", 3, "t3", 0, NULL, + /* consumer2 */ + "t1", 2, "t1", 3, "t2", 4, "t2", 5, "t2", 6, "t3", 1, NULL, + /* consumer3 */ + "t1", 4, "t1", 5, NULL); + + /* Verify best-effort rack-aware assignment for lower replication factor + * where racks have a subset of partitions. 
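+ * With replication_factor = 1 each partition lives on exactly one + * rack, so a fully rack-aligned assignment is not always possible and + * a small number of mismatches is accepted below.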
*/ + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 1, + 3, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, + RACKS_INITIAL, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 3, "t2", 0, "t2", 2, "t2", 3, "t2", 6, NULL, + /* consumer2 */ + "t1", 1, "t1", 4, "t2", 1, "t2", 4, "t2", 5, "t3", 0, NULL, + /* consumer3 */ + "t1", 2, "t1", 5, "t3", 1, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 2); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 2, + 3, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, + RACKS_INITIAL, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 2, "t2", 0, "t2", 2, "t2", 3, "t2", 5, "t3", 1, NULL, + /* consumer2 */ + "t1", 1, "t1", 3, "t2", 1, "t2", 4, "t2", 6, "t3", 0, NULL, + /* consumer3 */ + "t1", 4, "t1", 5, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 0); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + /* One consumer on a rack with no partitions */ + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 3, + 2, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, + RACKS_INITIAL, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 1, "t2", 0, "t2", 1, "t2", 2, "t2", 3, "t3", 0, NULL, + /* consumer2 */ + "t1", 2, "t1", 3, "t2", 4, "t2", 5, "t2", 6, "t3", 1, NULL, + /* consumer3 */ + "t1", 4, "t1", 5, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 2); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int ut_testRackAwareAssignmentWithUniformPartitions( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + char *topics[] = {"t1", "t2", "t3"}; + int partitions[] = {5, 5, 5}; + int partitions_mismatch[] = {10, 5, 3}; + rd_kafka_group_member_t members[3]; + size_t i = 0; + int replication_factor = 0; + int subscriptions_count[] = {3, 3, 3}; + char **subscriptions[] = {topics, topics, topics}; + + if (parametrization != + RD_KAFKA_RANGE_ASSIGNOR_UT_BROKER_AND_CONSUMER_RACK) { + RD_UT_PASS(); + } + + /* Verify combinations where rack-aware logic is not used. */ + verifyNonRackAwareAssignment( + rk, rkas, members, RD_ARRAYSIZE(members), RD_ARRAYSIZE(topics), + topics, partitions, subscriptions_count, subscriptions, + /* consumer1*/ + "t1", 0, "t1", 1, "t2", 0, "t2", 1, "t3", 0, "t3", 1, NULL, + /* consumer2 */ + "t1", 2, "t1", 3, "t2", 2, "t2", 3, "t3", 2, "t3", 3, NULL, + /* consumer3 */ + "t1", 4, "t2", 4, "t3", 4, NULL); + + /* Verify that co-partitioning is prioritized over rack-alignment for + * topics with equal subscriptions */ + for (replication_factor = 1; replication_factor <= 3; + replication_factor++) { + rd_kafka_metadata_t *metadata = NULL; + setupRackAwareAssignment0( + rk, rkas, members, RD_ARRAYSIZE(members), + replication_factor, replication_factor < 3 ? 
3 : 2, + RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, RACKS_INITIAL, + &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1*/ + "t1", 0, "t1", 1, "t2", 0, "t2", 1, "t3", 0, "t3", 1, NULL, + /* consumer2 */ + "t1", 2, "t1", 3, "t2", 2, "t2", 3, "t3", 2, "t3", 3, NULL, + /* consumer3 */ + "t1", 4, "t2", 4, "t3", 4, NULL); + verifyNumPartitionsWithRackMismatch( + metadata, members, RD_ARRAYSIZE(members), + partitions_mismatch[replication_factor - 1]); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + } + + RD_UT_PASS(); +} + +static int ut_testRackAwareAssignmentWithUniformPartitionsNonEqualSubscription( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_metadata_t *metadata = NULL; + char *topics[] = {"t1", "t2", "t3"}; + int partitions[] = {5, 5, 5}; + rd_kafka_group_member_t members[3]; + size_t i = 0; + int subscriptions_count[] = {3, 3, 2}; + char *subscription13[] = {"t1", "t3"}; + char **subscriptions[] = {topics, topics, subscription13}; + + if (parametrization != + RD_KAFKA_RANGE_ASSIGNOR_UT_BROKER_AND_CONSUMER_RACK) { + RD_UT_PASS(); + } + + /* Verify combinations where rack-aware logic is not used. */ + verifyNonRackAwareAssignment( + rk, rkas, members, RD_ARRAYSIZE(members), RD_ARRAYSIZE(topics), + topics, partitions, subscriptions_count, subscriptions, + /* consumer1*/ + "t1", 0, "t1", 1, "t2", 0, "t2", 1, "t2", 2, "t3", 0, "t3", 1, NULL, + /* consumer2 */ + "t1", 2, "t1", 3, "t2", 3, "t2", 4, "t3", 2, "t3", 3, NULL, + /* consumer3 */ + "t1", 4, "t3", 4, NULL); + + /* Verify that co-partitioning is prioritized over rack-alignment for + * topics with equal subscriptions */ + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 1, + 3, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, + RACKS_INITIAL, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 1, "t2", 0, "t2", 1, "t2", 4, "t3", 0, "t3", 1, NULL, + /* consumer2 */ + "t1", 2, "t1", 3, "t2", 2, "t2", 3, "t3", 2, "t3", 3, NULL, + /* consumer3 */ + "t1", 4, "t3", 4, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 9); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 2, + 3, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, + RACKS_INITIAL, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 2, "t2", 0, "t2", 1, "t2", 3, "t3", 2, NULL, + /* consumer2 */ + "t1", 0, "t1", 3, "t2", 2, "t2", 4, "t3", 0, "t3", 3, NULL, + /* consumer3 */ + "t1", 1, "t1", 4, "t3", 1, "t3", 4, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 0); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + /* One consumer on a rack with no partitions */ + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 3, + 2, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, + RACKS_INITIAL, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 1, "t2", 0, "t2", 1, "t2", 2, "t3", 0, 
"t3", 1, NULL, + /* consumer2 */ + "t1", 2, "t1", 3, "t2", 3, "t2", 4, "t3", 2, "t3", 3, NULL, + /* consumer3 */ + "t1", 4, "t3", 4, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 2); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int ut_testRackAwareAssignmentWithCoPartitioning0( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_metadata_t *metadata = NULL; + char *topics[] = {"t1", "t2", "t3", "t4"}; + int partitions[] = {6, 6, 2, 2}; + rd_kafka_group_member_t members[4]; + size_t i = 0; + int subscriptions_count[] = {2, 2, 2, 2}; + char *subscription12[] = {"t1", "t2"}; + char *subscription34[] = {"t3", "t4"}; + char **subscriptions[] = {subscription12, subscription12, + subscription34, subscription34}; + int racks[] = {0, 1, 1, 0}; + + if (parametrization != + RD_KAFKA_RANGE_ASSIGNOR_UT_BROKER_AND_CONSUMER_RACK) { + RD_UT_PASS(); + } + + setupRackAwareAssignment(rk, rkas, members, RD_ARRAYSIZE(members), 3, 2, + RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, racks); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 1, "t1", 2, "t2", 0, "t2", 1, "t2", 2, NULL, + /* consumer2 */ + "t1", 3, "t1", 4, "t1", 5, "t2", 3, "t2", 4, "t2", 5, NULL, + /* consumer3 */ + "t3", 0, "t4", 0, NULL, + /* consumer4 */ + "t3", 1, "t4", 1, NULL); + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 2, + 2, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, racks, + &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 1, "t1", 2, "t2", 0, "t2", 1, "t2", 2, NULL, + /* consumer2 */ + "t1", 3, "t1", 4, "t1", 5, "t2", 3, "t2", 4, "t2", 5, NULL, + /* consumer3 */ + "t3", 0, "t4", 0, NULL, + /* consumer4 */ + "t3", 1, "t4", 1, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 0); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 1, + 2, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, racks, + &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 2, "t1", 4, "t2", 0, "t2", 2, "t2", 4, NULL, + /* consumer2 */ + "t1", 1, "t1", 3, "t1", 5, "t2", 1, "t2", 3, "t2", 5, NULL, + /* consumer3 */ + "t3", 1, "t4", 1, NULL, + /* consumer4 */ + "t3", 0, "t4", 0, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 0); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int ut_testRackAwareAssignmentWithCoPartitioning1( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_metadata_t *metadata = NULL; + char *topics[] = {"t1", "t2", "t3", "t4"}; + int partitions[] = {6, 6, 2, 2}; + rd_kafka_group_member_t members[4]; + size_t i = 0; + int subscriptions_count[] = {4, 4, 4, 4}; + char **subscriptions[] = {topics, topics, topics, topics}; + int racks[] = {0, 1, 1, 0}; + + if 
(parametrization != + RD_KAFKA_RANGE_ASSIGNOR_UT_BROKER_AND_CONSUMER_RACK) { + RD_UT_PASS(); + } + + setupRackAwareAssignment(rk, rkas, members, RD_ARRAYSIZE(members), 3, 2, + RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, racks); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 1, "t2", 0, "t2", 1, "t3", 0, "t4", 0, NULL, + /* consumer2 */ + "t1", 2, "t1", 3, "t2", 2, "t2", 3, "t3", 1, "t4", 1, NULL, + /* consumer3 */ + "t1", 4, "t2", 4, NULL, + /* consumer4 */ + "t1", 5, "t2", 5, NULL); + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 2, + 2, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, racks, + &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 1, "t2", 0, "t2", 1, "t3", 0, "t4", 0, NULL, + /* consumer2 */ + "t1", 2, "t1", 3, "t2", 2, "t2", 3, "t3", 1, "t4", 1, NULL, + /* consumer3 */ + "t1", 4, "t2", 4, NULL, + /* consumer4 */ + "t1", 5, "t2", 5, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 0); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 1, + 2, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, racks, + &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 2, "t2", 0, "t2", 2, "t3", 0, "t4", 0, NULL, + /* consumer2 */ + "t1", 1, "t1", 3, "t2", 1, "t2", 3, "t3", 1, "t4", 1, NULL, + /* consumer3 */ + "t1", 5, "t2", 5, NULL, + /* consumer4 */ + "t1", 4, "t2", 4, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 0); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 1, + 3, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, racks, + &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 3, "t2", 0, "t2", 3, "t3", 0, "t4", 0, NULL, + /* consumer2 */ + "t1", 1, "t1", 4, "t2", 1, "t2", 4, "t3", 1, "t4", 1, NULL, + /* consumer3 */ + "t1", 2, "t2", 2, NULL, + /* consumer4 */ + "t1", 5, "t2", 5, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 6); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int ut_testCoPartitionedAssignmentWithSameSubscription( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_metadata_t *metadata = NULL; + char *topics[] = {"t1", "t2", "t3", "t4", "t5", "t6"}; + int partitions[] = {6, 6, 2, 2, 4, 4}; + rd_kafka_group_member_t members[3]; + size_t i = 0; + int subscriptions_count[] = {6, 6, 6}; + char **subscriptions[] = {topics, topics, topics}; + + if (parametrization != + RD_KAFKA_RANGE_ASSIGNOR_UT_BROKER_AND_CONSUMER_RACK) { + RD_UT_PASS(); + } + + setupRackAwareAssignment(rk, rkas, members, RD_ARRAYSIZE(members), 3, 0, + RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, + RACKS_INITIAL); + 
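+ /* num_broker_racks = 0: brokers expose no rack information, so the + * expected assignment below is the plain co-partitioned range + * assignment without rack-aware adjustments. */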
verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 1, "t2", 0, "t2", 1, "t3", 0, "t4", 0, "t5", 0, "t5", + 1, "t6", 0, "t6", 1, NULL, + /* consumer2 */ + "t1", 2, "t1", 3, "t2", 2, "t2", 3, "t3", 1, "t4", 1, "t5", 2, "t6", + 2, NULL, + /* consumer3 */ + "t1", 4, "t1", 5, "t2", 4, "t2", 5, "t5", 3, "t6", 3, NULL); + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 3, + 3, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, + RACKS_INITIAL, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 1, "t2", 0, "t2", 1, "t3", 0, "t4", 0, "t5", 0, "t5", + 1, "t6", 0, "t6", 1, NULL, + /* consumer2 */ + "t1", 2, "t1", 3, "t2", 2, "t2", 3, "t3", 1, "t4", 1, "t5", 2, "t6", + 2, NULL, + /* consumer3 */ + "t1", 4, "t1", 5, "t2", 4, "t2", 5, "t5", 3, "t6", 3, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 0); + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + + +static int rd_kafka_range_assignor_unittest(void) { + rd_kafka_conf_t *conf; + rd_kafka_t *rk; + int fails = 0; + char errstr[256]; + rd_kafka_assignor_t *rkas; + size_t i; + + conf = rd_kafka_conf_new(); + if (rd_kafka_conf_set(conf, "group.id", "test", errstr, + sizeof(errstr)) || + rd_kafka_conf_set(conf, "partition.assignment.strategy", "range", + errstr, sizeof(errstr))) + RD_UT_FAIL("range assignor conf failed: %s", errstr); + + rd_kafka_conf_set(conf, "debug", rd_getenv("TEST_DEBUG", NULL), NULL, + 0); + + rk = rd_kafka_new(RD_KAFKA_CONSUMER, conf, errstr, sizeof(errstr)); + RD_UT_ASSERT(rk, "range assignor client instantiation failed: %s", + errstr); + rkas = rd_kafka_assignor_find(rk, "range"); + RD_UT_ASSERT(rkas, "range assignor not found"); + + for (i = 0; i < RD_ARRAY_SIZE(ALL_RACKS) - 1; i++) { + char c = 'a' + i; + ALL_RACKS[i] = rd_kafkap_str_new(&c, 1); + } + ALL_RACKS[i] = NULL; + + static int (*tests[])( + rd_kafka_t *, const rd_kafka_assignor_t *, + rd_kafka_assignor_ut_rack_config_t parametrization) = { + ut_testOneConsumerNoTopic, + ut_testOneConsumerNonexistentTopic, + ut_testOneConsumerOneTopic, + ut_testOnlyAssignsPartitionsFromSubscribedTopics, + ut_testOneConsumerMultipleTopics, + ut_testTwoConsumersOneTopicOnePartition, + ut_testTwoConsumersOneTopicTwoPartitions, + ut_testMultipleConsumersMixedTopicSubscriptions, + ut_testTwoConsumersTwoTopicsSixPartitions, + ut_testRackAwareAssignmentWithUniformSubscription, + ut_testRackAwareAssignmentWithNonEqualSubscription, + ut_testRackAwareAssignmentWithUniformPartitions, + ut_testRackAwareAssignmentWithUniformPartitionsNonEqualSubscription, + ut_testRackAwareAssignmentWithCoPartitioning0, + ut_testRackAwareAssignmentWithCoPartitioning1, + ut_testCoPartitionedAssignmentWithSameSubscription, + NULL, + }; + + for (i = 0; tests[i]; i++) { + rd_ts_t ts = rd_clock(); + int r = 0; + rd_kafka_assignor_ut_rack_config_t j; + + for (j = RD_KAFKA_RANGE_ASSIGNOR_UT_NO_BROKER_RACK; + j != RD_KAFKA_RANGE_ASSIGNOR_UT_CONFIG_CNT; j++) { + RD_UT_SAY("[ Test #%" PRIusz ", RackConfig = %d ]", i, + j); + r += tests[i](rk, rkas, j); + } + RD_UT_SAY("[ Test #%" PRIusz " ran for %.3fms ]", i, + (double)(rd_clock() - ts) / 1000.0); + + RD_UT_ASSERT(!r, "^ failed"); + + fails += r; + } + + for (i = 0; i < RD_ARRAY_SIZE(ALL_RACKS) - 1; 
i++) { + rd_kafkap_str_destroy(ALL_RACKS[i]); + } + + rd_kafka_destroy(rk); + + return fails; +} + + /** * @brief Initialize and add range assignor. @@ -133,6 +1742,7 @@ rd_kafka_resp_err_t rd_kafka_range_assignor_init(rd_kafka_t *rk) { return rd_kafka_assignor_add( rk, "consumer", "range", RD_KAFKA_REBALANCE_PROTOCOL_EAGER, rd_kafka_range_assignor_assign_cb, - rd_kafka_assignor_get_metadata_with_empty_userdata, NULL, NULL, - NULL, NULL); + rd_kafka_assignor_get_metadata_with_empty_userdata, + NULL /* on_assignment_cb */, NULL /* destroy_state_cb */, + rd_kafka_range_assignor_unittest, NULL); } diff --git a/src/third_party/librdkafka/dist/src/rdkafka_request.c b/src/third_party/librdkafka/dist/src/rdkafka_request.c index c86a5e27cc0..f553a2427e9 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_request.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_request.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,6 +36,7 @@ #include "rdkafka_topic.h" #include "rdkafka_partition.h" #include "rdkafka_metadata.h" +#include "rdkafka_telemetry.h" #include "rdkafka_msgset.h" #include "rdkafka_idempotence.h" #include "rdkafka_txnmgr.h" @@ -121,6 +123,7 @@ int rd_kafka_err_action(rd_kafka_broker_t *rkb, case RD_KAFKA_RESP_ERR_COORDINATOR_NOT_AVAILABLE: case RD_KAFKA_RESP_ERR_NOT_COORDINATOR: case RD_KAFKA_RESP_ERR__WAIT_COORD: + case RD_KAFKA_RESP_ERR__DESTROY_BROKER: /* Request metadata information update */ actions |= RD_KAFKA_ERR_ACTION_REFRESH | RD_KAFKA_ERR_ACTION_MSG_NOT_PERSISTED; @@ -134,6 +137,7 @@ break; case RD_KAFKA_RESP_ERR__TRANSPORT: + case RD_KAFKA_RESP_ERR__SSL: case RD_KAFKA_RESP_ERR__TIMED_OUT: case RD_KAFKA_RESP_ERR_REQUEST_TIMED_OUT: case RD_KAFKA_RESP_ERR_NOT_ENOUGH_REPLICAS_AFTER_APPEND: @@ -142,6 +146,7 @@ break; case RD_KAFKA_RESP_ERR_NOT_ENOUGH_REPLICAS: + case RD_KAFKA_RESP_ERR_INVALID_MSG: /* Client-side wait-response/in-queue timeout */ case RD_KAFKA_RESP_ERR__TIMED_OUT_QUEUE: actions |= RD_KAFKA_ERR_ACTION_RETRY | @@ -196,63 +201,158 @@ int rd_kafka_err_action(rd_kafka_broker_t *rkb, return actions; } - /** * @brief Read a list of topic+partitions+extra from \p rkbuf. * - * @param rkbuf buffer to read from - * @param estimated_part_cnt estimated number of partitions to read. - * @param read_part_errs whether or not to read an error per partition. + * @param rkbuf Buffer to read from + * @param fields An array of fields to read from the buffer and set on + * the rktpar object, in the specified order, must end + * with RD_KAFKA_TOPIC_PARTITION_FIELD_END. * - * @returns a newly allocated list on success, or NULL on parse error. + * @returns A newly allocated list on success, or NULL on parse error. */ -rd_kafka_topic_partition_list_t * -rd_kafka_buf_read_topic_partitions(rd_kafka_buf_t *rkbuf, - size_t estimated_part_cnt, - rd_bool_t read_offset, - rd_bool_t read_part_errs) { +rd_kafka_topic_partition_list_t *rd_kafka_buf_read_topic_partitions( + rd_kafka_buf_t *rkbuf, + rd_bool_t use_topic_id, + rd_bool_t use_topic_name, + size_t estimated_part_cnt, + const rd_kafka_topic_partition_field_t *fields) { + rd_bool_t parse_err; + /* Even if NULL it should be treated as a parse error, + * as this field isn't nullable. 
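+ * The parse_err flag from the nullable variant is intentionally + * discarded here: for callers of this non-nullable wrapper a NULL + * return always signals a parse error.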
*/ + return rd_kafka_buf_read_topic_partitions_nullable( + rkbuf, use_topic_id, use_topic_name, estimated_part_cnt, fields, + &parse_err); +} + +/** + * @brief Read a nullable list of topic+partitions+extra from \p rkbuf. + * + * @param rkbuf Buffer to read from + * @param fields An array of fields to read from the buffer and set on + * the rktpar object, in the specified order, must end + * with RD_KAFKA_TOPIC_PARTITION_FIELD_END. + * @param parse_err Is set to true if a parsing error occurred. + * + * @returns A newly allocated list, or NULL. + */ +rd_kafka_topic_partition_list_t *rd_kafka_buf_read_topic_partitions_nullable( + rd_kafka_buf_t *rkbuf, + rd_bool_t use_topic_id, + rd_bool_t use_topic_name, + size_t estimated_part_cnt, + const rd_kafka_topic_partition_field_t *fields, + rd_bool_t *parse_err) { const int log_decode_errors = LOG_ERR; - int16_t ErrorCode = 0; int32_t TopicArrayCnt; rd_kafka_topic_partition_list_t *parts = NULL; + rd_dassert(parse_err); + *parse_err = rd_false; + rd_kafka_buf_read_arraycnt(rkbuf, &TopicArrayCnt, RD_KAFKAP_TOPICS_MAX); + if (TopicArrayCnt < -1) + goto err_parse; + else if (TopicArrayCnt == -1) + return NULL; parts = rd_kafka_topic_partition_list_new( - RD_MAX(TopicArrayCnt, (int)estimated_part_cnt)); + RD_MAX(TopicArrayCnt * 4, (int)estimated_part_cnt)); while (TopicArrayCnt-- > 0) { rd_kafkap_str_t kTopic; int32_t PartArrayCnt; - char *topic; + char *topic = NULL; + rd_kafka_Uuid_t topic_id; + + if (use_topic_id) { + rd_kafka_buf_read_uuid(rkbuf, &topic_id); + } + if (use_topic_name) { + rd_kafka_buf_read_str(rkbuf, &kTopic); + RD_KAFKAP_STR_DUPA(&topic, &kTopic); + } - rd_kafka_buf_read_str(rkbuf, &kTopic); rd_kafka_buf_read_arraycnt(rkbuf, &PartArrayCnt, RD_KAFKAP_PARTITIONS_MAX); - RD_KAFKAP_STR_DUPA(&topic, &kTopic); while (PartArrayCnt-- > 0) { - int32_t Partition; - int64_t Offset; + int32_t Partition = -1, Epoch = -1234, + CurrentLeaderEpoch = -1234; + int64_t Offset = -1234; + int16_t ErrorCode = 0; rd_kafka_topic_partition_t *rktpar; + int fi; - rd_kafka_buf_read_i32(rkbuf, &Partition); + /* + * Read requested fields + */ + for (fi = 0; + fields[fi] != RD_KAFKA_TOPIC_PARTITION_FIELD_END; + fi++) { + switch (fields[fi]) { + case RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION: + rd_kafka_buf_read_i32(rkbuf, + &Partition); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_OFFSET: + rd_kafka_buf_read_i64(rkbuf, &Offset); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_CURRENT_EPOCH: + rd_kafka_buf_read_i32( + rkbuf, &CurrentLeaderEpoch); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_EPOCH: + rd_kafka_buf_read_i32(rkbuf, &Epoch); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_ERR: + rd_kafka_buf_read_i16(rkbuf, + &ErrorCode); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_METADATA: + rd_assert(!*"metadata not implemented"); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_TIMESTAMP: + rd_assert( + !*"timestamp not implemented"); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_NOOP: + /* Fallback */ + case RD_KAFKA_TOPIC_PARTITION_FIELD_END: + break; + } + } - rktpar = rd_kafka_topic_partition_list_add(parts, topic, - Partition); + if (use_topic_id) { + rktpar = + rd_kafka_topic_partition_list_add_with_topic_id( + parts, topic_id, Partition); + if (use_topic_name) + rktpar->topic = rd_strdup(topic); + } else if (use_topic_name) { + rktpar = rd_kafka_topic_partition_list_add( + parts, topic, Partition); + } else { + rd_assert(!*"one of use_topic_id and " + "use_topic_name should be true"); + } - if (read_offset) { - rd_kafka_buf_read_i64(rkbuf, &Offset); + 
/* Use dummy sentinel values that are unlikely to be + * seen from the broker to know if we are to set these + * fields or not. */ + if (Offset != -1234) rktpar->offset = Offset; - } + if (Epoch != -1234) + rd_kafka_topic_partition_set_leader_epoch( + rktpar, Epoch); + if (CurrentLeaderEpoch != -1234) + rd_kafka_topic_partition_set_current_leader_epoch( + rktpar, CurrentLeaderEpoch); + rktpar->err = ErrorCode; - if (read_part_errs) { - rd_kafka_buf_read_i16(rkbuf, &ErrorCode); - rktpar->err = ErrorCode; - } - - rd_kafka_buf_skip_tags(rkbuf); + if (fi > 1) + rd_kafka_buf_skip_tags(rkbuf); } rd_kafka_buf_skip_tags(rkbuf); @@ -264,6 +364,7 @@ err_parse: if (parts) rd_kafka_topic_partition_list_destroy(parts); + *parse_err = rd_true; return NULL; } @@ -273,24 +374,23 @@ err_parse: * * @returns the number of partitions written to buffer. * - * @remark The \p parts list MUST be sorted. + * @remark The \p parts list MUST be sorted by name if use_topic_id is false or + * by id. */ int rd_kafka_buf_write_topic_partitions( rd_kafka_buf_t *rkbuf, const rd_kafka_topic_partition_list_t *parts, rd_bool_t skip_invalid_offsets, rd_bool_t only_invalid_offsets, - rd_bool_t write_Offset, - rd_bool_t write_Epoch, - rd_bool_t write_Metadata) { + rd_bool_t use_topic_id, + rd_bool_t use_topic_name, + const rd_kafka_topic_partition_field_t *fields) { size_t of_TopicArrayCnt; size_t of_PartArrayCnt = 0; int TopicArrayCnt = 0, PartArrayCnt = 0; int i; - const char *prev_topic = NULL; - int cnt = 0; - rd_bool_t partition_id_only = - !write_Offset && !write_Epoch && !write_Metadata; + const rd_kafka_topic_partition_t *prev_topic = NULL; + int cnt = 0; rd_assert(!only_invalid_offsets || (only_invalid_offsets != skip_invalid_offsets)); @@ -300,6 +400,8 @@ int rd_kafka_buf_write_topic_partitions( for (i = 0; i < parts->cnt; i++) { const rd_kafka_topic_partition_t *rktpar = &parts->elems[i]; + rd_bool_t different_topics; + int fi; if (rktpar->offset < 0) { if (skip_invalid_offsets) @@ -307,20 +409,41 @@ int rd_kafka_buf_write_topic_partitions( } else if (only_invalid_offsets) continue; - if (!prev_topic || strcmp(rktpar->topic, prev_topic)) { + if (use_topic_id) { + different_topics = + !prev_topic || + rd_kafka_Uuid_cmp( + rd_kafka_topic_partition_get_topic_id(rktpar), + rd_kafka_topic_partition_get_topic_id( + prev_topic)); + } else { + different_topics = + !prev_topic || + strcmp(rktpar->topic, prev_topic->topic); + } + if (different_topics) { /* Finish previous topic, if any. 
*/ if (of_PartArrayCnt > 0) { rd_kafka_buf_finalize_arraycnt( rkbuf, of_PartArrayCnt, PartArrayCnt); /* Tags for previous topic struct */ - rd_kafka_buf_write_tags(rkbuf); + rd_kafka_buf_write_tags_empty(rkbuf); } /* Topic */ - rd_kafka_buf_write_str(rkbuf, rktpar->topic, -1); + if (use_topic_name) + rd_kafka_buf_write_str(rkbuf, rktpar->topic, + -1); + if (use_topic_id) { + rd_kafka_Uuid_t topic_id = + rd_kafka_topic_partition_get_topic_id( + rktpar); + rd_kafka_buf_write_uuid(rkbuf, &topic_id); + } + TopicArrayCnt++; - prev_topic = rktpar->topic; + prev_topic = rktpar; /* New topic so reset partition count */ PartArrayCnt = 0; @@ -329,36 +452,67 @@ int rd_kafka_buf_write_topic_partitions( rd_kafka_buf_write_arraycnt_pos(rkbuf); } - /* Partition */ - rd_kafka_buf_write_i32(rkbuf, rktpar->partition); + + /* + * Write requested fields + */ + for (fi = 0; fields[fi] != RD_KAFKA_TOPIC_PARTITION_FIELD_END; + fi++) { + switch (fields[fi]) { + case RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION: + rd_kafka_buf_write_i32(rkbuf, + rktpar->partition); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_OFFSET: + rd_kafka_buf_write_i64(rkbuf, rktpar->offset); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_CURRENT_EPOCH: + rd_kafka_buf_write_i32( + rkbuf, + rd_kafka_topic_partition_get_current_leader_epoch( + rktpar)); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_EPOCH: + rd_kafka_buf_write_i32( + rkbuf, + rd_kafka_topic_partition_get_leader_epoch( + rktpar)); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_ERR: + rd_kafka_buf_write_i16(rkbuf, rktpar->err); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_TIMESTAMP: + /* Current implementation is just + * sending a NULL value */ + rd_kafka_buf_write_i64(rkbuf, -1); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_METADATA: + /* Java client 0.9.0 and broker <0.10.0 can't + * parse Null metadata fields, so as a + * workaround we send an empty string if + * it's Null. */ + if (!rktpar->metadata) + rd_kafka_buf_write_str(rkbuf, "", 0); + else + rd_kafka_buf_write_str( + rkbuf, rktpar->metadata, + rktpar->metadata_size); + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_NOOP: + break; + case RD_KAFKA_TOPIC_PARTITION_FIELD_END: + break; + } + } + + + if (fi > 1) + /* If there was more than one field written + * then this was a struct and thus needs the + * struct suffix tags written. */ + rd_kafka_buf_write_tags_empty(rkbuf); + PartArrayCnt++; - - /* Time/Offset */ - if (write_Offset) { - rd_kafka_buf_write_i64(rkbuf, rktpar->offset); - } - - if (write_Epoch) { - /* CommittedLeaderEpoch */ - rd_kafka_buf_write_i32(rkbuf, -1); - } - - if (write_Metadata) { - /* Metadata */ - /* Java client 0.9.0 and broker <0.10.0 can't parse - * Null metadata fields, so as a workaround we send an - * empty string if it's Null. */ - if (!rktpar->metadata) - rd_kafka_buf_write_str(rkbuf, "", 0); - else - rd_kafka_buf_write_str(rkbuf, rktpar->metadata, - rktpar->metadata_size); - } - - /* Tags for partition struct */ - if (!partition_id_only) - rd_kafka_buf_write_tags(rkbuf); - cnt++; } @@ -366,7 +520,7 @@ int rd_kafka_buf_write_topic_partitions( rd_kafka_buf_finalize_arraycnt(rkbuf, of_PartArrayCnt, PartArrayCnt); /* Tags for topic struct */ - rd_kafka_buf_write_tags(rkbuf); + rd_kafka_buf_write_tags_empty(rkbuf); } rd_kafka_buf_finalize_arraycnt(rkbuf, of_TopicArrayCnt, TopicArrayCnt); @@ -375,6 +529,61 @@ int rd_kafka_buf_write_topic_partitions( } +/** + * @brief Read current leader from \p rkbuf. 
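+ *        On the wire this is LeaderId (int32) followed by LeaderEpoch + *        (int32) and the struct's tagged fields.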
+ * + * @param rkbuf buffer to read from + * @param CurrentLeader is the CurrentLeader to populate. + * + * @return 1 on success, else -1 on parse error. + */ +int rd_kafka_buf_read_CurrentLeader(rd_kafka_buf_t *rkbuf, + rd_kafkap_CurrentLeader_t *CurrentLeader) { + const int log_decode_errors = LOG_ERR; + rd_kafka_buf_read_i32(rkbuf, &CurrentLeader->LeaderId); + rd_kafka_buf_read_i32(rkbuf, &CurrentLeader->LeaderEpoch); + rd_kafka_buf_skip_tags(rkbuf); + return 1; +err_parse: + return -1; +} + +/** + * @brief Read NodeEndpoints from \p rkbuf. + * + * @param rkbuf buffer to read from + * @param NodeEndpoints is the NodeEndpoints to populate. + * + * @return 1 on success, else -1 on parse error. + */ +int rd_kafka_buf_read_NodeEndpoints(rd_kafka_buf_t *rkbuf, + rd_kafkap_NodeEndpoints_t *NodeEndpoints) { + const int log_decode_errors = LOG_ERR; + int32_t i; + rd_kafka_buf_read_arraycnt(rkbuf, &NodeEndpoints->NodeEndpointCnt, + RD_KAFKAP_BROKERS_MAX); + rd_dassert(!NodeEndpoints->NodeEndpoints); + NodeEndpoints->NodeEndpoints = + rd_calloc(NodeEndpoints->NodeEndpointCnt, + sizeof(*NodeEndpoints->NodeEndpoints)); + + for (i = 0; i < NodeEndpoints->NodeEndpointCnt; i++) { + rd_kafka_buf_read_i32(rkbuf, + &NodeEndpoints->NodeEndpoints[i].NodeId); + rd_kafka_buf_read_str(rkbuf, + &NodeEndpoints->NodeEndpoints[i].Host); + rd_kafka_buf_read_i32(rkbuf, + &NodeEndpoints->NodeEndpoints[i].Port); + rd_kafka_buf_read_str(rkbuf, + &NodeEndpoints->NodeEndpoints[i].Rack); + rd_kafka_buf_skip_tags(rkbuf); + } + return 1; +err_parse: + return -1; +} + + /** * @brief Send FindCoordinatorRequest. * @@ -412,25 +621,95 @@ rd_kafka_FindCoordinatorRequest(rd_kafka_broker_t *rkb, return RD_KAFKA_RESP_ERR_NO_ERROR; } +/** + * @struct rd_kafka_ListOffsetRequest_parameters_s + * @brief parameters for the rd_kafka_make_ListOffsetsRequest function. + */ +typedef struct rd_kafka_ListOffsetRequest_parameters_s { + /** Partitions to request offsets for. */ + rd_kafka_topic_partition_list_t *rktpars; + /** Isolation level. */ + rd_kafka_IsolationLevel_t isolation_level; + /** Error string (optional). */ + char *errstr; + /** Error string size (optional). 
*/ + size_t errstr_size; +} rd_kafka_ListOffsetRequest_parameters_t; +static rd_kafka_ListOffsetRequest_parameters_t +rd_kafka_ListOffsetRequest_parameters_make( + rd_kafka_topic_partition_list_t *rktpars, + rd_kafka_IsolationLevel_t isolation_level, + char *errstr, + size_t errstr_size) { + rd_kafka_ListOffsetRequest_parameters_t params = RD_ZERO_INIT; + params.rktpars = rktpars; + params.isolation_level = isolation_level; + params.errstr = errstr; + params.errstr_size = errstr_size; + return params; +} + +static rd_kafka_ListOffsetRequest_parameters_t * +rd_kafka_ListOffsetRequest_parameters_new( + rd_kafka_topic_partition_list_t *rktpars, + rd_kafka_IsolationLevel_t isolation_level, + char *errstr, + size_t errstr_size) { + rd_kafka_ListOffsetRequest_parameters_t *params = + rd_calloc(1, sizeof(*params)); + *params = rd_kafka_ListOffsetRequest_parameters_make( + rktpars, isolation_level, errstr, errstr_size); + return params; +} + +static void rd_kafka_ListOffsetRequest_parameters_destroy_free(void *opaque) { + rd_kafka_ListOffsetRequest_parameters_t *parameters = opaque; + RD_IF_FREE(parameters->rktpars, rd_kafka_topic_partition_list_destroy); + RD_IF_FREE(parameters->errstr, rd_free); + rd_free(parameters); +} + +static rd_kafka_buf_t * +rd_kafka_ListOffsetRequest_buf_new(rd_kafka_broker_t *rkb, + rd_kafka_topic_partition_list_t *rktpars) { + return rd_kafka_buf_new_flexver_request( + rkb, RD_KAFKAP_ListOffsets, 1, + /* ReplicaId+IsolationLevel+TopicArrayCnt+Topic */ + 4 + 1 + 4 + 100 + + /* PartArrayCnt */ + 4 + + /* partition_cnt * Partition+Time+MaxNumOffs */ + (rktpars->cnt * (4 + 8 + 4)), + rd_false); +} + /** * @brief Parses a ListOffsets reply. * * Returns the parsed offsets (and errors) in \p offsets which must have been - * initialized by caller. + * initialized by caller. If \p result_infos is passed instead, + * it's populated with rd_kafka_ListOffsetsResultInfo_t instances. + * + * Either \p offsets or \p result_infos must be passed, + * and the one that is passed is populated. * * @returns 0 on success, else an error (\p offsets may be completely or * partially updated, depending on the nature of the error, and per * partition error codes should be checked by the caller). 
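+ * + * When \p result_infos is used, each list element is a + * rd_kafka_ListOffsetsResultInfo_t holding the parsed partition + * (offset, error and leader epoch) together with the broker-returned + * Timestamp.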
*/ -static rd_kafka_resp_err_t +rd_kafka_resp_err_t rd_kafka_parse_ListOffsets(rd_kafka_buf_t *rkbuf, - rd_kafka_topic_partition_list_t *offsets) { + rd_kafka_topic_partition_list_t *offsets, + rd_list_t *result_infos) { const int log_decode_errors = LOG_ERR; int32_t TopicArrayCnt; int16_t api_version; rd_kafka_resp_err_t all_err = RD_KAFKA_RESP_ERR_NO_ERROR; + rd_bool_t return_result_infos; + rd_assert((offsets != NULL) ^ (result_infos != NULL)); + return_result_infos = result_infos != NULL; api_version = rkbuf->rkbuf_reqhdr.ApiVersion; @@ -441,31 +720,37 @@ rd_kafka_parse_ListOffsets(rd_kafka_buf_t *rkbuf, * Broker may return offsets in a different constellation than * in the original request .*/ - rd_kafka_buf_read_i32(rkbuf, &TopicArrayCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &TopicArrayCnt, RD_KAFKAP_TOPICS_MAX); while (TopicArrayCnt-- > 0) { - rd_kafkap_str_t ktopic; + rd_kafkap_str_t Topic; int32_t PartArrayCnt; char *topic_name; - rd_kafka_buf_read_str(rkbuf, &ktopic); - rd_kafka_buf_read_i32(rkbuf, &PartArrayCnt); + rd_kafka_buf_read_str(rkbuf, &Topic); + rd_kafka_buf_read_arraycnt(rkbuf, &PartArrayCnt, + RD_KAFKAP_PARTITIONS_MAX); - RD_KAFKAP_STR_DUPA(&topic_name, &ktopic); + RD_KAFKAP_STR_DUPA(&topic_name, &Topic); while (PartArrayCnt-- > 0) { - int32_t kpartition; + int32_t Partition; int16_t ErrorCode; int32_t OffsetArrayCnt; - int64_t Offset = -1; + int64_t Offset = -1; + int32_t LeaderEpoch = -1; + int64_t Timestamp = -1; rd_kafka_topic_partition_t *rktpar; - rd_kafka_buf_read_i32(rkbuf, &kpartition); + rd_kafka_buf_read_i32(rkbuf, &Partition); rd_kafka_buf_read_i16(rkbuf, &ErrorCode); if (api_version >= 1) { - int64_t Timestamp; rd_kafka_buf_read_i64(rkbuf, &Timestamp); rd_kafka_buf_read_i64(rkbuf, &Offset); + if (api_version >= 4) + rd_kafka_buf_read_i32(rkbuf, + &LeaderEpoch); + rd_kafka_buf_skip_tags(rkbuf); } else if (api_version == 0) { rd_kafka_buf_read_i32(rkbuf, &OffsetArrayCnt); /* We only request one offset so just grab @@ -473,17 +758,35 @@ rd_kafka_parse_ListOffsets(rd_kafka_buf_t *rkbuf, while (OffsetArrayCnt-- > 0) rd_kafka_buf_read_i64(rkbuf, &Offset); } else { - rd_kafka_assert(NULL, !*"NOTREACHED"); + RD_NOTREACHED(); } - rktpar = rd_kafka_topic_partition_list_add( - offsets, topic_name, kpartition); - rktpar->err = ErrorCode; - rktpar->offset = Offset; + if (likely(!return_result_infos)) { + rktpar = rd_kafka_topic_partition_list_add( + offsets, topic_name, Partition); + rktpar->err = ErrorCode; + rktpar->offset = Offset; + rd_kafka_topic_partition_set_leader_epoch( + rktpar, LeaderEpoch); + } else { + rktpar = rd_kafka_topic_partition_new( + topic_name, Partition); + rktpar->err = ErrorCode; + rktpar->offset = Offset; + rd_kafka_topic_partition_set_leader_epoch( + rktpar, LeaderEpoch); + rd_kafka_ListOffsetsResultInfo_t *result_info = + rd_kafka_ListOffsetsResultInfo_new( + rktpar, Timestamp); + rd_list_add(result_infos, result_info); + rd_kafka_topic_partition_destroy(rktpar); + } if (ErrorCode && !all_err) all_err = ErrorCode; } + + rd_kafka_buf_skip_tags(rkbuf); } return all_err; @@ -492,7 +795,196 @@ err_parse: return rkbuf->rkbuf_err; } +/** + * @brief Async maker for ListOffsetsRequest. 
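+ * + * Invoked from the broker thread through the maker callback installed + * with rd_kafka_buf_set_maker(), once the broker's supported + * ApiVersion is known; \p make_opaque carries the + * rd_kafka_ListOffsetRequest_parameters_t.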
+ */ +static rd_kafka_resp_err_t +rd_kafka_make_ListOffsetsRequest(rd_kafka_broker_t *rkb, + rd_kafka_buf_t *rkbuf, + void *make_opaque) { + rd_kafka_ListOffsetRequest_parameters_t *parameters = make_opaque; + const rd_kafka_topic_partition_list_t *partitions = parameters->rktpars; + int isolation_level = parameters->isolation_level; + char *errstr = parameters->errstr; + size_t errstr_size = parameters->errstr_size; + int i; + size_t of_TopicArrayCnt = 0, of_PartArrayCnt = 0; + const char *last_topic = ""; + int32_t topic_cnt = 0, part_cnt = 0; + int16_t ApiVersion; + ApiVersion = rd_kafka_broker_ApiVersion_supported( + rkb, RD_KAFKAP_ListOffsets, 0, 7, NULL); + if (ApiVersion == -1) { + if (errstr) { + rd_snprintf( + errstr, errstr_size, + "ListOffsets (KIP-396) not supported " + "by broker, requires broker version >= 2.5.0"); + } + return RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE; + } + + if (ApiVersion >= 6) { + rd_kafka_buf_upgrade_flexver_request(rkbuf); + } + + /* ReplicaId */ + rd_kafka_buf_write_i32(rkbuf, -1); + + /* IsolationLevel */ + if (ApiVersion >= 2) + rd_kafka_buf_write_i8(rkbuf, isolation_level); + + /* TopicArrayCnt */ + of_TopicArrayCnt = + rd_kafka_buf_write_arraycnt_pos(rkbuf); /* updated later */ + + for (i = 0; i < partitions->cnt; i++) { + const rd_kafka_topic_partition_t *rktpar = + &partitions->elems[i]; + + if (strcmp(rktpar->topic, last_topic)) { + /* Finish last topic, if any. */ + if (of_PartArrayCnt > 0) { + rd_kafka_buf_finalize_arraycnt( + rkbuf, of_PartArrayCnt, part_cnt); + /* Topics tags */ + rd_kafka_buf_write_tags_empty(rkbuf); + } + + /* Topic */ + rd_kafka_buf_write_str(rkbuf, rktpar->topic, -1); + topic_cnt++; + last_topic = rktpar->topic; + /* New topic so reset partition count */ + part_cnt = 0; + + /* PartitionArrayCnt: updated later */ + of_PartArrayCnt = + rd_kafka_buf_write_arraycnt_pos(rkbuf); + } + + /* Partition */ + rd_kafka_buf_write_i32(rkbuf, rktpar->partition); + part_cnt++; + + if (ApiVersion >= 4) + /* CurrentLeaderEpoch */ + rd_kafka_buf_write_i32( + rkbuf, + rd_kafka_topic_partition_get_current_leader_epoch( + rktpar)); + + /* Time/Offset */ + rd_kafka_buf_write_i64(rkbuf, rktpar->offset); + + if (ApiVersion == 0) { + /* MaxNumberOfOffsets */ + rd_kafka_buf_write_i32(rkbuf, 1); + } + + /* Partitions tags */ + rd_kafka_buf_write_tags_empty(rkbuf); + } + + if (of_PartArrayCnt > 0) { + rd_kafka_buf_finalize_arraycnt(rkbuf, of_PartArrayCnt, + part_cnt); + /* Topics tags */ + rd_kafka_buf_write_tags_empty(rkbuf); + } + rd_kafka_buf_finalize_arraycnt(rkbuf, of_TopicArrayCnt, topic_cnt); + + rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); + + rd_rkb_dbg(rkb, TOPIC, "OFFSET", + "ListOffsetsRequest (v%hd, opv %d) " + "for %" PRId32 " topic(s) and %" PRId32 " partition(s)", + ApiVersion, rkbuf->rkbuf_replyq.version, topic_cnt, + partitions->cnt); + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} + +/** + * @brief Send ListOffsetsRequest for partitions in \p partitions. + * Set absolute timeout \p timeout_ms if >= 0. 
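+ * + * Example (illustrative sketch only; \c my_resp_cb stands in for a real + * rd_kafka_resp_cb_t): query the log-end offset of one partition with + * no absolute timeout (timeout_ms = -1). The list is copied internally, + * so the caller keeps ownership of \c parts: + * + *   rd_kafka_topic_partition_list_t *parts = + *       rd_kafka_topic_partition_list_new(1); + *   rd_kafka_topic_partition_list_add(parts, "mytopic", 0)->offset = + *       RD_KAFKA_OFFSET_END; + *   rd_kafka_ListOffsetsRequest(rkb, parts, replyq, my_resp_cb, -1, NULL); + *   rd_kafka_topic_partition_list_destroy(parts);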
+ */ +void rd_kafka_ListOffsetsRequest(rd_kafka_broker_t *rkb, + rd_kafka_topic_partition_list_t *partitions, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + int timeout_ms, + void *opaque) { + rd_kafka_buf_t *rkbuf; + rd_kafka_topic_partition_list_t *rktpars; + rd_kafka_ListOffsetRequest_parameters_t *params; + + rktpars = rd_kafka_topic_partition_list_copy(partitions); + rd_kafka_topic_partition_list_sort_by_topic(rktpars); + + params = rd_kafka_ListOffsetRequest_parameters_new( + rktpars, + (rd_kafka_IsolationLevel_t)rkb->rkb_rk->rk_conf.isolation_level, + NULL, 0); + + rkbuf = rd_kafka_ListOffsetRequest_buf_new(rkb, partitions); + + if (timeout_ms >= 0) + rd_kafka_buf_set_abs_timeout(rkbuf, timeout_ms, 0); + + /* Postpone creating the request contents until time to send, + * at which time the ApiVersion is known. */ + rd_kafka_buf_set_maker( + rkbuf, rd_kafka_make_ListOffsetsRequest, params, + rd_kafka_ListOffsetRequest_parameters_destroy_free); + + rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); +} + +/** + * @brief Send ListOffsetsRequest for offsets contained in the first + * element of \p offsets, that is a rd_kafka_topic_partition_list_t. + * AdminClient compatible request callback. + */ +rd_kafka_resp_err_t rd_kafka_ListOffsetsRequest_admin( + rd_kafka_broker_t *rkb, + const rd_list_t *offsets /* rd_kafka_topic_partition_list_t*/, + rd_kafka_AdminOptions_t *options, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { + rd_kafka_ListOffsetRequest_parameters_t params; + rd_kafka_IsolationLevel_t isolation_level; + rd_kafka_topic_partition_list_t *topic_partitions; + rd_kafka_buf_t *rkbuf; + rd_kafka_resp_err_t err; + topic_partitions = rd_list_elem(offsets, 0); + + isolation_level = RD_KAFKA_ISOLATION_LEVEL_READ_UNCOMMITTED; + if (options && options->isolation_level.u.INT.v) + isolation_level = options->isolation_level.u.INT.v; + + params = rd_kafka_ListOffsetRequest_parameters_make( + topic_partitions, isolation_level, errstr, errstr_size); + + rkbuf = rd_kafka_ListOffsetRequest_buf_new(rkb, topic_partitions); + + err = rd_kafka_make_ListOffsetsRequest(rkb, rkbuf, ¶ms); + + if (err) { + rd_kafka_buf_destroy(rkbuf); + rd_kafka_replyq_destroy(&replyq); + return err; + } + + rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} /** * @brief Parses and handles ListOffsets replies. @@ -516,8 +1008,9 @@ rd_kafka_handle_ListOffsets(rd_kafka_t *rk, int actions; - if (!err) - err = rd_kafka_parse_ListOffsets(rkbuf, offsets); + if (!err) { + err = rd_kafka_parse_ListOffsets(rkbuf, offsets, NULL); + } if (!err) return RD_KAFKA_RESP_ERR_NO_ERROR; @@ -541,6 +1034,9 @@ rd_kafka_handle_ListOffsets(rd_kafka_t *rk, RD_KAFKA_ERR_ACTION_REFRESH | RD_KAFKA_ERR_ACTION_RETRY, RD_KAFKA_RESP_ERR_FENCED_LEADER_EPOCH, + RD_KAFKA_ERR_ACTION_REFRESH | RD_KAFKA_ERR_ACTION_RETRY, + RD_KAFKA_RESP_ERR_UNKNOWN_LEADER_EPOCH, + RD_KAFKA_ERR_ACTION_RETRY, RD_KAFKA_RESP_ERR__TRANSPORT, RD_KAFKA_ERR_ACTION_RETRY, RD_KAFKA_RESP_ERR_REQUEST_TIMED_OUT, @@ -572,122 +1068,102 @@ rd_kafka_handle_ListOffsets(rd_kafka_t *rk, } +/** + * @brief OffsetForLeaderEpochResponse handler. 
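+ * + * The response is parsed with the generic topic+partition field reader + * (per partition: ErrorCode, Partition, LeaderEpoch on v1+, and the end + * offset), and the result is returned in \p offsets.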
+ */ +rd_kafka_resp_err_t rd_kafka_handle_OffsetForLeaderEpoch( + rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + rd_kafka_resp_err_t err, + rd_kafka_buf_t *rkbuf, + rd_kafka_buf_t *request, + rd_kafka_topic_partition_list_t **offsets) { + const int log_decode_errors = LOG_ERR; + int16_t ApiVersion; + + if (err) + goto err; + + ApiVersion = rkbuf->rkbuf_reqhdr.ApiVersion; + + if (ApiVersion >= 2) + rd_kafka_buf_read_throttle_time(rkbuf); + + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_ERR, + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + ApiVersion >= 1 ? RD_KAFKA_TOPIC_PARTITION_FIELD_EPOCH + : RD_KAFKA_TOPIC_PARTITION_FIELD_NOOP, + RD_KAFKA_TOPIC_PARTITION_FIELD_OFFSET, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; + *offsets = rd_kafka_buf_read_topic_partitions( + rkbuf, rd_false /*don't use topic_id*/, rd_true, 0, fields); + if (!*offsets) + goto err_parse; + + return RD_KAFKA_RESP_ERR_NO_ERROR; + +err: + return err; + +err_parse: + err = rkbuf->rkbuf_err; + goto err; +} + /** - * @brief Async maker for ListOffsetsRequest. + * @brief Send OffsetForLeaderEpochRequest for partition(s). + * */ -static rd_kafka_resp_err_t -rd_kafka_make_ListOffsetsRequest(rd_kafka_broker_t *rkb, - rd_kafka_buf_t *rkbuf, - void *make_opaque) { - const rd_kafka_topic_partition_list_t *partitions = - (const rd_kafka_topic_partition_list_t *)make_opaque; - int i; - size_t of_TopicArrayCnt = 0, of_PartArrayCnt = 0; - const char *last_topic = ""; - int32_t topic_cnt = 0, part_cnt = 0; +void rd_kafka_OffsetForLeaderEpochRequest( + rd_kafka_broker_t *rkb, + rd_kafka_topic_partition_list_t *parts, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { + rd_kafka_buf_t *rkbuf; int16_t ApiVersion; ApiVersion = rd_kafka_broker_ApiVersion_supported( - rkb, RD_KAFKAP_ListOffsets, 0, 2, NULL); + rkb, RD_KAFKAP_OffsetForLeaderEpoch, 2, 2, NULL); + /* If the supported ApiVersions are not yet known, + * or this broker doesn't support it, we let this request + * succeed or fail later from the broker thread where the + * version is checked again. */ if (ApiVersion == -1) - return RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE; + ApiVersion = 2; - /* ReplicaId */ - rd_kafka_buf_write_i32(rkbuf, -1); + rkbuf = rd_kafka_buf_new_flexver_request( + rkb, RD_KAFKAP_OffsetForLeaderEpoch, 1, 4 + (parts->cnt * 64), + ApiVersion >= 4 /*flexver*/); - /* IsolationLevel */ - if (ApiVersion >= 2) - rd_kafka_buf_write_i8(rkbuf, - rkb->rkb_rk->rk_conf.isolation_level); + /* Sort partitions by topic */ + rd_kafka_topic_partition_list_sort_by_topic(parts); - /* TopicArrayCnt */ - of_TopicArrayCnt = rd_kafka_buf_write_i32(rkbuf, 0); /* updated later */ - - for (i = 0; i < partitions->cnt; i++) { - const rd_kafka_topic_partition_t *rktpar = - &partitions->elems[i]; - - if (strcmp(rktpar->topic, last_topic)) { - /* Finish last topic, if any. 
*/ - if (of_PartArrayCnt > 0) - rd_kafka_buf_update_i32(rkbuf, of_PartArrayCnt, - part_cnt); - - /* Topic */ - rd_kafka_buf_write_str(rkbuf, rktpar->topic, -1); - topic_cnt++; - last_topic = rktpar->topic; - /* New topic so reset partition count */ - part_cnt = 0; - - /* PartitionArrayCnt: updated later */ - of_PartArrayCnt = rd_kafka_buf_write_i32(rkbuf, 0); - } - - /* Partition */ - rd_kafka_buf_write_i32(rkbuf, rktpar->partition); - part_cnt++; - - /* Time/Offset */ - rd_kafka_buf_write_i64(rkbuf, rktpar->offset); - - if (ApiVersion == 0) { - /* MaxNumberOfOffsets */ - rd_kafka_buf_write_i32(rkbuf, 1); - } - } - - if (of_PartArrayCnt > 0) { - rd_kafka_buf_update_i32(rkbuf, of_PartArrayCnt, part_cnt); - rd_kafka_buf_update_i32(rkbuf, of_TopicArrayCnt, topic_cnt); - } + /* Write partition list */ + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + /* CurrentLeaderEpoch */ + RD_KAFKA_TOPIC_PARTITION_FIELD_CURRENT_EPOCH, + /* LeaderEpoch */ + RD_KAFKA_TOPIC_PARTITION_FIELD_EPOCH, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; + rd_kafka_buf_write_topic_partitions( + rkbuf, parts, rd_false /*include invalid offsets*/, + rd_false /*skip valid offsets*/, rd_false /*don't use topic id*/, + rd_true /*use topic name*/, fields); rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); - rd_rkb_dbg(rkb, TOPIC, "OFFSET", - "ListOffsetsRequest (v%hd, opv %d) " - "for %" PRId32 " topic(s) and %" PRId32 " partition(s)", - ApiVersion, rkbuf->rkbuf_replyq.version, topic_cnt, - partitions->cnt); - - return RD_KAFKA_RESP_ERR_NO_ERROR; -} - - -/** - * @brief Send ListOffsetsRequest for partitions in \p partitions. - */ -void rd_kafka_ListOffsetsRequest(rd_kafka_broker_t *rkb, - rd_kafka_topic_partition_list_t *partitions, - rd_kafka_replyq_t replyq, - rd_kafka_resp_cb_t *resp_cb, - void *opaque) { - rd_kafka_buf_t *rkbuf; - rd_kafka_topic_partition_list_t *make_parts; - - make_parts = rd_kafka_topic_partition_list_copy(partitions); - rd_kafka_topic_partition_list_sort_by_topic(make_parts); - - rkbuf = rd_kafka_buf_new_request( - rkb, RD_KAFKAP_ListOffsets, 1, - /* ReplicaId+IsolationLevel+TopicArrayCnt+Topic */ - 4 + 1 + 4 + 100 + - /* PartArrayCnt */ - 4 + - /* partition_cnt * Partition+Time+MaxNumOffs */ - (make_parts->cnt * (4 + 8 + 4))); - - /* Postpone creating the request contents until time to send, - * at which time the ApiVersion is known. */ - rd_kafka_buf_set_maker(rkbuf, rd_kafka_make_ListOffsetsRequest, - make_parts, - rd_kafka_topic_partition_list_destroy_free); + /* Let caller perform retries */ + rkbuf->rkbuf_max_retries = RD_KAFKA_REQUEST_NO_RETRIES; rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); } + /** * Generic handler for OffsetFetch responses. 
* Offsets for included partitions will be propagated through the passed @@ -710,6 +1186,7 @@ rd_kafka_handle_OffsetFetch(rd_kafka_t *rk, rd_bool_t add_part, rd_bool_t allow_retry) { const int log_decode_errors = LOG_ERR; + int32_t GroupArrayCnt; int32_t TopicArrayCnt; int64_t offset = RD_KAFKA_OFFSET_INVALID; int16_t ApiVersion; @@ -727,6 +1204,13 @@ rd_kafka_handle_OffsetFetch(rd_kafka_t *rk, if (ApiVersion >= 3) rd_kafka_buf_read_throttle_time(rkbuf); + if (ApiVersion >= 8) { + rd_kafkap_str_t group_id; + // Currently we are supporting only 1 group + rd_kafka_buf_read_arraycnt(rkbuf, &GroupArrayCnt, 1); + rd_kafka_buf_read_str(rkbuf, &group_id); + } + if (!*offsets) *offsets = rd_kafka_topic_partition_list_new(16); @@ -738,12 +1222,17 @@ rd_kafka_handle_OffsetFetch(rd_kafka_t *rk, rd_kafka_buf_read_arraycnt(rkbuf, &TopicArrayCnt, RD_KAFKAP_TOPICS_MAX); for (i = 0; i < TopicArrayCnt; i++) { rd_kafkap_str_t topic; + rd_kafka_Uuid_t *topic_id = NULL; int32_t PartArrayCnt; char *topic_name; int j; rd_kafka_buf_read_str(rkbuf, &topic); - + // if(ApiVersion >= 9) { + // topic_id = rd_kafka_Uuid_new(); + // rd_kafka_buf_read_uuid(rkbuf, + // topic_id); + // } rd_kafka_buf_read_arraycnt(rkbuf, &PartArrayCnt, RD_KAFKAP_PARTITIONS_MAX); @@ -753,7 +1242,7 @@ rd_kafka_handle_OffsetFetch(rd_kafka_t *rk, int32_t partition; rd_kafka_toppar_t *rktp; rd_kafka_topic_partition_t *rktpar; - int32_t LeaderEpoch; + int32_t LeaderEpoch = -1; int16_t err2; rd_kafka_buf_read_i32(rkbuf, &partition); @@ -766,10 +1255,18 @@ rd_kafka_handle_OffsetFetch(rd_kafka_t *rk, rktpar = rd_kafka_topic_partition_list_find( *offsets, topic_name, partition); - if (!rktpar && add_part) - rktpar = rd_kafka_topic_partition_list_add( - *offsets, topic_name, partition); - else if (!rktpar) { + if (!rktpar && add_part) { + if (topic_id) { + rktpar = + rd_kafka_topic_partition_list_add_with_topic_id( + *offsets, *topic_id, partition); + } else { + rktpar = + rd_kafka_topic_partition_list_add( + *offsets, topic_name, + partition); + } + } else if (!rktpar) { rd_rkb_dbg(rkb, TOPIC, "OFFSETFETCH", "OffsetFetchResponse: %s [%" PRId32 "] " @@ -780,33 +1277,34 @@ rd_kafka_handle_OffsetFetch(rd_kafka_t *rk, seen_cnt++; - if (!(rktp = rktpar->_private)) { - rktp = rd_kafka_toppar_get2( - rkb->rkb_rk, topic_name, partition, 0, 0); - /* May be NULL if topic is not locally known */ - rktpar->_private = rktp; - } + rktp = rd_kafka_topic_partition_get_toppar( + rk, rktpar, rd_false /*no create on miss*/); /* broker reports invalid offset as -1 */ if (offset == -1) rktpar->offset = RD_KAFKA_OFFSET_INVALID; else rktpar->offset = offset; + + rd_kafka_topic_partition_set_leader_epoch(rktpar, + LeaderEpoch); rktpar->err = err2; rd_rkb_dbg(rkb, TOPIC, "OFFSETFETCH", "OffsetFetchResponse: %s [%" PRId32 "] " - "offset %" PRId64 + "offset %" PRId64 ", leader epoch %" PRId32 ", metadata %d byte(s): %s", - topic_name, partition, offset, + topic_name, partition, offset, LeaderEpoch, RD_KAFKAP_STR_LEN(&metadata), rd_kafka_err2name(rktpar->err)); if (update_toppar && !err2 && rktp) { /* Update toppar's committed offset */ rd_kafka_toppar_lock(rktp); - rktp->rktp_committed_offset = rktpar->offset; + rktp->rktp_committed_pos = + rd_kafka_topic_partition_get_fetch_pos( + rktpar); rd_kafka_toppar_unlock(rktp); } @@ -822,10 +1320,22 @@ rd_kafka_handle_OffsetFetch(rd_kafka_t *rk, rktpar->metadata = NULL; rktpar->metadata_size = 0; } else { - rktpar->metadata = RD_KAFKAP_STR_DUP(&metadata); - rktpar->metadata_size = - RD_KAFKAP_STR_LEN(&metadata); + /* It cannot use 
strndup because + * it stops at the first NUL (0) occurrence. */ + size_t len = RD_KAFKAP_STR_LEN(&metadata); + rktpar->metadata_size = len; + unsigned char *metadata_bytes = + rd_malloc(len + 1); + rktpar->metadata = metadata_bytes; + memcpy(rktpar->metadata, metadata.str, len); + metadata_bytes[len] = '\0'; } + + /* Lose ref from get_toppar() */ + if (rktp) + rd_kafka_toppar_destroy(rktp); + + RD_IF_FREE(topic_id, rd_kafka_Uuid_destroy); } rd_kafka_buf_skip_tags(rkbuf); @@ -922,8 +1432,7 @@ void rd_kafka_op_handle_OffsetFetch(rd_kafka_t *rk, err = rd_kafka_handle_OffsetFetch( rkb->rkb_rk, rkb, err, rkbuf, request, &offsets, rd_false /*dont update rktp*/, rd_false /*dont add part*/, - /* Allow retries if replyq - * is valid */ + /* Allow retries if replyq is valid */ rd_kafka_op_replyq_is_valid(rko)); if (err == RD_KAFKA_RESP_ERR__IN_PROGRESS) { if (offsets) @@ -952,6 +1461,9 @@ void rd_kafka_op_handle_OffsetFetch(rd_kafka_t *rk, * have usable offsets then no request is sent at all but an empty * reply is enqueued on the replyq. * + * FIXME: Even though the version is upgraded to v9, currently we support + * only a single group. + * * @param group_id Request offset for this group id. * @param parts (optional) List of topic partitions to request, * or NULL to return all topic partitions associated with the @@ -963,10 +1475,18 @@ void rd_kafka_OffsetFetchRequest(rd_kafka_broker_t *rkb, const char *group_id, rd_kafka_topic_partition_list_t *parts, + rd_bool_t use_topic_id, + int32_t generation_id_or_member_epoch, + rd_kafkap_str_t *member_id, rd_bool_t require_stable_offsets, int timeout, rd_kafka_replyq_t replyq, - rd_kafka_resp_cb_t *resp_cb, + void (*resp_cb)(rd_kafka_t *, + rd_kafka_broker_t *, + rd_kafka_resp_err_t, + rd_kafka_buf_t *, + rd_kafka_buf_t *, + void *), void *opaque) { rd_kafka_buf_t *rkbuf; int16_t ApiVersion; @@ -974,7 +1494,7 @@ void rd_kafka_OffsetFetchRequest(rd_kafka_broker_t *rkb, int PartCnt = -1; ApiVersion = rd_kafka_broker_ApiVersion_supported( - rkb, RD_KAFKAP_OffsetFetch, 0, 7, NULL); + rkb, RD_KAFKAP_OffsetFetch, 0, 9, NULL); if (parts) { parts_size = parts->cnt * 32; @@ -982,26 +1502,55 @@ rkbuf = rd_kafka_buf_new_flexver_request( rkb, RD_KAFKAP_OffsetFetch, 1, - /* GroupId + rd_kafka_buf_write_arraycnt_pos + - * Topics + RequireStable */ - 32 + 4 + parts_size + 1, ApiVersion >= 6 /*flexver*/); + /* GroupId + GenerationIdOrMemberEpoch + MemberId + + * rd_kafka_buf_write_arraycnt_pos + Topics + RequireStable */ + 32 + 4 + 50 + 4 + parts_size + 1, ApiVersion >= 6 /*flexver*/); + + if (ApiVersion >= 8) { + /* + * Groups array count. + * Currently, only supporting 1 group. + * TODO: Update to use multiple groups. 
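+ * (OffsetFetch v8, per KIP-709, wraps the group id in a Groups + * array; only a single-element array is written here.)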
+ */ + rd_kafka_buf_write_arraycnt(rkbuf, 1); + } /* ConsumerGroup */ rd_kafka_buf_write_str(rkbuf, group_id, -1); + if (ApiVersion >= 9) { + if (!member_id) { + rd_kafkap_str_t *null_member_id = + rd_kafkap_str_new(NULL, -1); + rd_kafka_buf_write_kstr(rkbuf, null_member_id); + rd_kafkap_str_destroy(null_member_id); + } else { + rd_kafka_buf_write_kstr(rkbuf, member_id); + } + rd_kafka_buf_write_i32(rkbuf, generation_id_or_member_epoch); + } + if (parts) { /* Sort partitions by topic */ rd_kafka_topic_partition_list_sort_by_topic(parts); + /* Write partition list, filtering out partitions with valid * offsets */ + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; PartCnt = rd_kafka_buf_write_topic_partitions( rkbuf, parts, rd_false /*include invalid offsets*/, rd_false /*skip valid offsets */, - rd_false /*don't write offsets*/, - rd_false /*don't write epoch */, - rd_false /*don't write metadata*/); + use_topic_id /* use_topic id */, rd_true /*use topic name*/, + fields); } else { - rd_kafka_buf_write_arraycnt_pos(rkbuf); + rd_kafka_buf_write_arraycnt(rkbuf, PartCnt); + } + + if (ApiVersion >= 8) { + // Tags for the groups array + rd_kafka_buf_write_tags_empty(rkbuf); } if (ApiVersion >= 7) { @@ -1140,12 +1689,16 @@ rd_kafka_handle_OffsetCommit(rd_kafka_t *rk, rd_kafka_buf_t *request, rd_kafka_topic_partition_list_t *offsets, rd_bool_t ignore_cgrp) { - const int log_decode_errors = LOG_ERR; - int32_t TopicArrayCnt; - int errcnt = 0; - int partcnt = 0; - int i; - int actions = 0; + const int log_decode_errors = LOG_ERR; + int errcnt = 0; + int partcnt = 0; + int actions = 0; + rd_kafka_topic_partition_list_t *partitions = NULL; + rd_kafka_topic_partition_t *partition = NULL; + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_ERR, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; if (err) goto err; @@ -1153,49 +1706,37 @@ rd_kafka_handle_OffsetCommit(rd_kafka_t *rk, if (rd_kafka_buf_ApiVersion(rkbuf) >= 3) rd_kafka_buf_read_throttle_time(rkbuf); - rd_kafka_buf_read_i32(rkbuf, &TopicArrayCnt); - for (i = 0; i < TopicArrayCnt; i++) { - rd_kafkap_str_t topic; - char *topic_str; - int32_t PartArrayCnt; - int j; + partitions = rd_kafka_buf_read_topic_partitions( + rkbuf, rd_false /*don't use topic_id*/, rd_true /*use topic name*/, + 0, fields); - rd_kafka_buf_read_str(rkbuf, &topic); - rd_kafka_buf_read_i32(rkbuf, &PartArrayCnt); + if (!partitions) + goto err_parse; - RD_KAFKAP_STR_DUPA(&topic_str, &topic); + partcnt = partitions->cnt; + RD_KAFKA_TPLIST_FOREACH(partition, partitions) { + rd_kafka_topic_partition_t *rktpar; - for (j = 0; j < PartArrayCnt; j++) { - int32_t partition; - int16_t ErrorCode; - rd_kafka_topic_partition_t *rktpar; + rktpar = rd_kafka_topic_partition_list_find( + offsets, partition->topic, partition->partition); - rd_kafka_buf_read_i32(rkbuf, &partition); - rd_kafka_buf_read_i16(rkbuf, &ErrorCode); + if (!rktpar) { + /* Received offset for topic/partition we didn't + * ask for, this shouldn't really happen. */ + continue; + } - rktpar = rd_kafka_topic_partition_list_find( - offsets, topic_str, partition); - - if (!rktpar) { - /* Received offset for topic/partition we didn't - * ask for, this shouldn't really happen. */ - continue; - } - - rktpar->err = ErrorCode; - if (ErrorCode) { - err = ErrorCode; - errcnt++; - - /* Accumulate actions for per-partition - * errors. 
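 *
 * Aside: the replacement parse path above feeds a field schema
 * (RD_KAFKA_TOPIC_PARTITION_FIELD_...) to a generic reader instead of
 * hand-rolling the wire reads per request type. A minimal sketch of
 * the table-driven idea, with hypothetical names and the big-endian
 * reads spelled out:
 *
 *     #include <stdint.h>
 *
 *     typedef enum { FIELD_PARTITION, FIELD_ERR, FIELD_END } field_t;
 *
 *     static void read_partition_fields(const uint8_t **p,
 *                                       const field_t *fields,
 *                                       int32_t *partition,
 *                                       int16_t *err) {
 *             int i;
 *             for (i = 0; fields[i] != FIELD_END; i++) {
 *                     if (fields[i] == FIELD_PARTITION) {
 *                             *partition =
 *                                 (int32_t)(((uint32_t)(*p)[0] << 24) |
 *                                           ((uint32_t)(*p)[1] << 16) |
 *                                           ((uint32_t)(*p)[2] << 8) |
 *                                           (uint32_t)(*p)[3]);
 *                             *p += 4;
 *                     } else if (fields[i] == FIELD_ERR) {
 *                             *err = (int16_t)(((uint16_t)(*p)[0] << 8) |
 *                                              (uint16_t)(*p)[1]);
 *                             *p += 2;
 *                     }
 *             }
 *     }
 *
 * One schema then drives both reading and writing of per-partition
 * fields across the request types touched in this patch.
 *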
*/ - actions |= rd_kafka_handle_OffsetCommit_error( - rkb, request, rktpar); - } - - partcnt++; + if (partition->err) { + rktpar->err = partition->err; + err = partition->err; + errcnt++; + /* Accumulate actions for per-partition + * errors. */ + actions |= rd_kafka_handle_OffsetCommit_error( + rkb, request, partition); } } + rd_kafka_topic_partition_list_destroy(partitions); /* If all partitions failed use error code * from last partition as the global error. */ @@ -1263,23 +1804,18 @@ int rd_kafka_OffsetCommitRequest(rd_kafka_broker_t *rkb, void *opaque, const char *reason) { rd_kafka_buf_t *rkbuf; - ssize_t of_TopicCnt = -1; - int TopicCnt = 0; - const char *last_topic = NULL; - ssize_t of_PartCnt = -1; - int PartCnt = 0; - int tot_PartCnt = 0; - int i; + int tot_PartCnt = 0; int16_t ApiVersion; int features; ApiVersion = rd_kafka_broker_ApiVersion_supported( - rkb, RD_KAFKAP_OffsetCommit, 0, 7, &features); + rkb, RD_KAFKAP_OffsetCommit, 0, 9, &features); rd_kafka_assert(NULL, offsets != NULL); - rkbuf = rd_kafka_buf_new_request(rkb, RD_KAFKAP_OffsetCommit, 1, - 100 + (offsets->cnt * 128)); + rkbuf = rd_kafka_buf_new_flexver_request(rkb, RD_KAFKAP_OffsetCommit, 1, + 100 + (offsets->cnt * 128), + ApiVersion >= 8); /* ConsumerGroup */ rd_kafka_buf_write_str(rkbuf, cgmetadata->group_id, -1); @@ -1304,59 +1840,23 @@ int rd_kafka_OffsetCommitRequest(rd_kafka_broker_t *rkb, /* Sort offsets by topic */ rd_kafka_topic_partition_list_sort_by_topic(offsets); - /* TopicArrayCnt: Will be updated when we know the number of topics. */ - of_TopicCnt = rd_kafka_buf_write_i32(rkbuf, 0); + /* Write partition list, filtering out partitions with valid + * offsets */ + rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_OFFSET, + ApiVersion >= 6 ? RD_KAFKA_TOPIC_PARTITION_FIELD_EPOCH + : RD_KAFKA_TOPIC_PARTITION_FIELD_NOOP, + ApiVersion == 1 ? RD_KAFKA_TOPIC_PARTITION_FIELD_TIMESTAMP + : RD_KAFKA_TOPIC_PARTITION_FIELD_NOOP, + RD_KAFKA_TOPIC_PARTITION_FIELD_METADATA, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; - for (i = 0; i < offsets->cnt; i++) { - rd_kafka_topic_partition_t *rktpar = &offsets->elems[i]; - - /* Skip partitions with invalid offset. */ - if (rktpar->offset < 0) - continue; - - if (last_topic == NULL || strcmp(last_topic, rktpar->topic)) { - /* New topic */ - - /* Finalize previous PartitionCnt */ - if (PartCnt > 0) - rd_kafka_buf_update_u32(rkbuf, of_PartCnt, - PartCnt); - - /* TopicName */ - rd_kafka_buf_write_str(rkbuf, rktpar->topic, -1); - /* PartitionCnt, finalized later */ - of_PartCnt = rd_kafka_buf_write_i32(rkbuf, 0); - PartCnt = 0; - last_topic = rktpar->topic; - TopicCnt++; - } - - /* Partition */ - rd_kafka_buf_write_i32(rkbuf, rktpar->partition); - PartCnt++; - tot_PartCnt++; - - /* Offset */ - rd_kafka_buf_write_i64(rkbuf, rktpar->offset); - - /* v6: KIP-101 CommittedLeaderEpoch */ - if (ApiVersion >= 6) - rd_kafka_buf_write_i32(rkbuf, -1); - - /* v1: TimeStamp */ - if (ApiVersion == 1) - rd_kafka_buf_write_i64(rkbuf, -1); - - /* Metadata */ - /* Java client 0.9.0 and broker <0.10.0 can't parse - * Null metadata fields, so as a workaround we send an - * empty string if it's Null. 
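 *
 * Aside: the fields[] schema written above makes the wire format a
 * data question rather than control flow: fields that only exist in
 * some API versions degrade to a NOOP entry instead of an if/else
 * inside the serializer. A sketch of the same construction (field_t
 * as in the earlier sketch, extended with the extra members):
 *
 *     #include <stdint.h>
 *
 *     typedef enum { FIELD_PARTITION, FIELD_OFFSET, FIELD_EPOCH,
 *                    FIELD_TIMESTAMP, FIELD_METADATA, FIELD_NOOP,
 *                    FIELD_END } field_t;
 *
 *     // Mirrors the version rules above: epoch from v6,
 *     // timestamp only in v1.
 *     static void build_schema(field_t *fields, int16_t api_version) {
 *             int n = 0;
 *             fields[n++] = FIELD_PARTITION;
 *             fields[n++] = FIELD_OFFSET;
 *             fields[n++] = api_version >= 6 ? FIELD_EPOCH : FIELD_NOOP;
 *             fields[n++] = api_version == 1 ? FIELD_TIMESTAMP
 *                                            : FIELD_NOOP;
 *             fields[n++] = FIELD_METADATA;
 *             fields[n]   = FIELD_END;
 *     }
 *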
*/ - if (!rktpar->metadata) - rd_kafka_buf_write_str(rkbuf, "", 0); - else - rd_kafka_buf_write_str(rkbuf, rktpar->metadata, - rktpar->metadata_size); - } + tot_PartCnt = rd_kafka_buf_write_topic_partitions( + rkbuf, offsets, rd_true /*skip invalid offsets*/, + rd_false /*include valid offsets */, + rd_false /*don't use topic id*/, rd_true /*use topic name*/, + fields); if (tot_PartCnt == 0) { /* No topic+partitions had valid offsets to commit. */ @@ -1365,13 +1865,6 @@ int rd_kafka_OffsetCommitRequest(rd_kafka_broker_t *rkb, return 0; } - /* Finalize previous PartitionCnt */ - if (PartCnt > 0) - rd_kafka_buf_update_u32(rkbuf, of_PartCnt, PartCnt); - - /* Finalize TopicCnt */ - rd_kafka_buf_update_u32(rkbuf, of_TopicCnt, TopicCnt); - rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); rd_rkb_dbg(rkb, TOPIC, "OFFSET", @@ -1432,11 +1925,14 @@ rd_kafka_OffsetDeleteRequest(rd_kafka_broker_t *rkb, /* GroupId */ rd_kafka_buf_write_str(rkbuf, grpoffsets->group, -1); + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; rd_kafka_buf_write_topic_partitions( rkbuf, grpoffsets->partitions, rd_false /*dont skip invalid offsets*/, rd_false /*any offset*/, - rd_false /*dont write offsets*/, rd_false /*dont write epoch*/, - rd_false /*dont write metadata*/); + rd_false /*don't use topic id*/, rd_true /*use topic name*/, + fields); rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); @@ -1460,11 +1956,14 @@ rd_kafka_group_MemberState_consumer_write(rd_kafka_buf_t *env_rkbuf, rkbuf = rd_kafka_buf_new(1, 100); rd_kafka_buf_write_i16(rkbuf, 0); /* Version */ rd_assert(rkgm->rkgm_assignment); + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; rd_kafka_buf_write_topic_partitions( rkbuf, rkgm->rkgm_assignment, rd_false /*don't skip invalid offsets*/, rd_false /* any offset */, - rd_false /*don't write offsets*/, rd_false /*don't write epoch*/, - rd_false /*don't write metadata*/); + rd_false /*don't use topic id*/, rd_true /*use topic name*/, + fields); rd_kafka_buf_write_kbytes(rkbuf, rkgm->rkgm_userdata); /* Get pointer to binary buffer */ @@ -1582,7 +2081,8 @@ void rd_kafka_JoinGroupRequest(rd_kafka_broker_t *rkb, rd_kafka_buf_write_kstr(rkbuf, rkas->rkas_protocol_name); member_metadata = rkas->rkas_get_metadata_cb( rkas, rk->rk_cgrp->rkcg_assignor_state, topics, - rk->rk_cgrp->rkcg_group_assignment); + rk->rk_cgrp->rkcg_group_assignment, + rk->rk_conf.client_rack); rd_kafka_buf_write_kbytes(rkbuf, member_metadata); rd_kafkap_bytes_destroy(member_metadata); } @@ -1765,11 +2265,185 @@ void rd_kafka_HeartbeatRequest(rd_kafka_broker_t *rkb, rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); } +void rd_kafka_ConsumerGroupHeartbeatRequest( + rd_kafka_broker_t *rkb, + const rd_kafkap_str_t *group_id, + const rd_kafkap_str_t *member_id, + int32_t member_epoch, + const rd_kafkap_str_t *group_instance_id, + const rd_kafkap_str_t *rack_id, + int32_t rebalance_timeout_ms, + const rd_kafka_topic_partition_list_t *subscribed_topics, + rd_kafkap_str_t *subscribed_topic_regex, + const rd_kafkap_str_t *remote_assignor, + const rd_kafka_topic_partition_list_t *current_assignments, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { + + rd_kafka_buf_t *rkbuf; + int16_t ApiVersion = 0; + int features; + size_t rkbuf_size = 0; + rd_kafkap_str_t *subscribed_topic_regex_to_send = + subscribed_topic_regex; + + ApiVersion = 
rd_kafka_broker_ApiVersion_supported( + rkb, RD_KAFKAP_ConsumerGroupHeartbeat, 1, 1, &features); + + if (ApiVersion == -1) { + rd_kafka_cgrp_coord_dead(rkb->rkb_rk->rk_cgrp, + RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE, + "ConsumerGroupHeartbeatRequest not " + "supported by broker"); + return; + } + + if (rd_rkb_is_dbg(rkb, CGRP)) { + char current_assignments_str[512] = "NULL"; + char subscribed_topics_str[512] = "NULL"; + const char *member_id_str = "NULL"; + const char *group_instance_id_str = "NULL"; + const char *remote_assignor_str = "NULL"; + const char *subscribed_topic_regex_to_send_str = "NULL"; + + if (current_assignments) { + rd_kafka_topic_partition_list_str( + current_assignments, current_assignments_str, + sizeof(current_assignments_str), 0); + } + if (subscribed_topics) { + rd_kafka_topic_partition_list_str( + subscribed_topics, subscribed_topics_str, + sizeof(subscribed_topics_str), 0); + } + if (member_id) + member_id_str = member_id->str; + if (group_instance_id) + group_instance_id_str = group_instance_id->str; + if (remote_assignor) + remote_assignor_str = remote_assignor->str; + if (subscribed_topic_regex_to_send) + subscribed_topic_regex_to_send_str = + subscribed_topic_regex_to_send->str; + + rd_rkb_dbg(rkb, CGRP, "HEARTBEAT", + "ConsumerGroupHeartbeat of member id \"%s\", group " + "id \"%s\", " + "generation id %" PRId32 + ", group instance id \"%s\"" + ", current assignment \"%s\"" + ", subscribed topics \"%s\"" + ", subscribed topic regex \"%s\"" + ", remote assignor \"%s\"", + member_id_str, group_id->str, member_epoch, + group_instance_id_str, current_assignments_str, + subscribed_topics_str, + subscribed_topic_regex_to_send_str, + remote_assignor_str); + } + + size_t next_subscription_size = 0; + + if (!subscribed_topic_regex_to_send) + subscribed_topic_regex_to_send = rd_kafkap_str_new(NULL, -1); + + if (subscribed_topics) { + next_subscription_size = + ((subscribed_topics->cnt * (4 + 50)) + 4); + } + next_subscription_size += + RD_KAFKAP_STR_SIZE(subscribed_topic_regex_to_send); + + if (group_id) + rkbuf_size += RD_KAFKAP_STR_SIZE(group_id); + if (member_id) + rkbuf_size += RD_KAFKAP_STR_SIZE(member_id); + rkbuf_size += 4; /* MemberEpoch */ + if (group_instance_id) + rkbuf_size += RD_KAFKAP_STR_SIZE(group_instance_id); + if (rack_id) + rkbuf_size += RD_KAFKAP_STR_SIZE(rack_id); + rkbuf_size += 4; /* RebalanceTimeoutMs */ + if (next_subscription_size) + rkbuf_size += next_subscription_size; + if (remote_assignor) + rkbuf_size += RD_KAFKAP_STR_SIZE(remote_assignor); + if (current_assignments) + rkbuf_size += (current_assignments->cnt * (16 + 100)); + rkbuf_size += 4; /* TopicPartitions */ + + rkbuf = rd_kafka_buf_new_flexver_request( + rkb, RD_KAFKAP_ConsumerGroupHeartbeat, 1, rkbuf_size, rd_true); + + rd_kafka_buf_write_kstr(rkbuf, group_id); + rd_kafka_buf_write_kstr(rkbuf, member_id); + rd_kafka_buf_write_i32(rkbuf, member_epoch); + rd_kafka_buf_write_kstr(rkbuf, group_instance_id); + rd_kafka_buf_write_kstr(rkbuf, rack_id); + rd_kafka_buf_write_i32(rkbuf, rebalance_timeout_ms); + + if (subscribed_topics) { + int topics_cnt = subscribed_topics->cnt; + + /* write Topics */ + rd_kafka_buf_write_arraycnt(rkbuf, topics_cnt); + while (--topics_cnt >= 0) + rd_kafka_buf_write_str( + rkbuf, subscribed_topics->elems[topics_cnt].topic, + -1); + + } else { + rd_kafka_buf_write_arraycnt(rkbuf, -1); + } + + if (ApiVersion >= 1) + rd_kafka_buf_write_kstr(rkbuf, subscribed_topic_regex_to_send); + + rd_kafka_buf_write_kstr(rkbuf, remote_assignor); + + if (current_assignments) 
{ + const rd_kafka_topic_partition_field_t + current_assignments_fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; + rd_kafka_buf_write_topic_partitions( + rkbuf, current_assignments, rd_false, rd_false, + rd_true /*use topic id*/, rd_false /*don't use topic name*/, + current_assignments_fields); + } else { + rd_kafka_buf_write_arraycnt(rkbuf, -1); + } + + rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); + + /* FIXME: + * 1) Improve this timeout to something less than + * `rkcg_heartbeat_intvl_ms` so that the next heartbeat + * is not skipped. + * 2) Remove usage of `group_session_timeout_ms` altogether + * from the new protocol defined in KIP-848. + */ + if (rkb->rkb_rk->rk_cgrp->rkcg_heartbeat_intvl_ms > 0) { + rd_kafka_buf_set_abs_timeout( + rkbuf, rkb->rkb_rk->rk_cgrp->rkcg_heartbeat_intvl_ms, 0); + } else { + rd_kafka_buf_set_abs_timeout( + rkbuf, rkb->rkb_rk->rk_conf.group_session_timeout_ms, 0); + } + + rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); + + if (!subscribed_topic_regex) + rd_kafkap_str_destroy(subscribed_topic_regex_to_send); +} + /** * @brief Construct and send ListGroupsRequest to \p rkb - * with the states (const char *) in \p states. + * with the states (const char *) in \p states, + * and the types (const char *) in \p types. * Uses \p max_ApiVersion as maximum API version, * pass -1 to use the maximum available version. * @@ -1783,16 +2457,17 @@ rd_kafka_error_t *rd_kafka_ListGroupsRequest(rd_kafka_broker_t *rkb, int16_t max_ApiVersion, const char **states, size_t states_cnt, + const char **types, + size_t types_cnt, rd_kafka_replyq_t replyq, rd_kafka_resp_cb_t *resp_cb, void *opaque) { rd_kafka_buf_t *rkbuf; int16_t ApiVersion = 0; size_t i; - rd_bool_t is_flexver = rd_false; if (max_ApiVersion < 0) - max_ApiVersion = 4; + max_ApiVersion = 5; if (max_ApiVersion > ApiVersion) { /* Remark: don't check if max_ApiVersion is zero. @@ -1800,7 +2475,6 @@ rd_kafka_error_t *rd_kafka_ListGroupsRequest(rd_kafka_broker_t *rkb, * in the application thread reliably . */ ApiVersion = rd_kafka_broker_ApiVersion_supported( rkb, RD_KAFKAP_ListGroups, 0, max_ApiVersion, NULL); - is_flexver = ApiVersion >= 3; } if (ApiVersion == -1) { @@ -1812,18 +2486,20 @@ rd_kafka_error_t *rd_kafka_ListGroupsRequest(rd_kafka_broker_t *rkb, rkbuf = rd_kafka_buf_new_flexver_request( rkb, RD_KAFKAP_ListGroups, 1, /* rd_kafka_buf_write_arraycnt_pos + tags + StatesFilter */ - 4 + 1 + 32 * states_cnt, is_flexver); + 4 + 1 + 32 * states_cnt, ApiVersion >= 3 /* is_flexver */); if (ApiVersion >= 4) { - size_t of_GroupsArrayCnt = - rd_kafka_buf_write_arraycnt_pos(rkbuf); + rd_kafka_buf_write_arraycnt(rkbuf, states_cnt); for (i = 0; i < states_cnt; i++) { rd_kafka_buf_write_str(rkbuf, states[i], -1); } - rd_kafka_buf_finalize_arraycnt(rkbuf, of_GroupsArrayCnt, i); } - if (is_flexver) { - rd_kafka_buf_write_tags(rkbuf); + + if (ApiVersion >= 5) { + rd_kafka_buf_write_arraycnt(rkbuf, types_cnt); + for (i = 0; i < types_cnt; i++) { + rd_kafka_buf_write_str(rkbuf, types[i], -1); + } } rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); @@ -1836,6 +2512,8 @@ rd_kafka_error_t *rd_kafka_ListGroupsRequest(rd_kafka_broker_t *rkb, * with the groups (const char *) in \p groups. * Uses \p max_ApiVersion as maximum API version, * pass -1 to use the maximum available version. + * Uses \p include_authorized_operations to get + * group ACL authorized operations. 
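 *
 * Aside: per KIP-430 the broker reports authorized operations as an
 * int32 bit mask, one bit per ACL operation code; a negative value is
 * used when the field was not requested. A hedged sketch of unpacking
 * such a mask (the printing is illustrative only):
 *
 *     #include <stdint.h>
 *     #include <stdio.h>
 *
 *     static void print_authorized_ops(int32_t mask) {
 *             uint32_t bits;
 *             int i;
 *             if (mask < 0)
 *                     return;  // not requested / not returned
 *             bits = (uint32_t)mask;
 *             for (i = 0; i < 32; i++)
 *                     if (bits & (1u << i))
 *                             printf("ACL operation code %d authorized\n",
 *                                    i);
 *     }
 *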
* * The response (unparsed) will be enqueued on \p replyq * for handling by \p resp_cb (with \p opaque passed). @@ -1843,13 +2521,15 @@ rd_kafka_error_t *rd_kafka_ListGroupsRequest(rd_kafka_broker_t *rkb, * @return NULL on success, a new error instance that must be * released with rd_kafka_error_destroy() in case of error. */ -rd_kafka_error_t *rd_kafka_DescribeGroupsRequest(rd_kafka_broker_t *rkb, - int16_t max_ApiVersion, - char **groups, - size_t group_cnt, - rd_kafka_replyq_t replyq, - rd_kafka_resp_cb_t *resp_cb, - void *opaque) { +rd_kafka_error_t * +rd_kafka_DescribeGroupsRequest(rd_kafka_broker_t *rkb, + int16_t max_ApiVersion, + char **groups, + size_t group_cnt, + rd_bool_t include_authorized_operations, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { rd_kafka_buf_t *rkbuf; int16_t ApiVersion = 0; size_t of_GroupsArrayCnt; @@ -1886,8 +2566,7 @@ rd_kafka_error_t *rd_kafka_DescribeGroupsRequest(rd_kafka_broker_t *rkb, /* write IncludeAuthorizedOperations */ if (ApiVersion >= 3) { - /* TODO: implement KIP-430 */ - rd_kafka_buf_write_bool(rkbuf, rd_false); + rd_kafka_buf_write_bool(rkbuf, include_authorized_operations); } rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); @@ -1906,12 +2585,14 @@ static void rd_kafka_handle_Metadata(rd_kafka_t *rk, rd_kafka_buf_t *rkbuf, rd_kafka_buf_t *request, void *opaque) { - rd_kafka_op_t *rko = opaque; /* Possibly NULL */ - struct rd_kafka_metadata *md = NULL; - const rd_list_t *topics = request->rkbuf_u.Metadata.topics; + rd_kafka_op_t *rko = opaque; /* Possibly NULL */ + rd_kafka_metadata_internal_t *mdi = NULL; + const rd_list_t *topics = request->rkbuf_u.Metadata.topics; + const int32_t cgrp_subscription_version = + request->rkbuf_u.Metadata.cgrp_subscription_version; int actions; - rd_kafka_assert(NULL, err == RD_KAFKA_RESP_ERR__DESTROY || + rd_kafka_assert(NULL, rd_kafka_broker_is_any_err_destroy(err) || thrd_is_current(rk->rk_thread)); /* Avoid metadata updates when we're terminating. */ @@ -1935,54 +2616,72 @@ static void rd_kafka_handle_Metadata(rd_kafka_t *rk, rd_list_cnt(topics), request->rkbuf_u.Metadata.reason); - err = rd_kafka_parse_Metadata(rkb, request, rkbuf, &md); + err = rd_kafka_parse_Metadata(rkb, request, rkbuf, &mdi); if (err) goto err; if (rko && rko->rko_replyq.q) { /* Reply to metadata requester, passing on the metadata. * Reuse requesting rko for the reply. */ - rko->rko_err = err; - rko->rko_u.metadata.md = md; - + rko->rko_err = err; + rko->rko_u.metadata.md = &mdi->metadata; + rko->rko_u.metadata.mdi = mdi; + rko->rko_u.metadata.subscription_version = + cgrp_subscription_version; rd_kafka_replyq_enq(&rko->rko_replyq, rko, 0); rko = NULL; } else { - if (md) - rd_free(md); + if (mdi) + rd_free(mdi); } goto done; err: - actions = rd_kafka_err_action(rkb, err, request, + actions = rd_kafka_err_action( + rkb, err, request, - RD_KAFKA_ERR_ACTION_RETRY, - RD_KAFKA_RESP_ERR__PARTIAL, + RD_KAFKA_ERR_ACTION_SPECIAL, RD_KAFKA_RESP_ERR_REBOOTSTRAP_REQUIRED, - RD_KAFKA_ERR_ACTION_END); + RD_KAFKA_ERR_ACTION_RETRY, RD_KAFKA_RESP_ERR__PARTIAL, + + RD_KAFKA_ERR_ACTION_END); + + if (actions & RD_KAFKA_ERR_ACTION_SPECIAL) { + rd_kafka_rebootstrap(rk); + } if (actions & RD_KAFKA_ERR_ACTION_RETRY) { - if (rd_kafka_buf_retry(rkb, request)) + /* In case it's a brokers full refresh call, + * avoid retrying it on this same broker. + * This is to prevent client is hung + * until it can connect to this broker again. + * No need to acquire the lock here but + * when decrementing the integer pointed + * by `decr`. 
*/ + if (!request->rkbuf_u.Metadata.decr && + rd_kafka_buf_retry(rkb, request)) return; /* FALLTHRU */ - } else { + } + + if (actions & RD_KAFKA_ERR_ACTION_PERMANENT) { rd_rkb_log(rkb, LOG_WARNING, "METADATA", "Metadata request failed: %s: %s (%dms): %s", request->rkbuf_u.Metadata.reason, rd_kafka_err2str(err), (int)(request->rkbuf_ts_sent / 1000), rd_kafka_actions2str(actions)); - /* Respond back to caller on non-retriable errors */ - if (rko && rko->rko_replyq.q) { - rko->rko_err = err; - rko->rko_u.metadata.md = NULL; - rd_kafka_replyq_enq(&rko->rko_replyq, rko, 0); - rko = NULL; - } } - + /* Respond back to caller on non-retriable errors */ + if (rko && rko->rko_replyq.q) { + rko->rko_err = err; + rko->rko_u.metadata.md = NULL; + rko->rko_u.metadata.mdi = NULL; + rd_kafka_replyq_enq(&rko->rko_replyq, rko, 0); + rko = NULL; + } /* FALLTHRU */ @@ -1991,95 +2690,178 @@ done: rd_kafka_op_destroy(rko); } - /** - * @brief Construct MetadataRequest (does not send) + * @brief Internal implementation of MetadataRequest. * - * \p topics is a list of topic names (char *) to request. + * - !topics && !topic_ids: only request brokers (if supported by + * broker, else all topics) + * - topics.cnt > 0 && topic_ids.cnt > 0: invalid request + * - topics.cnt > 0 || topic_ids.cnt > 0: only specified topics + * are requested + * - else: all topics in cluster are requested * - * !topics - only request brokers (if supported by broker, else - * all topics) - * topics.cnt==0 - all topics in cluster are requested - * topics.cnt >0 - only specified topics are requested - * - * @param reason - metadata request reason - * @param allow_auto_create_topics - allow broker-side auto topic creation. - * This is best-effort, depending on broker - * config and version. - * @param cgrp_update - Update cgrp in parse_Metadata (see comment there). - * @param rko - (optional) rko with replyq for handling response. + * @param topics A list of topic names (char *) to request. + * @param topic_ids A list of topic ids (rd_kafka_Uuid_t *) to request. + * @param reason Metadata request reason + * @param allow_auto_create_topics Allow broker-side auto topic creation. + * This is best-effort, depending on broker + * config and version. + * @param include_cluster_authorized_operations Request for cluster + * authorized operations. + * @param include_topic_authorized_operations Request for topic + * authorized operations. + * @param cgrp_update Update cgrp in parse_Metadata (see comment there). + * @param force_racks Force partition to rack mapping computation in + * parse_Metadata (see comment there). + * @param rko (optional) rko with replyq for handling response. * Specifying an rko forces a metadata request even if * there is already a matching one in-transit. + * @param resp_cb Callback to be used for handling response. + * @param replyq replyq on which response is handled. + * @param force rd_true: force a full request (including all topics and + * brokers) even if there is such a request already + * in flight. + * rd_false: check if there are multiple outstanding full + * requests, and don't send one if there is already + * one present. (See note below.) + * @param opaque (optional) parameter to be passed to resp_cb. * - * If full metadata for all topics is requested (or all brokers, which - * results in all-topics on older brokers) and there is already a full request - * in transit then this function will return RD_KAFKA_RESP_ERR__PREV_IN_PROGRESS - * otherwise RD_KAFKA_RESP_ERR_NO_ERROR. 
If \p rko is non-NULL the request - * is sent regardless. + * @return Error code: + * If full metadata for all topics is requested (or + * all brokers, which results in all-topics on older brokers) and + * there is already a full request in transit then this function + * will return RD_KAFKA_RESP_ERR__PREV_IN_PROGRESS, + * otherwise RD_KAFKA_RESP_ERR_NO_ERROR. + * + * @remark Either \p topics or \p topic_ids must be set, but not both. + * @remark If \p rko is specified, \p resp_cb, \p replyq, \p force, \p opaque + * should be NULL or rd_false. + * @remark If \p rko is non-NULL or if \p force is true, + * the request is sent regardless. + * @remark \p include_cluster_authorized_operations and + * \p include_topic_authorized_operations should not be set unless this + * MetadataRequest is for an admin operation. + * + * @sa rd_kafka_MetadataRequest(). + * @sa rd_kafka_MetadataRequest_resp_cb(). */ -rd_kafka_resp_err_t rd_kafka_MetadataRequest(rd_kafka_broker_t *rkb, - const rd_list_t *topics, - const char *reason, - rd_bool_t allow_auto_create_topics, - rd_bool_t cgrp_update, - rd_kafka_op_t *rko) { +static rd_kafka_resp_err_t +rd_kafka_MetadataRequest0(rd_kafka_broker_t *rkb, + const rd_list_t *topics, + const rd_list_t *topic_ids, + const char *reason, + rd_bool_t allow_auto_create_topics, + rd_bool_t include_cluster_authorized_operations, + rd_bool_t include_topic_authorized_operations, + rd_bool_t cgrp_update, + int32_t cgrp_subscription_version, + rd_bool_t force_racks, + rd_kafka_op_t *rko, + rd_kafka_resp_cb_t *resp_cb, + rd_kafka_replyq_t replyq, + rd_bool_t force, + void *opaque) { rd_kafka_buf_t *rkbuf; int16_t ApiVersion = 0; + size_t of_TopicArrayCnt; int features; - int topic_cnt = topics ? rd_list_cnt(topics) : 0; - int *full_incr = NULL; + int topic_id_cnt; + int total_topic_cnt; + int topic_cnt = topics ? rd_list_cnt(topics) : 0; + int *full_incr = NULL; + void *handler_arg = NULL; + rd_kafka_resp_cb_t *handler_cb = rd_kafka_handle_Metadata; + int16_t metadata_max_version = 13; + rd_kafka_replyq_t use_replyq = replyq; + + /* In case we want cluster authorized operations in the Metadata + * request, we must send a request with version not exceeding 10 because + * KIP-700 deprecates those fields from the Metadata RPC. */ + if (include_cluster_authorized_operations) + metadata_max_version = RD_MIN(metadata_max_version, 10); ApiVersion = rd_kafka_broker_ApiVersion_supported( - rkb, RD_KAFKAP_Metadata, 0, 4, &features); + rkb, RD_KAFKAP_Metadata, 0, metadata_max_version, &features); - rkbuf = rd_kafka_buf_new_request(rkb, RD_KAFKAP_Metadata, 1, - 4 + (50 * topic_cnt) + 1); + topic_id_cnt = + (ApiVersion >= 10 && topic_ids) ? 
rd_list_cnt(topic_ids) : 0; + rd_assert(topic_id_cnt == 0 || ApiVersion >= 12); + + total_topic_cnt = topic_cnt + topic_id_cnt; + + rkbuf = rd_kafka_buf_new_flexver_request( + rkb, RD_KAFKAP_Metadata, 1, + 4 + ((50 /*topic name */ + 16 /* topic id */) * total_topic_cnt) + + 1, + ApiVersion >= 9); if (!reason) reason = ""; - rkbuf->rkbuf_u.Metadata.reason = rd_strdup(reason); - rkbuf->rkbuf_u.Metadata.cgrp_update = cgrp_update; + rkbuf->rkbuf_u.Metadata.reason = rd_strdup(reason); + rkbuf->rkbuf_u.Metadata.cgrp_update = cgrp_update; + rkbuf->rkbuf_u.Metadata.force_racks = force_racks; + rkbuf->rkbuf_u.Metadata.cgrp_subscription_version = -1; + + /* TopicArrayCnt */ + of_TopicArrayCnt = rd_kafka_buf_write_arraycnt_pos(rkbuf); + + if (!topics && !topic_ids) { + /* v0: keep 0, brokers only not available, + * request all topics */ + /* v1-8: 0 means empty array, brokers only */ + if (ApiVersion >= 9) { + /* v9+: varint encoded empty array (1), brokers only */ + rd_kafka_buf_finalize_arraycnt(rkbuf, of_TopicArrayCnt, + topic_cnt); + } - if (!topics && ApiVersion >= 1) { - /* a null(0) array (in the protocol) represents no topics */ - rd_kafka_buf_write_i32(rkbuf, 0); rd_rkb_dbg(rkb, METADATA, "METADATA", "Request metadata for brokers only: %s", reason); full_incr = &rkb->rkb_rk->rk_metadata_cache.rkmc_full_brokers_sent; - } else { - if (topic_cnt == 0 && !rko) + } else if (total_topic_cnt == 0) { + /* v0: keep 0, request all topics */ + if (ApiVersion >= 1 && ApiVersion < 9) { + /* v1-8: update to -1, all topics */ + rd_kafka_buf_update_i32(rkbuf, of_TopicArrayCnt, -1); + } + /* v9+: keep 0, varint encoded null, all topics */ + + rkbuf->rkbuf_u.Metadata.all_topics = 1; + rd_rkb_dbg(rkb, METADATA, "METADATA", + "Request metadata for all topics: " + "%s", + reason); + + if (!rko) full_incr = &rkb->rkb_rk->rk_metadata_cache .rkmc_full_topics_sent; - if (topic_cnt == 0 && ApiVersion >= 1) - rd_kafka_buf_write_i32(rkbuf, -1); /* Null: all topics*/ - else - rd_kafka_buf_write_i32(rkbuf, topic_cnt); + } else { + /* Cannot request topics by name and id at the same time */ + rd_dassert(!(topic_cnt > 0 && topic_id_cnt > 0)); - if (topic_cnt == 0) { - rkbuf->rkbuf_u.Metadata.all_topics = 1; - rd_rkb_dbg(rkb, METADATA, "METADATA", - "Request metadata for all topics: " - "%s", - reason); - } else - rd_rkb_dbg(rkb, METADATA, "METADATA", - "Request metadata for %d topic(s): " - "%s", - topic_cnt, reason); + /* request cnt topics */ + rd_kafka_buf_finalize_arraycnt(rkbuf, of_TopicArrayCnt, + total_topic_cnt); + + rd_rkb_dbg(rkb, METADATA, "METADATA", + "Request metadata for %d topic(s): " + "%s", + total_topic_cnt, reason); } if (full_incr) { /* Avoid multiple outstanding full requests * (since they are redundant and side-effect-less). - * Forced requests (app using metadata() API) are passed - * through regardless. */ + * Forced requests (app using metadata() API or Admin API) are + * passed through regardless. */ mtx_lock(&rkb->rkb_rk->rk_metadata_cache.rkmc_full_lock); - if (*full_incr > 0 && (!rko || !rko->rko_u.metadata.force)) { + if (!force && + (*full_incr > 0 && (!rko || !rko->rko_u.metadata.force))) { mtx_unlock( &rkb->rkb_rk->rk_metadata_cache.rkmc_full_lock); rd_rkb_dbg(rkb, METADATA, "METADATA", @@ -2101,14 +2883,38 @@ rd_kafka_resp_err_t rd_kafka_MetadataRequest(rd_kafka_broker_t *rkb, if (topic_cnt > 0) { char *topic; int i; + rd_kafka_Uuid_t zero_uuid = RD_KAFKA_UUID_ZERO; /* Maintain a copy of the topics list so we can purge * hints from the metadata cache on error. 
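 *
 * Aside: from Metadata v10 each requested topic carries both a name
 * and a 16-byte topic id, but only one of the two is meaningful per
 * entry: the by-name loop below sends a zero UUID, while the by-id
 * loop further below (guarded by the earlier rd_assert) sends a null
 * name. A hedged sketch of that convention with illustrative types:
 *
 *     #include <stdint.h>
 *     #include <stddef.h>
 *
 *     typedef struct { uint64_t msb, lsb; } kuuid_t;
 *
 *     typedef struct {
 *             kuuid_t id;        // all-zero when requesting by name
 *             const char *name;  // NULL when requesting by id
 *     } topic_request_t;
 *
 *     static topic_request_t by_name(const char *name) {
 *             topic_request_t r = {{0, 0}, name};
 *             return r;
 *     }
 *
 *     static topic_request_t by_id(kuuid_t id) {
 *             topic_request_t r = {id, NULL};
 *             return r;
 *     }
 *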
*/ rkbuf->rkbuf_u.Metadata.topics = rd_list_copy(topics, rd_list_string_copy, NULL); - RD_LIST_FOREACH(topic, topics, i) - rd_kafka_buf_write_str(rkbuf, topic, -1); + RD_LIST_FOREACH(topic, topics, i) { + if (ApiVersion >= 10) { + rd_kafka_buf_write_uuid(rkbuf, &zero_uuid); + } + rd_kafka_buf_write_str(rkbuf, topic, -1); + /* Tags for previous topic */ + rd_kafka_buf_write_tags_empty(rkbuf); + } + } + + if (ApiVersion >= 10 && topic_id_cnt > 0) { + int i; + rd_kafka_Uuid_t *topic_id; + + /* Maintain a copy of the topics list so we can purge + * hints from the metadata cache on error. */ + rkbuf->rkbuf_u.Metadata.topic_ids = + rd_list_copy(topic_ids, rd_list_Uuid_copy, NULL); + + RD_LIST_FOREACH(topic_id, topic_ids, i) { + rd_kafka_buf_write_uuid(rkbuf, topic_id); + rd_kafka_buf_write_str(rkbuf, NULL, -1); + /* Tags for previous topic */ + rd_kafka_buf_write_tags_empty(rkbuf); + } } if (ApiVersion >= 4) { @@ -2130,6 +2936,17 @@ rd_kafka_resp_err_t rd_kafka_MetadataRequest(rd_kafka_broker_t *rkb, "on broker auto.create.topics.enable configuration"); } + if (ApiVersion >= 8 && ApiVersion <= 10) { + /* IncludeClusterAuthorizedOperations */ + rd_kafka_buf_write_bool(rkbuf, + include_cluster_authorized_operations); + } + + if (ApiVersion >= 8) { + /* IncludeTopicAuthorizedOperations */ + rd_kafka_buf_write_bool(rkbuf, + include_topic_authorized_operations); + } rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); @@ -2137,16 +2954,165 @@ rd_kafka_resp_err_t rd_kafka_MetadataRequest(rd_kafka_broker_t *rkb, * and should go before most other requests (Produce, Fetch, etc). */ rkbuf->rkbuf_prio = RD_KAFKA_PRIO_HIGH; - rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, - /* Handle response thru rk_ops, - * but forward parsed result to - * rko's replyq when done. */ - RD_KAFKA_REPLYQ(rkb->rkb_rk->rk_ops, 0), - rd_kafka_handle_Metadata, rko); + /* The default handler is rd_kafka_handle_Metadata, but it can be + * overriden to use a custom handler. */ + if (resp_cb) + handler_cb = resp_cb; + + /* If a custom handler is provided, we also allow the caller to set a + * custom argument which is passed as the opaque argument to the + * handler. However, if we're using the default handler, it expects + * either rko or NULL as its opaque argument (it forwards the response + * to rko's replyq if it's non-NULL). */ + if (resp_cb && opaque) + handler_arg = opaque; + else + handler_arg = rko; + + /* If a custom replyq is provided (and is valid), the response is + * handled through on that replyq. By default, response is handled on + * rk_ops, and the default handler (rd_kafka_handle_Metadata) forwards + * the parsed result to rko's replyq when done. */ + if (!use_replyq.q) + use_replyq = RD_KAFKA_REPLYQ(rkb->rkb_rk->rk_ops, 0); + + if (cgrp_update && rkb->rkb_rk->rk_cgrp && total_topic_cnt > 0) { + rkbuf->rkbuf_u.Metadata.cgrp_subscription_version = + cgrp_subscription_version >= 0 + ? cgrp_subscription_version + : rd_atomic32_get( + &rkb->rkb_rk->rk_cgrp->rkcg_subscription_version); + } + rd_kafka_broker_buf_enq_replyq( + rkb, rkbuf, use_replyq, + /* The default response handler is rd_kafka_handle_Metadata, but we + allow alternate handlers to be configured. 
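 *
 * Aside: the handler selection above reduces to "custom callback and
 * opaque only when the caller supplies them, internal handler with
 * the rko otherwise". The same defaulting in isolation:
 *
 *     typedef void (*cb_t)(void *arg);
 *
 *     static void dispatch(cb_t custom_cb, void *custom_arg,
 *                          cb_t default_cb, void *default_arg) {
 *             cb_t cb   = custom_cb ? custom_cb : default_cb;
 *             void *arg = (custom_cb && custom_arg) ? custom_arg
 *                                                   : default_arg;
 *             cb(arg);
 *     }
 *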
*/ + handler_cb, handler_arg); return RD_KAFKA_RESP_ERR_NO_ERROR; } +/** + * @brief Construct and enqueue a MetadataRequest + * + * - !topics && !topic_ids: only request brokers (if supported by + * broker, else all topics) + * - topics.cnt > 0 && topic_ids.cnt > 0: invalid request + * - topics.cnt > 0 || topic_ids.cnt > 0: only specified topics + * are requested + * - else: all topics in cluster are requested + * + * @param topics A list of topic names (char *) to request. + * @param topic_ids A list of topic ids (rd_kafka_Uuid_t *) to request. + * @param reason - metadata request reason + * @param allow_auto_create_topics - allow broker-side auto topic creation. + * This is best-effort, depending on broker + * config and version. + * @param cgrp_update - Update cgrp in parse_Metadata (see comment there). + * @param subscription_version - Consumer group subscription version. + * @param force_racks - Force partition to rack mapping computation in + * parse_Metadata (see comment there). + * @param rko - (optional) rko with replyq for handling response. + * Specifying an rko forces a metadata request even if + * there is already a matching one in-transit. + * + * @return Error code: + * If full metadata for all topics is requested (or + * all brokers, which results in all-topics on older brokers) and + * there is already a full request in transit then this function + * will return RD_KAFKA_RESP_ERR__PREV_IN_PROGRESS, + * otherwise RD_KAFKA_RESP_ERR_NO_ERROR. + * If \p rko is non-NULL, the request is sent regardless. + * + * @remark Either \p topics or \p topic_ids must be set, but not both. + */ +rd_kafka_resp_err_t rd_kafka_MetadataRequest(rd_kafka_broker_t *rkb, + const rd_list_t *topics, + rd_list_t *topic_ids, + const char *reason, + rd_bool_t allow_auto_create_topics, + rd_bool_t cgrp_update, + int32_t cgrp_subscription_version, + rd_bool_t force_racks, + rd_kafka_op_t *rko) { + return rd_kafka_MetadataRequest0( + rkb, topics, topic_ids, reason, allow_auto_create_topics, + rd_false /*don't include cluster authorized operations*/, + rd_false /*don't include topic authorized operations*/, cgrp_update, + cgrp_subscription_version, force_racks, rko, + /* We use the default rd_kafka_handle_Metadata rather than a custom + resp_cb */ + NULL, + /* Use default replyq which works with the default handler + rd_kafka_handle_Metadata. */ + RD_KAFKA_NO_REPLYQ, + /* If the request needs to be forced, rko_u.metadata.force will be + set. We don't provide an explicit parameter force. */ + rd_false, NULL); +} + +/** + * @brief Construct and enqueue a MetadataRequest which use + * response callback \p resp_cb instead of a rko. + * + * - !topics && !topic_ids: only request brokers (if supported by + * broker, else all topics) + * - topics.cnt > 0 && topic_ids.cnt > 0: invalid request + * - topics.cnt > 0 || topic_ids.cnt > 0: only specified topics + * are requested + * - else: all topics in cluster are requested + * + * @param topics A list of topic names (char *) to request. + * @param topic_ids A list of topic ids (rd_kafka_Uuid_t *) to request. + * @param reason Metadata request reason + * @param allow_auto_create_topics Allow broker-side auto topic creation. + * This is best-effort, depending on broker + * config and version. + * @param include_cluster_authorized_operations Request for cluster + * authorized operations. + * @param include_topic_authorized_operations Request for topic + * authorized operations. + * @param cgrp_update Update cgrp in parse_Metadata (see comment there). 
+ * @param force_racks Force partition to rack mapping computation in + * parse_Metadata (see comment there). + * @param resp_cb Callback to be used for handling response. + * @param replyq replyq on which response is handled. + * @param force Force request even if in progress. + * @param opaque (optional) parameter to be passed to resp_cb. + * + * @return Error code: + * If full metadata for all topics is requested (or + * all brokers, which results in all-topics on older brokers) and + * there is already a full request in transit then this function + * will return RD_KAFKA_RESP_ERR__PREV_IN_PROGRESS, + * otherwise RD_KAFKA_RESP_ERR_NO_ERROR. + * + * @remark Either \p topics or \p topic_ids must be set, but not both. + */ +rd_kafka_resp_err_t rd_kafka_MetadataRequest_resp_cb( + rd_kafka_broker_t *rkb, + const rd_list_t *topics, + const rd_list_t *topics_ids, + const char *reason, + rd_bool_t allow_auto_create_topics, + rd_bool_t include_cluster_authorized_operations, + rd_bool_t include_topic_authorized_operations, + rd_bool_t cgrp_update, + int32_t cgrp_subscription_version, + rd_bool_t force_racks, + rd_kafka_resp_cb_t *resp_cb, + rd_kafka_replyq_t replyq, + rd_bool_t force, + void *opaque) { + return rd_kafka_MetadataRequest0( + rkb, topics, topics_ids, reason, allow_auto_create_topics, + include_cluster_authorized_operations, + include_topic_authorized_operations, cgrp_update, + cgrp_subscription_version, force_racks, + NULL /* No op - using custom resp_cb. */, resp_cb, replyq, force, + opaque); +} + /** @@ -2260,7 +3226,7 @@ void rd_kafka_ApiVersionRequest(rd_kafka_broker_t *rkb, ApiVersion = 3; rkbuf = rd_kafka_buf_new_flexver_request( - rkb, RD_KAFKAP_ApiVersion, 1, 4, ApiVersion >= 3 /*flexver*/); + rkb, RD_KAFKAP_ApiVersion, 1, 3, ApiVersion >= 3 /*flexver*/); if (ApiVersion >= 3) { /* KIP-511 adds software name and version through the optional @@ -2367,6 +3333,9 @@ void rd_kafka_handle_SaslAuthenticate(rd_kafka_t *rk, rd_kafkap_bytes_t auth_data; char errstr[512]; + if (rd_kafka_broker_is_any_err_destroy(err)) + return; + if (err) { rd_snprintf(errstr, sizeof(errstr), "SaslAuthenticateRequest failed: %s", @@ -2392,7 +3361,19 @@ void rd_kafka_handle_SaslAuthenticate(rd_kafka_t *rk, goto err; } - rd_kafka_buf_read_bytes(rkbuf, &auth_data); + rd_kafka_buf_read_kbytes(rkbuf, &auth_data); + + if (request->rkbuf_reqhdr.ApiVersion >= 1) { + int64_t session_lifetime_ms; + rd_kafka_buf_read_i64(rkbuf, &session_lifetime_ms); + + if (session_lifetime_ms) + rd_kafka_dbg( + rk, SECURITY, "REAUTH", + "Received session lifetime %ld ms from broker", + session_lifetime_ms); + rd_kafka_broker_start_reauth_timer(rkb, session_lifetime_ms); + } /* Pass SASL auth frame to SASL handler */ if (rd_kafka_sasl_recv(rkb->rkb_transport, auth_data.data, @@ -2427,6 +3408,8 @@ void rd_kafka_SaslAuthenticateRequest(rd_kafka_broker_t *rkb, rd_kafka_resp_cb_t *resp_cb, void *opaque) { rd_kafka_buf_t *rkbuf; + int16_t ApiVersion; + int features; rkbuf = rd_kafka_buf_new_request(rkb, RD_KAFKAP_SaslAuthenticate, 0, 0); @@ -2441,6 +3424,10 @@ void rd_kafka_SaslAuthenticateRequest(rd_kafka_broker_t *rkb, * close down the connection and reconnect on failure. 
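 *
 * Aside: with SaslAuthenticate v1+ (KIP-368, handled in
 * rd_kafka_handle_SaslAuthenticate above) the broker returns a
 * session lifetime and the client must re-authenticate on the same
 * connection before it expires. A hedged sketch of choosing a reauth
 * deadline inside that window; the 90% factor is an assumption made
 * for the sketch, not the library's exact policy:
 *
 *     #include <stdint.h>
 *
 *     // Returns the delay before re-authenticating, or -1 if the
 *     // broker enforces no session expiry (lifetime 0).
 *     static int64_t reauth_delay_ms(int64_t session_lifetime_ms) {
 *             if (session_lifetime_ms <= 0)
 *                     return -1;
 *             return (session_lifetime_ms * 9) / 10;
 *     }
 *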
*/ rkbuf->rkbuf_max_retries = RD_KAFKA_REQUEST_NO_RETRIES; + ApiVersion = rd_kafka_broker_ApiVersion_supported( + rkb, RD_KAFKAP_SaslAuthenticate, 0, 1, &features); + rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); + if (replyq.q) rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); @@ -2448,15 +3435,263 @@ void rd_kafka_SaslAuthenticateRequest(rd_kafka_broker_t *rkb, rd_kafka_broker_buf_enq1(rkb, rkbuf, resp_cb, opaque); } - - /** - * @struct Hold temporary result and return values from ProduceResponse + * @name Leader discovery (KIP-951) + * @{ */ -struct rd_kafka_Produce_result { - int64_t offset; /**< Assigned offset of first message */ - int64_t timestamp; /**< (Possibly assigned) offset of first message */ -}; + +void rd_kafkap_leader_discovery_tmpabuf_add_alloc_brokers( + rd_tmpabuf_t *tbuf, + rd_kafkap_NodeEndpoints_t *NodeEndpoints) { + int i; + size_t md_brokers_size = + NodeEndpoints->NodeEndpointCnt * sizeof(rd_kafka_metadata_broker_t); + size_t mdi_brokers_size = NodeEndpoints->NodeEndpointCnt * + sizeof(rd_kafka_metadata_broker_internal_t); + rd_tmpabuf_add_alloc_times(tbuf, md_brokers_size, 2); + rd_tmpabuf_add_alloc(tbuf, mdi_brokers_size); + for (i = 0; i < NodeEndpoints->NodeEndpointCnt; i++) { + size_t HostSize = + RD_KAFKAP_STR_LEN(&NodeEndpoints->NodeEndpoints[i].Host) + + 1; + rd_tmpabuf_add_alloc(tbuf, HostSize); + } +} + +void rd_kafkap_leader_discovery_tmpabuf_add_alloc_topics(rd_tmpabuf_t *tbuf, + int topic_cnt) { + rd_tmpabuf_add_alloc(tbuf, + sizeof(rd_kafka_metadata_topic_t) * topic_cnt); + rd_tmpabuf_add_alloc(tbuf, sizeof(rd_kafka_metadata_topic_internal_t) * + topic_cnt); +} + +void rd_kafkap_leader_discovery_tmpabuf_add_alloc_topic(rd_tmpabuf_t *tbuf, + char *topic_name, + int32_t partition_cnt) { + if (topic_name) { + rd_tmpabuf_add_alloc(tbuf, strlen(topic_name) + 1); + } + rd_tmpabuf_add_alloc(tbuf, sizeof(rd_kafka_metadata_partition_t) * + partition_cnt); + rd_tmpabuf_add_alloc(tbuf, + sizeof(rd_kafka_metadata_partition_internal_t) * + partition_cnt); +} + +void rd_kafkap_leader_discovery_metadata_init(rd_kafka_metadata_internal_t *mdi, + int32_t broker_id) { + memset(mdi, 0, sizeof(*mdi)); + mdi->metadata.orig_broker_id = broker_id; + mdi->controller_id = -1; + mdi->cluster_authorized_operations = -1; +} + +void rd_kafkap_leader_discovery_set_brokers( + rd_tmpabuf_t *tbuf, + rd_kafka_metadata_internal_t *mdi, + rd_kafkap_NodeEndpoints_t *NodeEndpoints) { + int i; + rd_kafka_metadata_t *md = &mdi->metadata; + + size_t md_brokers_size = + NodeEndpoints->NodeEndpointCnt * sizeof(rd_kafka_metadata_broker_t); + size_t mdi_brokers_size = NodeEndpoints->NodeEndpointCnt * + sizeof(rd_kafka_metadata_broker_internal_t); + + md->broker_cnt = NodeEndpoints->NodeEndpointCnt; + md->brokers = rd_tmpabuf_alloc(tbuf, md_brokers_size); + mdi->brokers_sorted = rd_tmpabuf_alloc(tbuf, md_brokers_size); + mdi->brokers = rd_tmpabuf_alloc(tbuf, mdi_brokers_size); + + for (i = 0; i < NodeEndpoints->NodeEndpointCnt; i++) { + rd_kafkap_NodeEndpoint_t *NodeEndpoint = + &NodeEndpoints->NodeEndpoints[i]; + rd_kafka_metadata_broker_t *mdb = &md->brokers[i]; + rd_kafka_metadata_broker_internal_t *mdbi = &mdi->brokers[i]; + mdb->id = NodeEndpoint->NodeId; + mdb->host = NULL; + if (!RD_KAFKAP_STR_IS_NULL(&NodeEndpoint->Host)) { + mdb->host = rd_tmpabuf_alloc( + tbuf, RD_KAFKAP_STR_LEN(&NodeEndpoint->Host) + 1); + rd_snprintf(mdb->host, + RD_KAFKAP_STR_LEN(&NodeEndpoint->Host) + 1, + "%.*s", + RD_KAFKAP_STR_PR(&NodeEndpoint->Host)); + } + mdb->port = 
NodeEndpoints->NodeEndpoints[i].Port; + + /* Metadata internal fields */ + mdbi->id = mdb->id; + mdbi->rack_id = NULL; + } + + qsort(mdi->brokers, md->broker_cnt, sizeof(mdi->brokers[0]), + rd_kafka_metadata_broker_internal_cmp); + memcpy(mdi->brokers_sorted, md->brokers, + sizeof(*mdi->brokers_sorted) * md->broker_cnt); + qsort(mdi->brokers_sorted, md->broker_cnt, sizeof(*mdi->brokers_sorted), + rd_kafka_metadata_broker_cmp); +} + +void rd_kafkap_leader_discovery_set_topic_cnt(rd_tmpabuf_t *tbuf, + rd_kafka_metadata_internal_t *mdi, + int topic_cnt) { + + rd_kafka_metadata_t *md = &mdi->metadata; + + md->topic_cnt = topic_cnt; + md->topics = rd_tmpabuf_alloc(tbuf, sizeof(*md->topics) * topic_cnt); + mdi->topics = rd_tmpabuf_alloc(tbuf, sizeof(*mdi->topics) * topic_cnt); +} + +void rd_kafkap_leader_discovery_set_topic(rd_tmpabuf_t *tbuf, + rd_kafka_metadata_internal_t *mdi, + int topic_idx, + rd_kafka_Uuid_t topic_id, + char *topic_name, + int partition_cnt) { + + rd_kafka_metadata_t *md = &mdi->metadata; + rd_kafka_metadata_topic_t *mdt = &md->topics[topic_idx]; + rd_kafka_metadata_topic_internal_t *mdti = &mdi->topics[topic_idx]; + + memset(mdt, 0, sizeof(*mdt)); + mdt->topic = + topic_name ? rd_tmpabuf_alloc(tbuf, strlen(topic_name) + 1) : NULL; + mdt->partition_cnt = partition_cnt; + mdt->partitions = + rd_tmpabuf_alloc(tbuf, sizeof(*mdt->partitions) * partition_cnt); + + if (topic_name) + rd_snprintf(mdt->topic, strlen(topic_name) + 1, "%s", + topic_name); + + memset(mdti, 0, sizeof(*mdti)); + mdti->partitions = + rd_tmpabuf_alloc(tbuf, sizeof(*mdti->partitions) * partition_cnt); + mdti->topic_id = topic_id; + mdti->topic_authorized_operations = -1; +} + +void rd_kafkap_leader_discovery_set_CurrentLeader( + rd_tmpabuf_t *tbuf, + rd_kafka_metadata_internal_t *mdi, + int topic_idx, + int partition_idx, + int32_t partition_id, + rd_kafkap_CurrentLeader_t *CurrentLeader) { + + rd_kafka_metadata_t *md = &mdi->metadata; + rd_kafka_metadata_partition_t *mdp = + &md->topics[topic_idx].partitions[partition_idx]; + rd_kafka_metadata_partition_internal_t *mdpi = + &mdi->topics[topic_idx].partitions[partition_idx]; + + memset(mdp, 0, sizeof(*mdp)); + mdp->id = partition_id; + mdp->leader = CurrentLeader->LeaderId, + + memset(mdpi, 0, sizeof(*mdpi)); + mdpi->id = partition_id; + mdpi->leader_epoch = CurrentLeader->LeaderEpoch; +} +/**@}*/ + +static int rd_kafkap_Produce_reply_tags_partition_parse( + rd_kafka_buf_t *rkbuf, + uint64_t tagtype, + uint64_t taglen, + rd_kafkap_Produce_reply_tags_t *ProduceTags, + rd_kafkap_Produce_reply_tags_Partition_t *PartitionTags) { + switch (tagtype) { + case 0: /* CurrentLeader */ + if (rd_kafka_buf_read_CurrentLeader( + rkbuf, &PartitionTags->CurrentLeader) == -1) + goto err_parse; + ProduceTags->leader_change_cnt++; + return 1; + default: + return 0; + } +err_parse: + return -1; +} + +static int +rd_kafkap_Produce_reply_tags_parse(rd_kafka_buf_t *rkbuf, + uint64_t tagtype, + uint64_t taglen, + rd_kafkap_Produce_reply_tags_t *tags) { + switch (tagtype) { + case 0: /* NodeEndpoints */ + if (rd_kafka_buf_read_NodeEndpoints(rkbuf, + &tags->NodeEndpoints) == -1) + goto err_parse; + return 1; + default: + return 0; + } +err_parse: + return -1; +} + +static void rd_kafka_handle_Produce_metadata_update( + rd_kafka_broker_t *rkb, + rd_kafkap_Produce_reply_tags_t *ProduceTags) { + if (ProduceTags->leader_change_cnt) { + rd_kafka_metadata_t *md = NULL; + rd_kafka_metadata_internal_t *mdi = NULL; + rd_kafkap_Produce_reply_tags_Partition_t *Partition; + rd_tmpabuf_t tbuf; 
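        /*
         * Aside: the tmpabuf used below follows a two-phase pattern:
         * every allocation is first declared (the
         * rd_tmpabuf_add_alloc* calls), the buffer is then sized once
         * (rd_tmpabuf_finalize), and only afterwards are the same
         * allocations carved out of it, so the whole metadata object
         * is released with a single free. A minimal standalone sketch
         * with illustrative names (error handling omitted):
         *
         *     #include <stdlib.h>
         *     #include <stddef.h>
         *
         *     typedef struct { size_t need, of; char *buf; } tmpabuf;
         *
         *     static void add_alloc(tmpabuf *t, size_t sz) {
         *             t->need += (sz + 7) & ~(size_t)7;  // 8-byte align
         *     }
         *     static void finalize(tmpabuf *t) {
         *             t->buf = malloc(t->need);  // may be NULL
         *     }
         *     static void *alloc(tmpabuf *t, size_t sz) {
         *             void *p = t->buf + t->of;
         *             t->of += (sz + 7) & ~(size_t)7;
         *             return p;
         *     }
         */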
+ int32_t nodeid; + rd_kafka_op_t *rko; + + rd_kafka_broker_lock(rkb); + nodeid = rkb->rkb_nodeid; + rd_kafka_broker_unlock(rkb); + + rd_tmpabuf_new(&tbuf, 0, rd_true /*assert on fail*/); + rd_tmpabuf_add_alloc(&tbuf, sizeof(*mdi)); + rd_kafkap_leader_discovery_tmpabuf_add_alloc_brokers( + &tbuf, &ProduceTags->NodeEndpoints); + rd_kafkap_leader_discovery_tmpabuf_add_alloc_topics(&tbuf, 1); + rd_kafkap_leader_discovery_tmpabuf_add_alloc_topic( + &tbuf, ProduceTags->Topic.TopicName, 1); + rd_tmpabuf_finalize(&tbuf); + + mdi = rd_tmpabuf_alloc(&tbuf, sizeof(*mdi)); + md = &mdi->metadata; + + rd_kafkap_leader_discovery_metadata_init(mdi, nodeid); + + rd_kafkap_leader_discovery_set_brokers( + &tbuf, mdi, &ProduceTags->NodeEndpoints); + + rd_kafkap_leader_discovery_set_topic_cnt(&tbuf, mdi, 1); + + rd_kafkap_leader_discovery_set_topic( + &tbuf, mdi, 0, RD_KAFKA_UUID_ZERO, + ProduceTags->Topic.TopicName, 1); + + Partition = &ProduceTags->Topic.Partition; + rd_kafkap_leader_discovery_set_CurrentLeader( + &tbuf, mdi, 0, 0, Partition->Partition, + &Partition->CurrentLeader); + + rko = rd_kafka_op_new(RD_KAFKA_OP_METADATA_UPDATE); + rko->rko_u.metadata.md = md; + rko->rko_u.metadata.mdi = mdi; + rd_kafka_q_enq(rkb->rkb_rk->rk_ops, rko); + } +} + +static void rd_kafkap_Produce_reply_tags_destroy( + rd_kafkap_Produce_reply_tags_t *reply_tags) { + RD_IF_FREE(reply_tags->Topic.TopicName, rd_free); + RD_IF_FREE(reply_tags->NodeEndpoints.NodeEndpoints, rd_free); +} + /** * @brief Parses a Produce reply. @@ -2468,7 +3703,7 @@ rd_kafka_handle_Produce_parse(rd_kafka_broker_t *rkb, rd_kafka_toppar_t *rktp, rd_kafka_buf_t *rkbuf, rd_kafka_buf_t *request, - struct rd_kafka_Produce_result *result) { + rd_kafka_Produce_result_t *result) { int32_t TopicArrayCnt; int32_t PartitionArrayCnt; struct { @@ -2476,10 +3711,12 @@ rd_kafka_handle_Produce_parse(rd_kafka_broker_t *rkb, int16_t ErrorCode; int64_t Offset; } hdr; - const int log_decode_errors = LOG_ERR; - int64_t log_start_offset = -1; + const int log_decode_errors = LOG_ERR; + int64_t log_start_offset = -1; + rd_kafkap_str_t TopicName = RD_ZERO_INIT; + rd_kafkap_Produce_reply_tags_t ProduceTags = RD_ZERO_INIT; - rd_kafka_buf_read_i32(rkbuf, &TopicArrayCnt); + rd_kafka_buf_read_arraycnt(rkbuf, &TopicArrayCnt, RD_KAFKAP_TOPICS_MAX); if (TopicArrayCnt != 1) goto err; @@ -2487,8 +3724,12 @@ rd_kafka_handle_Produce_parse(rd_kafka_broker_t *rkb, * request we assume that the reply only contains one topic+partition * and that it is the same that we requested. * If not the broker is buggy. 
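 *
 * Aside: the v10+ reply tags consumed below (CurrentLeader at the
 * partition level, NodeEndpoints at the response level) use the
 * flexible-versions tagged-field layout: per tag an unsigned-varint
 * type and length, then the payload; unknown tags are skipped by
 * length. A compact sketch of that loop with a hypothetical callback
 * contract matching the parsers above (1 handled, 0 skip, -1 error):
 *
 *     #include <stdint.h>
 *
 *     static uint64_t read_uvarint(const uint8_t **p) {
 *             uint64_t v = 0;
 *             int shift = 0;
 *             while (**p & 0x80) {
 *                     v |= (uint64_t)(**p & 0x7f) << shift;
 *                     (*p)++;
 *                     shift += 7;
 *             }
 *             v |= (uint64_t)(**p) << shift;
 *             (*p)++;
 *             return v;
 *     }
 *
 *     typedef int (*tag_cb)(uint64_t tagtype, uint64_t taglen,
 *                           void *opaque);
 *
 *     static int read_tags(const uint8_t **p, tag_cb cb, void *opaque) {
 *             uint64_t i, tag_cnt = read_uvarint(p);
 *             for (i = 0; i < tag_cnt; i++) {
 *                     uint64_t tagtype = read_uvarint(p);
 *                     uint64_t taglen  = read_uvarint(p);
 *                     int r = cb(tagtype, taglen, opaque);
 *                     if (r < 0)
 *                             return -1;
 *                     if (r == 0)
 *                             *p += taglen;  // unknown tag: skip payload
 *             }
 *             return 0;
 *     }
 *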
*/ - rd_kafka_buf_skip_str(rkbuf); - rd_kafka_buf_read_i32(rkbuf, &PartitionArrayCnt); + if (request->rkbuf_reqhdr.ApiVersion >= 10) + rd_kafka_buf_read_str(rkbuf, &TopicName); + else + rd_kafka_buf_skip_str(rkbuf); + rd_kafka_buf_read_arraycnt(rkbuf, &PartitionArrayCnt, + RD_KAFKAP_PARTITIONS_MAX); if (PartitionArrayCnt != 1) goto err; @@ -2506,6 +3747,57 @@ rd_kafka_handle_Produce_parse(rd_kafka_broker_t *rkb, if (request->rkbuf_reqhdr.ApiVersion >= 5) rd_kafka_buf_read_i64(rkbuf, &log_start_offset); + if (request->rkbuf_reqhdr.ApiVersion >= 8) { + int i; + int32_t RecordErrorsCnt; + rd_kafkap_str_t ErrorMessage; + rd_kafka_buf_read_arraycnt(rkbuf, &RecordErrorsCnt, -1); + if (RecordErrorsCnt) { + result->record_errors = rd_calloc( + RecordErrorsCnt, sizeof(*result->record_errors)); + result->record_errors_cnt = RecordErrorsCnt; + for (i = 0; i < RecordErrorsCnt; i++) { + int32_t BatchIndex; + rd_kafkap_str_t BatchIndexErrorMessage; + rd_kafka_buf_read_i32(rkbuf, &BatchIndex); + rd_kafka_buf_read_str(rkbuf, + &BatchIndexErrorMessage); + result->record_errors[i].batch_index = + BatchIndex; + if (!RD_KAFKAP_STR_IS_NULL( + &BatchIndexErrorMessage)) + result->record_errors[i].errstr = + RD_KAFKAP_STR_DUP( + &BatchIndexErrorMessage); + /* RecordError tags */ + rd_kafka_buf_skip_tags(rkbuf); + } + } + + rd_kafka_buf_read_str(rkbuf, &ErrorMessage); + if (!RD_KAFKAP_STR_IS_NULL(&ErrorMessage)) + result->errstr = RD_KAFKAP_STR_DUP(&ErrorMessage); + } + + if (request->rkbuf_reqhdr.ApiVersion >= 10) { + rd_kafkap_Produce_reply_tags_Topic_t *TopicTags = + &ProduceTags.Topic; + rd_kafkap_Produce_reply_tags_Partition_t *PartitionTags = + &TopicTags->Partition; + + /* Partition tags count */ + TopicTags->TopicName = RD_KAFKAP_STR_DUP(&TopicName); + PartitionTags->Partition = hdr.Partition; + } + + /* Partition tags */ + rd_kafka_buf_read_tags(rkbuf, + rd_kafkap_Produce_reply_tags_partition_parse, + &ProduceTags, &ProduceTags.Topic.Partition); + + /* Topic tags */ + rd_kafka_buf_skip_tags(rkbuf); + if (request->rkbuf_reqhdr.ApiVersion >= 1) { int32_t Throttle_Time; rd_kafka_buf_read_i32(rkbuf, &Throttle_Time); @@ -2514,12 +3806,19 @@ rd_kafka_handle_Produce_parse(rd_kafka_broker_t *rkb, Throttle_Time); } + /* ProduceResponse tags */ + rd_kafka_buf_read_tags(rkbuf, rd_kafkap_Produce_reply_tags_parse, + &ProduceTags); + rd_kafka_handle_Produce_metadata_update(rkb, &ProduceTags); + + rd_kafkap_Produce_reply_tags_destroy(&ProduceTags); return hdr.ErrorCode; - err_parse: + rd_kafkap_Produce_reply_tags_destroy(&ProduceTags); return rkbuf->rkbuf_err; err: + rd_kafkap_Produce_reply_tags_destroy(&ProduceTags); return RD_KAFKA_RESP_ERR__BAD_MSG; } @@ -3100,17 +4399,12 @@ static int rd_kafka_handle_Produce_error(rd_kafka_broker_t *rkb, * which should not be treated as a fatal error * since this request and sub-sequent requests * will be retried and thus return to order. - * Unless the error was a timeout, or similar, - * in which case the request might have made it - * and the messages are considered possibly persisted: - * in this case we allow the next in-flight response - * to be successful, in which case we mark - * this request's messages as succesfully delivered. */ - if (perr->status & - RD_KAFKA_MSG_STATUS_POSSIBLY_PERSISTED) - perr->update_next_ack = rd_true; - else - perr->update_next_ack = rd_false; + * In case the message is possibly persisted + * we still treat it as not persisted, + * expecting DUPLICATE_SEQUENCE_NUMBER + * in case it was persisted or NO_ERROR in case + * it wasn't. 
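 *
 * Aside: under the idempotent producer every batch carries a base
 * sequence and the broker deduplicates per (producer id, epoch,
 * partition), which is what makes the "expect
 * DUPLICATE_SEQUENCE_NUMBER on a persisted retry" reasoning above
 * sound. A deliberately simplified toy of the broker-side rule:
 *
 *     #include <stdint.h>
 *
 *     // 0 = appended, 1 = duplicate (already appended), -1 = gap.
 *     static int try_append(int32_t *next_seq, int32_t base_seq) {
 *             if (base_seq < *next_seq)
 *                     return 1;   // DUPLICATE_SEQUENCE_NUMBER
 *             if (base_seq > *next_seq)
 *                     return -1;  // OUT_OF_ORDER_SEQUENCE_NUMBER
 *             (*next_seq)++;      // one-batch append for the toy
 *             return 0;           // NO_ERROR
 *     }
 *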
*/
+                        perr->update_next_ack = rd_false;
                         perr->update_next_err = rd_true;

                         /* Drain outstanding requests so that retries
@@ -3338,6 +4632,59 @@ rd_kafka_handle_idempotent_Produce_success(rd_kafka_broker_t *rkb,
                     rk, RD_KAFKA_RESP_ERR__INCONSISTENT, "%s", fatal_err);
 }

+/**
+ * @brief Set \p batch error codes, corresponding to the indices that caused
+ *        the error in 'presult->record_errors', to INVALID_RECORD and
+ *        the rest to _INVALID_DIFFERENT_RECORD.
+ *
+ * @param presult Produce result structure
+ * @param batch Batch of messages
+ *
+ * @locks none
+ * @locality broker thread (but not necessarily the leader broker thread)
+ */
+static void rd_kafka_msgbatch_handle_Produce_result_record_errors(
+    const rd_kafka_Produce_result_t *presult,
+    rd_kafka_msgbatch_t *batch) {
+        rd_kafka_msg_t *rkm = TAILQ_FIRST(&batch->msgq.rkmq_msgs);
+        if (presult->record_errors) {
+                int i = 0, j = 0;
+                while (rkm) {
+                        if (j < presult->record_errors_cnt &&
+                            presult->record_errors[j].batch_index == i) {
+                                rkm->rkm_u.producer.errstr =
+                                    presult->record_errors[j].errstr;
+                                /* If the batch contained only a single record
+                                 * error, then we can unambiguously use the
+                                 * error corresponding to the partition-level
+                                 * error code. */
+                                if (presult->record_errors_cnt > 1)
+                                        rkm->rkm_err =
+                                            RD_KAFKA_RESP_ERR_INVALID_RECORD;
+                                j++;
+                        } else {
+                                /* If the response contains record errors, then
+                                 * the records which failed validation will be
+                                 * present in the response. To avoid confusion
+                                 * for the remaining records, we return a
+                                 * generic error code. */
+                                rkm->rkm_u.producer.errstr =
+                                    "Failed to append record because it was "
+                                    "part of a batch "
+                                    "which had one or more invalid records";
+                                rkm->rkm_err =
+                                    RD_KAFKA_RESP_ERR__INVALID_DIFFERENT_RECORD;
+                        }
+                        rkm = TAILQ_NEXT(rkm, rkm_link);
+                        i++;
+                }
+        } else if (presult->errstr) {
+                while (rkm) {
+                        rkm->rkm_u.producer.errstr = presult->errstr;
+                        rkm = TAILQ_NEXT(rkm, rkm_link);
+                }
+        }
+}

 /**
  * @brief Handle ProduceRequest result for a message batch.
@@ -3351,7 +4698,7 @@ static void rd_kafka_msgbatch_handle_Produce_result(
     rd_kafka_broker_t *rkb,
     rd_kafka_msgbatch_t *batch,
     rd_kafka_resp_err_t err,
-    const struct rd_kafka_Produce_result *presult,
+    const rd_kafka_Produce_result_t *presult,
     const rd_kafka_buf_t *request) {
     rd_kafka_t *rk = rkb->rkb_rk;
@@ -3391,7 +4738,7 @@ static void rd_kafka_msgbatch_handle_Produce_result(
         .err = err,
         .incr_retry = 1,
         .status = status,
-        .update_next_ack = rd_true,
+        .update_next_ack = rd_false,
         .update_next_err = rd_true,
         .last_seq = (batch->first_seq + rd_kafka_msgq_len(&batch->msgq) - 1)};
@@ -3420,8 +4767,11 @@ static void rd_kafka_msgbatch_handle_Produce_result(
             presult->offset, presult->timestamp, status);

+        /* Change error codes if necessary */
+        rd_kafka_msgbatch_handle_Produce_result_record_errors(presult,
+                                                              batch);
         /* Enqueue messages for delivery report.
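 *
 * Aside: the record-error mapping applied just above is a linear
 * merge of the batch's messages (cursor i) against the sorted
 * record_errors array (cursor j), so the pass stays O(n) in the batch
 * size. The same two-cursor pattern in isolation:
 *
 *     #include <stddef.h>
 *
 *     // Set flagged[i] = 1 for every index listed in errs
 *     // (ascending), 0 otherwise.
 *     static void mark_errors(char *flagged, size_t n,
 *                             const int *errs, size_t errs_cnt) {
 *             size_t i, j = 0;
 *             for (i = 0; i < n; i++) {
 *                     if (j < errs_cnt && (size_t)errs[j] == i) {
 *                             flagged[i] = 1;
 *                             j++;
 *                     } else {
 *                             flagged[i] = 0;
 *                     }
 *             }
 *     }
 *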
*/ - rd_kafka_dr_msgq(rktp->rktp_rkt, &batch->msgq, err); + rd_kafka_dr_msgq0(rktp->rktp_rkt, &batch->msgq, err, presult); } if (rd_kafka_is_idempotent(rk) && last_inflight) @@ -3449,10 +4799,10 @@ static void rd_kafka_handle_Produce(rd_kafka_t *rk, rd_kafka_buf_t *reply, rd_kafka_buf_t *request, void *opaque) { - rd_kafka_msgbatch_t *batch = &request->rkbuf_batch; - rd_kafka_toppar_t *rktp = batch->rktp; - struct rd_kafka_Produce_result result = { - .offset = RD_KAFKA_OFFSET_INVALID, .timestamp = -1}; + rd_kafka_msgbatch_t *batch = &request->rkbuf_batch; + rd_kafka_toppar_t *rktp = batch->rktp; + rd_kafka_Produce_result_t *result = + rd_kafka_Produce_result_new(RD_KAFKA_OFFSET_INVALID, -1); /* Unit test interface: inject errors */ if (unlikely(rk->rk_conf.ut.handle_ProduceResponse != NULL)) { @@ -3463,10 +4813,11 @@ static void rd_kafka_handle_Produce(rd_kafka_t *rk, /* Parse Produce reply (unless the request errored) */ if (!err && reply) err = rd_kafka_handle_Produce_parse(rkb, rktp, reply, request, - &result); + result); - rd_kafka_msgbatch_handle_Produce_result(rkb, batch, err, &result, + rd_kafka_msgbatch_handle_Produce_result(rkb, batch, err, result, request); + rd_kafka_Produce_result_destroy(result); } @@ -3815,10 +5166,14 @@ rd_kafka_DeleteRecordsRequest(rd_kafka_broker_t *rkb, rkbuf = rd_kafka_buf_new_request(rkb, RD_KAFKAP_DeleteRecords, 1, 4 + (partitions->cnt * 100) + 4); + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_OFFSET, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; rd_kafka_buf_write_topic_partitions( rkbuf, partitions, rd_false /*don't skip invalid offsets*/, - rd_false /*any offset*/, rd_true /*do write offsets*/, - rd_false /*don't write epoch*/, rd_false /*don't write metadata*/); + rd_false /*any offset*/, rd_false /*don't use topic id*/, + rd_true /*use topic name*/, fields); /* timeout */ op_timeout = rd_kafka_confval_get_int(&options->operation_timeout); @@ -3975,7 +5330,7 @@ rd_kafka_AlterConfigsRequest(rd_kafka_broker_t *rkb, } ApiVersion = rd_kafka_broker_ApiVersion_supported( - rkb, RD_KAFKAP_AlterConfigs, 0, 0, NULL); + rkb, RD_KAFKAP_AlterConfigs, 0, 2, NULL); if (ApiVersion == -1) { rd_snprintf(errstr, errstr_size, "AlterConfigs (KIP-133) not supported " @@ -3984,56 +5339,39 @@ rd_kafka_AlterConfigsRequest(rd_kafka_broker_t *rkb, return RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE; } - /* incremental requires ApiVersion > FIXME */ - if (ApiVersion < 1 /* FIXME */ && - rd_kafka_confval_get_int(&options->incremental)) { - rd_snprintf(errstr, errstr_size, - "AlterConfigs.incremental=true (KIP-248) " - "not supported by broker, " - "requires broker version >= 2.0.0"); - rd_kafka_replyq_destroy(&replyq); - return RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE; - } + rkbuf = rd_kafka_buf_new_flexver_request(rkb, RD_KAFKAP_AlterConfigs, 1, + rd_list_cnt(configs) * 200, + ApiVersion >= 2); - rkbuf = rd_kafka_buf_new_request(rkb, RD_KAFKAP_AlterConfigs, 1, - rd_list_cnt(configs) * 200); - - /* #resources */ - rd_kafka_buf_write_i32(rkbuf, rd_list_cnt(configs)); + /* #Resources */ + rd_kafka_buf_write_arraycnt(rkbuf, rd_list_cnt(configs)); RD_LIST_FOREACH(config, configs, i) { const rd_kafka_ConfigEntry_t *entry; int ei; - /* resource_type */ - rd_kafka_buf_write_i8(rkbuf, config->restype); + /* ResourceType */ + rd_kafka_buf_write_i8( + rkbuf, rd_kafka_ResourceType_to_ConfigResourceType( + config->restype)); - /* resource_name */ + /* ResourceName */ rd_kafka_buf_write_str(rkbuf, config->name, -1); - 
/* #config */ - rd_kafka_buf_write_i32(rkbuf, rd_list_cnt(&config->config)); + /* #Configs */ + rd_kafka_buf_write_arraycnt(rkbuf, + rd_list_cnt(&config->config)); RD_LIST_FOREACH(entry, &config->config, ei) { - /* config_name */ + /* Name */ rd_kafka_buf_write_str(rkbuf, entry->kv->name, -1); - /* config_value (nullable) */ + /* Value (nullable) */ rd_kafka_buf_write_str(rkbuf, entry->kv->value, -1); - if (ApiVersion == 1) - rd_kafka_buf_write_i8(rkbuf, - entry->a.operation); - else if (entry->a.operation != RD_KAFKA_ALTER_OP_SET) { - rd_snprintf(errstr, errstr_size, - "Broker version >= 2.0.0 required " - "for add/delete config " - "entries: only set supported " - "by this broker"); - rd_kafka_buf_destroy(rkbuf); - rd_kafka_replyq_destroy(&replyq); - return RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE; - } + rd_kafka_buf_write_tags_empty(rkbuf); } + + rd_kafka_buf_write_tags_empty(rkbuf); } /* timeout */ @@ -4053,6 +5391,91 @@ rd_kafka_AlterConfigsRequest(rd_kafka_broker_t *rkb, } +rd_kafka_resp_err_t rd_kafka_IncrementalAlterConfigsRequest( + rd_kafka_broker_t *rkb, + const rd_list_t *configs /*(ConfigResource_t*)*/, + rd_kafka_AdminOptions_t *options, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { + rd_kafka_buf_t *rkbuf; + int16_t ApiVersion = 0; + int i; + const rd_kafka_ConfigResource_t *config; + int op_timeout; + + if (rd_list_cnt(configs) == 0) { + rd_snprintf(errstr, errstr_size, + "No config resources specified"); + rd_kafka_replyq_destroy(&replyq); + return RD_KAFKA_RESP_ERR__INVALID_ARG; + } + + ApiVersion = rd_kafka_broker_ApiVersion_supported( + rkb, RD_KAFKAP_IncrementalAlterConfigs, 0, 1, NULL); + if (ApiVersion == -1) { + rd_snprintf(errstr, errstr_size, + "IncrementalAlterConfigs (KIP-339) not supported " + "by broker, requires broker version >= 2.3.0"); + rd_kafka_replyq_destroy(&replyq); + return RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE; + } + + rkbuf = rd_kafka_buf_new_flexver_request( + rkb, RD_KAFKAP_IncrementalAlterConfigs, 1, + rd_list_cnt(configs) * 200, ApiVersion >= 1); + + /* #Resources */ + rd_kafka_buf_write_arraycnt(rkbuf, rd_list_cnt(configs)); + + RD_LIST_FOREACH(config, configs, i) { + const rd_kafka_ConfigEntry_t *entry; + int ei; + + /* ResourceType */ + rd_kafka_buf_write_i8( + rkbuf, rd_kafka_ResourceType_to_ConfigResourceType( + config->restype)); + + /* ResourceName */ + rd_kafka_buf_write_str(rkbuf, config->name, -1); + + /* #Configs */ + rd_kafka_buf_write_arraycnt(rkbuf, + rd_list_cnt(&config->config)); + + RD_LIST_FOREACH(entry, &config->config, ei) { + /* Name */ + rd_kafka_buf_write_str(rkbuf, entry->kv->name, -1); + /* ConfigOperation */ + rd_kafka_buf_write_i8(rkbuf, entry->a.op_type); + /* Value (nullable) */ + rd_kafka_buf_write_str(rkbuf, entry->kv->value, -1); + + rd_kafka_buf_write_tags_empty(rkbuf); + } + + rd_kafka_buf_write_tags_empty(rkbuf); + } + + /* timeout */ + op_timeout = rd_kafka_confval_get_int(&options->operation_timeout); + if (op_timeout > rkb->rkb_rk->rk_conf.socket_timeout_ms) + rd_kafka_buf_set_abs_timeout(rkbuf, op_timeout + 1000, 0); + + /* ValidateOnly */ + rd_kafka_buf_write_i8( + rkbuf, rd_kafka_confval_get_int(&options->validate_only)); + + rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); + + rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} + /** * @brief Construct and send DescribeConfigsRequest to \p rkb * with the configs (ConfigResource_t*) in \p configs, using @@ -4108,7 +5531,9 @@ 
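rd_kafka_IncrementalAlterConfigsRequest() is reached through the public IncrementalAlterConfigs Admin API; a minimal caller sketch follows, assuming a hypothetical topic name and default AdminOptions, with error handling elided:

#include <librdkafka/rdkafka.h>

/* Sketch: incrementally SET one topic property (KIP-339) without
 * touching the rest of the topic configuration, which is the point of
 * this RPC versus the legacy AlterConfigs. */
static void set_retention(rd_kafka_t *rk) {
        rd_kafka_queue_t *q = rd_kafka_queue_new(rk);
        rd_kafka_ConfigResource_t *res =
            rd_kafka_ConfigResource_new(RD_KAFKA_RESOURCE_TOPIC, "my-topic");
        rd_kafka_event_t *rkev;

        rd_kafka_ConfigResource_add_incremental_config(
            res, "retention.ms", RD_KAFKA_ALTER_CONFIG_OP_TYPE_SET,
            "86400000");

        rd_kafka_IncrementalAlterConfigs(rk, &res, 1, NULL, q);

        rkev = rd_kafka_queue_poll(q, 10 * 1000);
        if (rkev) {
                /* Inspect rd_kafka_event_error(rkev) and the
                 * per-resource results, then release the event. */
                rd_kafka_event_destroy(rkev);
        }

        rd_kafka_ConfigResource_destroy(res);
        rd_kafka_queue_destroy(q);
}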
rd_kafka_resp_err_t rd_kafka_DescribeConfigsRequest( int ei; /* resource_type */ - rd_kafka_buf_write_i8(rkbuf, config->restype); + rd_kafka_buf_write_i8( + rkbuf, rd_kafka_ResourceType_to_ConfigResourceType( + config->restype)); /* resource_name */ rd_kafka_buf_write_str(rkbuf, config->name, -1); @@ -4557,6 +5982,150 @@ rd_kafka_DeleteAclsRequest(rd_kafka_broker_t *rkb, return RD_KAFKA_RESP_ERR_NO_ERROR; } +/** + * @brief Construct and send ElectLeadersRequest to \p rkb + * with the partitions (ElectLeaders_t*) in \p elect_leaders, using + * \p options. + * + * The response (unparsed) will be enqueued on \p replyq + * for handling by \p resp_cb (with \p opaque passed). + * + * @returns RD_KAFKA_RESP_ERR_NO_ERROR if the request was enqueued for + * transmission, otherwise an error code and errstr will be + * updated with a human readable error string. + */ +rd_kafka_resp_err_t rd_kafka_ElectLeadersRequest( + rd_kafka_broker_t *rkb, + const rd_list_t *elect_leaders /*(rd_kafka_ElectLeaders_t*)*/, + rd_kafka_AdminOptions_t *options, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { + rd_kafka_buf_t *rkbuf; + int16_t ApiVersion; + const rd_kafka_ElectLeaders_t *elect_leaders_request; + int rd_buf_size_estimate; + int op_timeout; + + if (rd_list_cnt(elect_leaders) == 0) { + rd_snprintf(errstr, errstr_size, + "No partitions specified for leader election"); + rd_kafka_replyq_destroy(&replyq); + return RD_KAFKA_RESP_ERR__INVALID_ARG; + } + + elect_leaders_request = rd_list_elem(elect_leaders, 0); + + ApiVersion = rd_kafka_broker_ApiVersion_supported( + rkb, RD_KAFKAP_ElectLeaders, 0, 2, NULL); + if (ApiVersion == -1) { + rd_snprintf(errstr, errstr_size, + "ElectLeaders Admin API (KIP-460) not supported " + "by broker, requires broker version >= 2.4.0"); + rd_kafka_replyq_destroy(&replyq); + return RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE; + } + + rd_buf_size_estimate = + 1 /* ElectionType */ + 4 /* #TopicPartitions */ + 4 /* TimeoutMs */; + if (elect_leaders_request->partitions) + rd_buf_size_estimate += + (50 + 4) * elect_leaders_request->partitions->cnt; + rkbuf = rd_kafka_buf_new_flexver_request(rkb, RD_KAFKAP_ElectLeaders, 1, + rd_buf_size_estimate, + ApiVersion >= 2); + + if (ApiVersion >= 1) { + /* Election type */ + rd_kafka_buf_write_i8(rkbuf, + elect_leaders_request->election_type); + } + + /* Write partition list */ + if (elect_leaders_request->partitions) { + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; + rd_kafka_buf_write_topic_partitions( + rkbuf, elect_leaders_request->partitions, + rd_false /*don't skip invalid offsets*/, + rd_false /* any offset */, + rd_false /* don't use topic_id */, + rd_true /* use topic_names */, fields); + } else { + rd_kafka_buf_write_arraycnt(rkbuf, -1); + } + + /* timeout */ + op_timeout = rd_kafka_confval_get_int(&options->operation_timeout); + rd_kafka_buf_write_i32(rkbuf, op_timeout); + + if (op_timeout > rkb->rkb_rk->rk_conf.socket_timeout_ms) + rd_kafka_buf_set_abs_timeout(rkbuf, op_timeout + 1000, 0); + + rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); + + rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} + +/** + * @brief Construct and send ConsumerGroupDescribe requests + * to \p rkb with the groups (const char *) in \p groups. + * Uses \p include_authorized_operations to get + * group ACL authorized operations.
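The corresponding public entry point wraps the election type and an optional partition list in an ElectLeaders object. A sketch of a preferred-leader election follows; the topic, partition, and the result-event name noted in the comment are assumptions:

#include <librdkafka/rdkafka.h>

/* Sketch: preferred-leader election for one partition; passing NULL
 * instead of a partition list requests an election for all partitions
 * (the arraycnt -1 case serialized above). */
static void elect_preferred(rd_kafka_t *rk) {
        rd_kafka_queue_t *q = rd_kafka_queue_new(rk);
        rd_kafka_topic_partition_list_t *parts =
            rd_kafka_topic_partition_list_new(1);
        rd_kafka_ElectLeaders_t *req;

        rd_kafka_topic_partition_list_add(parts, "my-topic", 0);
        req = rd_kafka_ElectLeaders_new(RD_KAFKA_ELECTION_TYPE_PREFERRED,
                                        parts);

        rd_kafka_ElectLeaders(rk, req, NULL /*default options*/, q);
        /* Await RD_KAFKA_EVENT_ELECTLEADERS_RESULT on q. */

        rd_kafka_ElectLeaders_destroy(req);
        rd_kafka_topic_partition_list_destroy(parts);
        rd_kafka_queue_destroy(q);
}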
+ * + * The response (unparsed) will be enqueued on \p replyq + * for handling by \p resp_cb (with \p opaque passed). + * + * @return NULL on success, a new error instance that must be + * released with rd_kafka_error_destroy() in case of error. + */ +rd_kafka_error_t * +rd_kafka_ConsumerGroupDescribeRequest(rd_kafka_broker_t *rkb, + char **groups, + size_t group_cnt, + rd_bool_t include_authorized_operations, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { + rd_kafka_buf_t *rkbuf; + size_t i; + + int16_t ApiVersion = rd_kafka_broker_ApiVersion_supported( + rkb, RD_KAFKAP_ConsumerGroupDescribe, 0, 0, NULL); + + if (ApiVersion == -1) { + return rd_kafka_error_new( + RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE, + "ConsumerGroupDescribe (KIP-848) " + "not supported by broker, " + "requires broker version >= 4.0.0"); + } + + rkbuf = rd_kafka_buf_new_flexver_request( + rkb, RD_KAFKAP_ConsumerGroupDescribe, 1, + 4 /* rd_kafka_buf_write_arraycnt_pos */ + + 1 /* IncludeAuthorizedOperations */ + 1 /* tags */ + + 32 * group_cnt /* Groups */, + rd_true /* flexver */); + + rd_kafka_buf_write_arraycnt(rkbuf, group_cnt); + + for (i = 0; i < group_cnt; i++) { + rd_kafka_buf_write_str(rkbuf, groups[i], -1); + } + + rd_kafka_buf_write_bool(rkbuf, include_authorized_operations); + rd_kafka_buf_ApiVersion_set(rkbuf, ApiVersion, 0); + rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); + + return NULL; +} + /** * @brief Parses and handles an InitProducerId reply. * @@ -4890,6 +6459,239 @@ rd_kafka_resp_err_t rd_kafka_EndTxnRequest(rd_kafka_broker_t *rkb, return RD_KAFKA_RESP_ERR_NO_ERROR; } +rd_kafka_resp_err_t +rd_kafka_GetTelemetrySubscriptionsRequest(rd_kafka_broker_t *rkb, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { + rd_kafka_buf_t *rkbuf; + int16_t ApiVersion = 0; + + ApiVersion = rd_kafka_broker_ApiVersion_supported( + rkb, RD_KAFKAP_GetTelemetrySubscriptions, 0, 0, NULL); + if (ApiVersion == -1) { + rd_snprintf(errstr, errstr_size, + "GetTelemetrySubscriptions (KIP-714) not supported " + "by broker, requires broker version >= 3.7.0"); + rd_kafka_replyq_destroy(&replyq); + return RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE; + } + + rkbuf = rd_kafka_buf_new_flexver_request( + rkb, RD_KAFKAP_GetTelemetrySubscriptions, 1, + 16 /* client_instance_id */, rd_true); + + rd_kafka_buf_write_uuid(rkbuf, + &rkb->rkb_rk->rk_telemetry.client_instance_id); + + rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} + +rd_kafka_resp_err_t +rd_kafka_PushTelemetryRequest(rd_kafka_broker_t *rkb, + rd_kafka_Uuid_t *client_instance_id, + int32_t subscription_id, + rd_bool_t terminating, + const rd_kafka_compression_t compression_type, + const void *metrics, + size_t metrics_size, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque) { + rd_kafka_buf_t *rkbuf; + int16_t ApiVersion = 0; + + ApiVersion = rd_kafka_broker_ApiVersion_supported( + rkb, RD_KAFKAP_PushTelemetry, 0, 0, NULL); + if (ApiVersion == -1) { + rd_snprintf(errstr, errstr_size, + "PushTelemetryRequest (KIP-714) not supported by broker"); + rd_kafka_replyq_destroy(&replyq); + return RD_KAFKA_RESP_ERR__UNSUPPORTED_FEATURE; + } + + size_t len = sizeof(rd_kafka_Uuid_t) + sizeof(int32_t) + + sizeof(rd_bool_t) + sizeof(compression_type) + + metrics_size; + rkbuf = rd_kafka_buf_new_flexver_request(rkb, RD_KAFKAP_PushTelemetry, + 1, len, rd_true); + +
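Both telemetry requests are driven entirely by broker-side metric subscriptions (KIP-714); the only client-side switch is a single configuration property. A sketch follows, with the property usage as the only assumption:

#include <stdio.h>
#include <librdkafka/rdkafka.h>

/* Sketch: opt out of KIP-714 metric pushes. Left at its default
 * ("true"), the client sends GetTelemetrySubscriptions and pushes
 * metrics only when the broker has a matching subscription. */
static rd_kafka_conf_t *conf_without_telemetry(void) {
        rd_kafka_conf_t *conf = rd_kafka_conf_new();
        char errstr[256];

        if (rd_kafka_conf_set(conf, "enable.metrics.push", "false", errstr,
                              sizeof(errstr)) != RD_KAFKA_CONF_OK)
                fprintf(stderr, "%s\n", errstr);
        return conf;
}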
rd_kafka_buf_write_uuid(rkbuf, client_instance_id); + rd_kafka_buf_write_i32(rkbuf, subscription_id); + rd_kafka_buf_write_bool(rkbuf, terminating); + rd_kafka_buf_write_i8(rkbuf, compression_type); + + rd_dassert(metrics != NULL); + rd_dassert(metrics_size >= 0); + rd_kafkap_bytes_t *metric_bytes = + rd_kafkap_bytes_new(metrics, metrics_size); + rd_kafka_buf_write_kbytes(rkbuf, metric_bytes); + rd_free(metric_bytes); + + rkbuf->rkbuf_max_retries = RD_KAFKA_REQUEST_NO_RETRIES; + + + /* Processing... */ + rd_kafka_broker_buf_enq_replyq(rkb, rkbuf, replyq, resp_cb, opaque); + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} + +void rd_kafka_handle_GetTelemetrySubscriptions(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + rd_kafka_resp_err_t err, + rd_kafka_buf_t *rkbuf, + rd_kafka_buf_t *request, + void *opaque) { + int16_t ErrorCode = 0; + const int log_decode_errors = LOG_ERR; + int32_t arraycnt; + size_t i; + rd_kafka_Uuid_t prev_client_instance_id = + rk->rk_telemetry.client_instance_id; + + if (err == RD_KAFKA_RESP_ERR__DESTROY) { + /* Termination */ + return; + } + + if (err) + goto err; + + rd_kafka_buf_read_throttle_time(rkbuf); + + rd_kafka_buf_read_i16(rkbuf, &ErrorCode); + + if (ErrorCode) { + err = ErrorCode; + goto err; + } + + rd_kafka_buf_read_uuid(rkbuf, &rk->rk_telemetry.client_instance_id); + rd_kafka_buf_read_i32(rkbuf, &rk->rk_telemetry.subscription_id); + + rd_kafka_dbg( + rk, TELEMETRY, "GETSUBSCRIPTIONS", "Parsing: client instance id %s", + rd_kafka_Uuid_base64str(&rk->rk_telemetry.client_instance_id)); + rd_kafka_dbg(rk, TELEMETRY, "GETSUBSCRIPTIONS", + "Parsing: subscription id %" PRId32, + rk->rk_telemetry.subscription_id); + + rd_kafka_buf_read_arraycnt(rkbuf, &arraycnt, -1); + + if (arraycnt) { + rk->rk_telemetry.accepted_compression_types_cnt = arraycnt; + rk->rk_telemetry.accepted_compression_types = + rd_calloc(arraycnt, sizeof(rd_kafka_compression_t)); + + for (i = 0; i < (size_t)arraycnt; i++) + rd_kafka_buf_read_i8( + rkbuf, + &rk->rk_telemetry.accepted_compression_types[i]); + } else { + rk->rk_telemetry.accepted_compression_types_cnt = 1; + rk->rk_telemetry.accepted_compression_types = + rd_calloc(1, sizeof(rd_kafka_compression_t)); + rk->rk_telemetry.accepted_compression_types[0] = + RD_KAFKA_COMPRESSION_NONE; + } + + rd_kafka_buf_read_i32(rkbuf, &rk->rk_telemetry.push_interval_ms); + rd_kafka_buf_read_i32(rkbuf, &rk->rk_telemetry.telemetry_max_bytes); + rd_kafka_buf_read_bool(rkbuf, &rk->rk_telemetry.delta_temporality); + + + if (rk->rk_telemetry.subscription_id && + rd_kafka_Uuid_cmp(prev_client_instance_id, + rk->rk_telemetry.client_instance_id)) { + rd_kafka_log( + rk, LOG_INFO, "GETSUBSCRIPTIONS", + "Telemetry client instance id changed from %s to %s", + rd_kafka_Uuid_base64str(&prev_client_instance_id), + rd_kafka_Uuid_base64str( + &rk->rk_telemetry.client_instance_id)); + } + + rd_kafka_dbg(rk, TELEMETRY, "GETSUBSCRIPTIONS", + "Parsing: push interval %" PRId32, + rk->rk_telemetry.push_interval_ms); + + rd_kafka_buf_read_arraycnt(rkbuf, &arraycnt, 1000); + + if (arraycnt) { + rk->rk_telemetry.requested_metrics_cnt = arraycnt; + rk->rk_telemetry.requested_metrics = + rd_calloc(arraycnt, sizeof(char *)); + + for (i = 0; i < (size_t)arraycnt; i++) { + rd_kafkap_str_t Metric; + rd_kafka_buf_read_str(rkbuf, &Metric); + rk->rk_telemetry.requested_metrics[i] = + RD_KAFKAP_STR_DUP(&Metric); + } + } + + rd_kafka_dbg(rk, TELEMETRY, "GETSUBSCRIPTIONS", + "Parsing: requested metrics count %" PRIusz, + rk->rk_telemetry.requested_metrics_cnt); + + 
rd_kafka_handle_get_telemetry_subscriptions(rk, err); + return; + +err_parse: + err = rkbuf->rkbuf_err; + goto err; + +err: + /* TODO: Add error handling actions, possibly call + * rd_kafka_handle_get_telemetry_subscriptions with error. */ + rd_kafka_handle_get_telemetry_subscriptions(rk, err); +} + +void rd_kafka_handle_PushTelemetry(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + rd_kafka_resp_err_t err, + rd_kafka_buf_t *rkbuf, + rd_kafka_buf_t *request, + void *opaque) { + const int log_decode_errors = LOG_ERR; + int16_t ErrorCode; + + if (err == RD_KAFKA_RESP_ERR__DESTROY) { + /* Termination */ + return; + } + + if (err) + goto err; + + + rd_kafka_buf_read_throttle_time(rkbuf); + + rd_kafka_buf_read_i16(rkbuf, &ErrorCode); + + if (ErrorCode) { + err = ErrorCode; + goto err; + } + rd_kafka_handle_push_telemetry(rk, err); + return; +err_parse: + err = rkbuf->rkbuf_err; + goto err; + +err: + /* TODO: Add error handling actions, possibly call + * rd_kafka_handle_push_telemetry with error. */ + rd_kafka_handle_push_telemetry(rk, err); +} + /** @@ -4943,9 +6745,9 @@ static int unittest_idempotent_producer(void) { int remaining_batches; uint64_t msgid = 1; rd_kafka_toppar_t *rktp; - rd_kafka_pid_t pid = {.id = 1000, .epoch = 0}; - struct rd_kafka_Produce_result result = {.offset = 1, - .timestamp = 1000}; + rd_kafka_pid_t pid = {.id = 1000, .epoch = 0}; + rd_kafka_Produce_result_t *result = + rd_kafka_Produce_result_new(1, 1000); rd_kafka_queue_t *rkqu; rd_kafka_event_t *rkev; rd_kafka_buf_t *request[_BATCH_CNT]; @@ -5026,8 +6828,8 @@ static int unittest_idempotent_producer(void) { RD_UT_ASSERT(r == _MSGS_PER_BATCH, "."); rd_kafka_msgbatch_handle_Produce_result(rkb, &request[i]->rkbuf_batch, RD_KAFKA_RESP_ERR_NO_ERROR, - &result, request[i]); - result.offset += r; + result, request[i]); + result->offset += r; RD_UT_ASSERT(rd_kafka_msgq_len(&rktp->rktp_msgq) == 0, "batch %d: expected no messages in rktp_msgq, not %d", i, rd_kafka_msgq_len(&rktp->rktp_msgq)); @@ -5040,7 +6842,7 @@ static int unittest_idempotent_producer(void) { RD_UT_ASSERT(r == _MSGS_PER_BATCH, "."); rd_kafka_msgbatch_handle_Produce_result( rkb, &request[i]->rkbuf_batch, - RD_KAFKA_RESP_ERR_NOT_LEADER_FOR_PARTITION, &result, request[i]); + RD_KAFKA_RESP_ERR_NOT_LEADER_FOR_PARTITION, result, request[i]); retry_msg_cnt += r; RD_UT_ASSERT(rd_kafka_msgq_len(&rktp->rktp_msgq) == retry_msg_cnt, "batch %d: expected %d messages in rktp_msgq, not %d", i, @@ -5053,8 +6855,7 @@ static int unittest_idempotent_producer(void) { RD_UT_ASSERT(r == _MSGS_PER_BATCH, "."); rd_kafka_msgbatch_handle_Produce_result( rkb, &request[i]->rkbuf_batch, - RD_KAFKA_RESP_ERR_OUT_OF_ORDER_SEQUENCE_NUMBER, &result, - request[i]); + RD_KAFKA_RESP_ERR_OUT_OF_ORDER_SEQUENCE_NUMBER, result, request[i]); retry_msg_cnt += r; RD_UT_ASSERT(rd_kafka_msgq_len(&rktp->rktp_msgq) == retry_msg_cnt, "batch %d: expected %d messages in rktp_xmit_msgq, not %d", @@ -5066,8 +6867,7 @@ static int unittest_idempotent_producer(void) { r = rd_kafka_msgq_len(&request[i]->rkbuf_batch.msgq); rd_kafka_msgbatch_handle_Produce_result( rkb, &request[i]->rkbuf_batch, - RD_KAFKA_RESP_ERR_OUT_OF_ORDER_SEQUENCE_NUMBER, &result, - request[i]); + RD_KAFKA_RESP_ERR_OUT_OF_ORDER_SEQUENCE_NUMBER, result, request[i]); retry_msg_cnt += r; RD_UT_ASSERT(rd_kafka_msgq_len(&rktp->rktp_msgq) == retry_msg_cnt, "batch %d: expected %d messages in rktp_xmit_msgq, not %d", @@ -5085,7 +6885,8 @@ static int unittest_idempotent_producer(void) { "Expected %d messages in retry queue, not %d", retry_msg_cnt, 
rd_kafka_msgq_len(&rkmq)); - /* Sleep a short while to make sure the retry backoff expires. */ + /* Sleep a short while to make sure the retry backoff expires. + */ rd_usleep(5 * 1000, NULL); /* 5ms */ /* @@ -5107,8 +6908,8 @@ static int unittest_idempotent_producer(void) { r = rd_kafka_msgq_len(&request[i]->rkbuf_batch.msgq); rd_kafka_msgbatch_handle_Produce_result( rkb, &request[i]->rkbuf_batch, RD_KAFKA_RESP_ERR_NO_ERROR, - &result, request[i]); - result.offset += r; + result, request[i]); + result->offset += r; rd_kafka_buf_destroy(request[i]); } @@ -5143,9 +6944,11 @@ static int unittest_idempotent_producer(void) { r = rd_kafka_outq_len(rk); RD_UT_ASSERT(r == 0, "expected outq to return 0, not %d", r); - /* Verify the expected number of good delivery reports were seen */ + /* Verify the expected number of good delivery reports were seen + */ RD_UT_ASSERT(drcnt == msgcnt, "expected %d DRs, not %d", msgcnt, drcnt); + rd_kafka_Produce_result_destroy(result); rd_kafka_queue_destroy(rkqu); rd_kafka_toppar_destroy(rktp); rd_kafka_broker_destroy(rkb); @@ -5155,6 +6958,85 @@ static int unittest_idempotent_producer(void) { return 0; } +/** + * @brief Test for the GetTelemetrySubscriptions response handling. + * + * @returns 1 on failure, 0 on success. + */ +static int unittest_handle_GetTelemetrySubscriptions(void) { + rd_kafka_t *rk; + rd_kafka_broker_t *rkb; + rd_kafka_buf_t *rkbuf; + + RD_UT_SAY("Verifying GetTelemetrySubscriptions response handling"); + + rk = rd_kafka_new(RD_KAFKA_CONSUMER, NULL, NULL, 0); + rkb = rd_kafka_broker_add_logical(rk, "unittest"); + + rkbuf = rd_kafka_buf_new(0, 0); + rkbuf->rkbuf_rkb = rkb; + rd_kafka_buf_write_i32(rkbuf, 0); /* ThrottleTime */ + rd_kafka_buf_write_i16(rkbuf, 0); /* ErrorCode */ + + rd_kafka_buf_write_uuid(rkbuf, &rk->rk_telemetry.client_instance_id); + + rd_kafka_buf_write_i32(rkbuf, 0); /* SubscriptionId */ + + rd_kafka_buf_write_arraycnt(rkbuf, 2); /* #AcceptedCompressionTypes */ + /* AcceptedCompressionTypes[0] */ + rd_kafka_buf_write_i8(rkbuf, RD_KAFKA_COMPRESSION_GZIP); + /* AcceptedCompressionTypes[1] */ + rd_kafka_buf_write_i8(rkbuf, RD_KAFKA_COMPRESSION_KLZ4); + + rd_kafka_buf_write_i32(rkbuf, 0); /* PushIntervalMs */ + rd_kafka_buf_write_i32(rkbuf, 0); /* TelemetryMaxBytes */ + rd_kafka_buf_write_bool(rkbuf, 0); /* DeltaTemporality */ + + rd_kafka_buf_write_arraycnt(rkbuf, 2); /* #RequestedMetrics */ + /* RequestedMetrics[0] */ + rd_kafka_buf_write_str(rkbuf, "metric1", -1); + /* RequestedMetrics[1] */ + rd_kafka_buf_write_str(rkbuf, "metric2", -1); + + /* Set up a buffer reader for sending the buffer. 
*/ + rd_slice_init_full(&rkbuf->rkbuf_reader, &rkbuf->rkbuf_buf); + + /* Handle the response */ + rd_kafka_handle_GetTelemetrySubscriptions( + rk, rkb, RD_KAFKA_RESP_ERR_NO_ERROR, rkbuf, NULL, NULL); + + + RD_UT_ASSERT(rk->rk_telemetry.accepted_compression_types_cnt == 2, + "Expected 2 accepted compression types, got %" PRIusz, + rk->rk_telemetry.accepted_compression_types_cnt); + RD_UT_ASSERT(rk->rk_telemetry.accepted_compression_types[0] == + RD_KAFKA_COMPRESSION_GZIP, + "Expected 'gzip' compression type, got '%s'", + rd_kafka_compression2str( + rk->rk_telemetry.accepted_compression_types[0])); + RD_UT_ASSERT(rk->rk_telemetry.accepted_compression_types[1] == + RD_KAFKA_COMPRESSION_KLZ4, + "Expected 'lz4' compression type, got '%s'", + rd_kafka_compression2str( + rk->rk_telemetry.accepted_compression_types[1])); + + RD_UT_ASSERT(rk->rk_telemetry.requested_metrics_cnt == 2, + "Expected 2 requested metrics, got %" PRIusz, + rk->rk_telemetry.requested_metrics_cnt); + RD_UT_ASSERT( + rd_strcmp(rk->rk_telemetry.requested_metrics[0], "metric1") == 0, + "Expected 'metric1', got '%s'", + rk->rk_telemetry.requested_metrics[0]); + RD_UT_ASSERT( + rd_strcmp(rk->rk_telemetry.requested_metrics[1], "metric2") == 0, + "Expected 'metric2', got '%s'", + rk->rk_telemetry.requested_metrics[1]); + + rd_kafka_buf_destroy(rkbuf); + rd_kafka_destroy(rk); + return 0; +} + /** * @brief Request/response unit tests */ @@ -5162,6 +7044,7 @@ int unittest_request(void) { int fails = 0; fails += unittest_idempotent_producer(); + fails += unittest_handle_GetTelemetrySubscriptions(); return fails; } diff --git a/src/third_party/librdkafka/dist/src/rdkafka_request.h b/src/third_party/librdkafka/dist/src/rdkafka_request.h index 956a6d280c4..68932b309ee 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_request.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_request.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -60,19 +61,135 @@ int rd_kafka_err_action(rd_kafka_broker_t *rkb, const char *rd_kafka_actions2str(int actions); -rd_kafka_topic_partition_list_t * -rd_kafka_buf_read_topic_partitions(rd_kafka_buf_t *rkbuf, - size_t estimated_part_cnt, - rd_bool_t read_offset, - rd_bool_t read_part_errs); + +typedef enum { + /** Array end sentinel */ + RD_KAFKA_TOPIC_PARTITION_FIELD_END = 0, + /** Read/write int32_t for partition */ + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + /** Read/write int64_t for offset */ + RD_KAFKA_TOPIC_PARTITION_FIELD_OFFSET, + /** Read/write int32_t for offset leader_epoch */ + RD_KAFKA_TOPIC_PARTITION_FIELD_EPOCH, + /** Read/write int32_t for current leader_epoch */ + RD_KAFKA_TOPIC_PARTITION_FIELD_CURRENT_EPOCH, + /** Read/write int16_t for error code */ + RD_KAFKA_TOPIC_PARTITION_FIELD_ERR, + /** Read/write timestamp */ + RD_KAFKA_TOPIC_PARTITION_FIELD_TIMESTAMP, + /** Read/write str for metadata */ + RD_KAFKA_TOPIC_PARTITION_FIELD_METADATA, + /** Noop, useful for ternary ifs */ + RD_KAFKA_TOPIC_PARTITION_FIELD_NOOP, +} rd_kafka_topic_partition_field_t; + +/** + * @name Current Leader and NodeEndpoints for KIP-951 + * response triggered metadata updates. 
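The sentinel-terminated field list declared here replaces the per-field rd_bool_t flags that rd_kafka_buf_read/write_topic_partitions() used to take, so each caller now states its exact per-partition wire layout once. A representative internal-API sketch, mirroring the DeleteRecords serialization earlier in this patch:

/* Serialize each partition as Partition(int32), Offset(int64). */
static const rd_kafka_topic_partition_field_t fields[] = {
    RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION,
    RD_KAFKA_TOPIC_PARTITION_FIELD_OFFSET,
    RD_KAFKA_TOPIC_PARTITION_FIELD_END};

rd_kafka_buf_write_topic_partitions(
    rkbuf, partitions, rd_false /*don't skip invalid offsets*/,
    rd_false /*any offset*/, rd_false /*don't use topic id*/,
    rd_true /*use topic name*/, fields);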
+ * + * @{ + */ + +typedef struct rd_kafkap_CurrentLeader_s { + int32_t LeaderId; + int32_t LeaderEpoch; +} rd_kafkap_CurrentLeader_t; + +typedef struct rd_kafkap_NodeEndpoint_s { + int32_t NodeId; + rd_kafkap_str_t Host; + int32_t Port; + rd_kafkap_str_t Rack; +} rd_kafkap_NodeEndpoint_t; + +typedef struct rd_kafkap_NodeEndpoints_s { + int32_t NodeEndpointCnt; + rd_kafkap_NodeEndpoint_t *NodeEndpoints; +} rd_kafkap_NodeEndpoints_t; + +/**@}*/ + +/** + * @name Produce tags + * @{ + * + */ + +typedef struct rd_kafkap_Produce_reply_tags_Partition_s { + int32_t Partition; + rd_kafkap_CurrentLeader_t CurrentLeader; +} rd_kafkap_Produce_reply_tags_Partition_t; + +typedef struct rd_kafkap_Produce_reply_tags_Topic_s { + char *TopicName; + rd_kafkap_Produce_reply_tags_Partition_t Partition; +} rd_kafkap_Produce_reply_tags_Topic_t; + +typedef struct rd_kafkap_Produce_reply_tags_s { + int32_t leader_change_cnt; + rd_kafkap_NodeEndpoints_t NodeEndpoints; + rd_kafkap_Produce_reply_tags_Topic_t Topic; +} rd_kafkap_Produce_reply_tags_t; + +/**@}*/ + +/** + * @name Fetch tags + * @{ + * + */ + +typedef struct rd_kafkap_Fetch_reply_tags_Partition_s { + int32_t Partition; + rd_kafkap_CurrentLeader_t CurrentLeader; +} rd_kafkap_Fetch_reply_tags_Partition_t; + +typedef struct rd_kafkap_Fetch_reply_tags_Topic_s { + rd_kafka_Uuid_t TopicId; + int32_t PartitionCnt; + rd_kafkap_Fetch_reply_tags_Partition_t *Partitions; + int32_t partitions_with_leader_change_cnt; +} rd_kafkap_Fetch_reply_tags_Topic_t; + +typedef struct rd_kafkap_Fetch_reply_tags_s { + rd_kafkap_NodeEndpoints_t NodeEndpoints; + int32_t TopicCnt; + rd_kafkap_Fetch_reply_tags_Topic_t *Topics; + int32_t topics_with_leader_change_cnt; +} rd_kafkap_Fetch_reply_tags_t; + +/**@}*/ + +rd_kafka_topic_partition_list_t *rd_kafka_buf_read_topic_partitions( + rd_kafka_buf_t *rkbuf, + rd_bool_t use_topic_id, + rd_bool_t use_topic_name, + size_t estimated_part_cnt, + const rd_kafka_topic_partition_field_t *fields); + +rd_kafka_topic_partition_list_t *rd_kafka_buf_read_topic_partitions_nullable( + rd_kafka_buf_t *rkbuf, + rd_bool_t use_topic_id, + rd_bool_t use_topic_name, + size_t estimated_part_cnt, + const rd_kafka_topic_partition_field_t *fields, + rd_bool_t *parse_err); + int rd_kafka_buf_write_topic_partitions( rd_kafka_buf_t *rkbuf, const rd_kafka_topic_partition_list_t *parts, rd_bool_t skip_invalid_offsets, rd_bool_t only_invalid_offsets, - rd_bool_t write_Offset, - rd_bool_t write_Epoch, - rd_bool_t write_Metadata); + rd_bool_t use_topic_id, + rd_bool_t use_topic_name, + const rd_kafka_topic_partition_field_t *fields); + +int rd_kafka_buf_read_CurrentLeader(rd_kafka_buf_t *rkbuf, + rd_kafkap_CurrentLeader_t *CurrentLeader); + +int rd_kafka_buf_read_NodeEndpoints(rd_kafka_buf_t *rkbuf, + rd_kafkap_NodeEndpoints_t *NodeEndpoints); + rd_kafka_resp_err_t rd_kafka_FindCoordinatorRequest(rd_kafka_broker_t *rkb, @@ -82,6 +199,7 @@ rd_kafka_FindCoordinatorRequest(rd_kafka_broker_t *rkb, rd_kafka_resp_cb_t *resp_cb, void *opaque); + rd_kafka_resp_err_t rd_kafka_handle_ListOffsets(rd_kafka_t *rk, rd_kafka_broker_t *rkb, @@ -95,8 +213,39 @@ void rd_kafka_ListOffsetsRequest(rd_kafka_broker_t *rkb, rd_kafka_topic_partition_list_t *offsets, rd_kafka_replyq_t replyq, rd_kafka_resp_cb_t *resp_cb, + int timeout_ms, void *opaque); +rd_kafka_resp_err_t +rd_kafka_ListOffsetsRequest_admin(rd_kafka_broker_t *rkb, + const rd_list_t *offsets, + rd_kafka_AdminOptions_t *options, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + 
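rd_kafka_ListOffsetsRequest_admin() backs the public rd_kafka_ListOffsets() Admin API. A minimal query sketch follows; the topic name and the result handling noted in the comments are assumptions:

rd_kafka_topic_partition_list_t *parts =
    rd_kafka_topic_partition_list_new(1);
rd_kafka_queue_t *q = rd_kafka_queue_new(rk);

/* Ask for the current end offset of my-topic [0]; EARLIEST and
 * MAX_TIMESTAMP offset specs are requested the same way. */
rd_kafka_topic_partition_list_add(parts, "my-topic", 0)->offset =
    RD_KAFKA_OFFSET_SPEC_LATEST;

rd_kafka_ListOffsets(rk, parts, NULL /*default options*/, q);
/* Await RD_KAFKA_EVENT_LISTOFFSETS_RESULT on q and read the
 * per-partition offsets via rd_kafka_ListOffsets_result_infos(). */

rd_kafka_topic_partition_list_destroy(parts);
rd_kafka_queue_destroy(q);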
void *opaque); + +rd_kafka_resp_err_t +rd_kafka_parse_ListOffsets(rd_kafka_buf_t *rkbuf, + rd_kafka_topic_partition_list_t *offsets, + rd_list_t *result_infos); + +rd_kafka_resp_err_t +rd_kafka_handle_OffsetForLeaderEpoch(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + rd_kafka_resp_err_t err, + rd_kafka_buf_t *rkbuf, + rd_kafka_buf_t *request, + rd_kafka_topic_partition_list_t **offsets); +void rd_kafka_OffsetForLeaderEpochRequest( + rd_kafka_broker_t *rkb, + rd_kafka_topic_partition_list_t *parts, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque); + + rd_kafka_resp_err_t rd_kafka_handle_OffsetFetch(rd_kafka_t *rk, rd_kafka_broker_t *rkb, @@ -118,10 +267,18 @@ void rd_kafka_op_handle_OffsetFetch(rd_kafka_t *rk, void rd_kafka_OffsetFetchRequest(rd_kafka_broker_t *rkb, const char *group_id, rd_kafka_topic_partition_list_t *parts, + rd_bool_t use_topic_id, + int32_t generation_id_or_member_epoch, + rd_kafkap_str_t *member_id, rd_bool_t require_stable_offsets, int timeout, rd_kafka_replyq_t replyq, - rd_kafka_resp_cb_t *resp_cb, + void (*resp_cb)(rd_kafka_t *, + rd_kafka_broker_t *, + rd_kafka_resp_err_t, + rd_kafka_buf_t *, + rd_kafka_buf_t *, + void *), void *opaque); rd_kafka_resp_err_t @@ -198,17 +355,21 @@ rd_kafka_error_t *rd_kafka_ListGroupsRequest(rd_kafka_broker_t *rkb, int16_t max_ApiVersion, const char **states, size_t states_cnt, + const char **types, + size_t types_cnt, rd_kafka_replyq_t replyq, rd_kafka_resp_cb_t *resp_cb, void *opaque); -rd_kafka_error_t *rd_kafka_DescribeGroupsRequest(rd_kafka_broker_t *rkb, - int16_t max_ApiVersion, - char **groups, - size_t group_cnt, - rd_kafka_replyq_t replyq, - rd_kafka_resp_cb_t *resp_cb, - void *opaque); +rd_kafka_error_t * +rd_kafka_DescribeGroupsRequest(rd_kafka_broker_t *rkb, + int16_t max_ApiVersion, + char **groups, + size_t group_cnt, + rd_bool_t include_authorized_operations, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque); void rd_kafka_HeartbeatRequest(rd_kafka_broker_t *rkb, @@ -220,13 +381,48 @@ void rd_kafka_HeartbeatRequest(rd_kafka_broker_t *rkb, rd_kafka_resp_cb_t *resp_cb, void *opaque); +void rd_kafka_ConsumerGroupHeartbeatRequest( + rd_kafka_broker_t *rkb, + const rd_kafkap_str_t *group_id, + const rd_kafkap_str_t *member_id, + int32_t member_epoch, + const rd_kafkap_str_t *group_instance_id, + const rd_kafkap_str_t *rack_id, + int32_t rebalance_timeout_ms, + const rd_kafka_topic_partition_list_t *subscribed_topics, + rd_kafkap_str_t *subscribed_topic_regex, + const rd_kafkap_str_t *remote_assignor, + const rd_kafka_topic_partition_list_t *current_assignments, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque); + rd_kafka_resp_err_t rd_kafka_MetadataRequest(rd_kafka_broker_t *rkb, const rd_list_t *topics, + rd_list_t *topic_ids, const char *reason, rd_bool_t allow_auto_create_topics, rd_bool_t cgrp_update, + int32_t cgrp_subscription_version, + rd_bool_t force_racks, rd_kafka_op_t *rko); +rd_kafka_resp_err_t rd_kafka_MetadataRequest_resp_cb( + rd_kafka_broker_t *rkb, + const rd_list_t *topics, + const rd_list_t *topic_ids, + const char *reason, + rd_bool_t allow_auto_create_topics, + rd_bool_t include_cluster_authorized_operations, + rd_bool_t include_topic_authorized_operations, + rd_bool_t cgrp_update, + int32_t cgrp_subscription_version, + rd_bool_t force_racks, + rd_kafka_resp_cb_t *resp_cb, + rd_kafka_replyq_t replyq, + rd_bool_t force, + void *opaque); + rd_kafka_resp_err_t rd_kafka_handle_ApiVersion(rd_kafka_t *rk, rd_kafka_broker_t *rkb, @@ 
-253,6 +449,7 @@ void rd_kafka_handle_SaslAuthenticate(rd_kafka_t *rk, rd_kafka_buf_t *rkbuf, rd_kafka_buf_t *request, void *opaque); + void rd_kafka_SaslAuthenticateRequest(rd_kafka_broker_t *rkb, const void *buf, size_t size, @@ -305,6 +502,16 @@ rd_kafka_AlterConfigsRequest(rd_kafka_broker_t *rkb, rd_kafka_resp_cb_t *resp_cb, void *opaque); +rd_kafka_resp_err_t rd_kafka_IncrementalAlterConfigsRequest( + rd_kafka_broker_t *rkb, + const rd_list_t *configs /*(ConfigResource_t*)*/, + rd_kafka_AdminOptions_t *options, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque); + rd_kafka_resp_err_t rd_kafka_DescribeConfigsRequest( rd_kafka_broker_t *rkb, const rd_list_t *configs /*(ConfigResource_t*)*/, @@ -384,7 +591,6 @@ rd_kafka_resp_err_t rd_kafka_EndTxnRequest(rd_kafka_broker_t *rkb, int unittest_request(void); - rd_kafka_resp_err_t rd_kafka_DeleteRecordsRequest(rd_kafka_broker_t *rkb, /*(rd_topic_partition_list_t*)*/ @@ -426,5 +632,98 @@ rd_kafka_DeleteAclsRequest(rd_kafka_broker_t *rkb, rd_kafka_resp_cb_t *resp_cb, void *opaque); +rd_kafka_resp_err_t rd_kafka_ElectLeadersRequest( + rd_kafka_broker_t *rkb, + const rd_list_t *elect_leaders /*(rd_kafka_ElectLeaders_t*)*/, + rd_kafka_AdminOptions_t *options, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque); + +rd_kafka_error_t * +rd_kafka_ConsumerGroupDescribeRequest(rd_kafka_broker_t *rkb, + char **groups, + size_t group_cnt, + rd_bool_t include_authorized_operations, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque); + +void rd_kafkap_leader_discovery_tmpabuf_add_alloc_brokers( + rd_tmpabuf_t *tbuf, + rd_kafkap_NodeEndpoints_t *NodeEndpoints); + +void rd_kafkap_leader_discovery_tmpabuf_add_alloc_topics(rd_tmpabuf_t *tbuf, + int topic_cnt); + +void rd_kafkap_leader_discovery_tmpabuf_add_alloc_topic(rd_tmpabuf_t *tbuf, + char *topic_name, + int32_t partition_cnt); + +void rd_kafkap_leader_discovery_metadata_init(rd_kafka_metadata_internal_t *mdi, + int32_t broker_id); + +void rd_kafkap_leader_discovery_set_brokers( + rd_tmpabuf_t *tbuf, + rd_kafka_metadata_internal_t *mdi, + rd_kafkap_NodeEndpoints_t *NodeEndpoints); + +void rd_kafkap_leader_discovery_set_topic_cnt(rd_tmpabuf_t *tbuf, + rd_kafka_metadata_internal_t *mdi, + int topic_cnt); + +void rd_kafkap_leader_discovery_set_topic(rd_tmpabuf_t *tbuf, + rd_kafka_metadata_internal_t *mdi, + int topic_idx, + rd_kafka_Uuid_t topic_id, + char *topic_name, + int partition_cnt); + +void rd_kafkap_leader_discovery_set_CurrentLeader( + rd_tmpabuf_t *tbuf, + rd_kafka_metadata_internal_t *mdi, + int topic_idx, + int partition_idx, + int32_t partition_id, + rd_kafkap_CurrentLeader_t *CurrentLeader); + +rd_kafka_resp_err_t +rd_kafka_GetTelemetrySubscriptionsRequest(rd_kafka_broker_t *rkb, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque); + +rd_kafka_resp_err_t +rd_kafka_PushTelemetryRequest(rd_kafka_broker_t *rkb, + rd_kafka_Uuid_t *client_instance_id, + int32_t subscription_id, + rd_bool_t terminating, + rd_kafka_compression_t compression_type, + const void *metrics, + size_t metrics_size, + char *errstr, + size_t errstr_size, + rd_kafka_replyq_t replyq, + rd_kafka_resp_cb_t *resp_cb, + void *opaque); + +void rd_kafka_handle_GetTelemetrySubscriptions(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + rd_kafka_resp_err_t err, + rd_kafka_buf_t *rkbuf, + rd_kafka_buf_t *request, + void *opaque); + +void
rd_kafka_handle_PushTelemetry(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + rd_kafka_resp_err_t err, + rd_kafka_buf_t *rkbuf, + rd_kafka_buf_t *request, + void *opaque); + #endif /* _RDKAFKA_REQUEST_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_roundrobin_assignor.c b/src/third_party/librdkafka/dist/src/rdkafka_roundrobin_assignor.c index 6cb91936452..28d437f4f79 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_roundrobin_assignor.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_roundrobin_assignor.c @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2015 Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_sasl.c b/src/third_party/librdkafka/dist/src/rdkafka_sasl.c index cab67f241f7..32ebe3b198e 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_sasl.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_sasl.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2015 Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -206,6 +207,11 @@ int rd_kafka_sasl_io_event(rd_kafka_transport_t *rktrans, * @remark May be called on non-SASL transports (no-op) */ void rd_kafka_sasl_close(rd_kafka_transport_t *rktrans) { + /* The broker might not be up, and the transport might not exist in that + * case.*/ + if (!rktrans) + return; + const struct rd_kafka_sasl_provider *provider = rktrans->rktrans_rkb->rkb_rk->rk_conf.sasl.provider; diff --git a/src/third_party/librdkafka/dist/src/rdkafka_sasl.h b/src/third_party/librdkafka/dist/src/rdkafka_sasl.h index d0dd01b8b21..0ac12c5d210 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_sasl.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_sasl.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2015 Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_sasl_cyrus.c b/src/third_party/librdkafka/dist/src/rdkafka_sasl_cyrus.c index 41452a33647..89ff15c4272 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_sasl_cyrus.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_sasl_cyrus.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2015 Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -489,6 +490,7 @@ static void rd_kafka_sasl_cyrus_close(struct rd_kafka_transport_s *rktrans) { mtx_unlock(&rktrans->rktrans_rkb->rkb_rk->rk_conf.sasl.lock); } rd_free(state); + rktrans->rktrans_sasl.state = NULL; } diff --git a/src/third_party/librdkafka/dist/src/rdkafka_sasl_int.h b/src/third_party/librdkafka/dist/src/rdkafka_sasl_int.h index 33e3bdd05f6..8a49a6a2964 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_sasl_int.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_sasl_int.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2015 Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer.c b/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer.c index 39b165a7dc7..69dd124ec83 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -123,9 +124,10 @@ struct rd_kafka_sasl_oauthbearer_token { * @brief Per-connection state */ struct rd_kafka_sasl_oauthbearer_state { - enum { RD_KAFKA_SASL_OAUTHB_STATE_SEND_CLIENT_FIRST_MESSAGE, - RD_KAFKA_SASL_OAUTHB_STATE_RECV_SERVER_FIRST_MSG, - RD_KAFKA_SASL_OAUTHB_STATE_RECV_SERVER_MSG_AFTER_FAIL, + enum { + RD_KAFKA_SASL_OAUTHB_STATE_SEND_CLIENT_FIRST_MESSAGE, + RD_KAFKA_SASL_OAUTHB_STATE_RECV_SERVER_FIRST_MSG, + RD_KAFKA_SASL_OAUTHB_STATE_RECV_SERVER_MSG_AFTER_FAIL, } state; char *server_error_msg; @@ -1041,6 +1043,7 @@ static void rd_kafka_sasl_oauthbearer_close(rd_kafka_transport_t *rktrans) { rd_free(state->md_principal_name); rd_list_destroy(&state->extensions); rd_free(state); + rktrans->rktrans_sasl.state = NULL; } @@ -1089,8 +1092,8 @@ static void rd_kafka_sasl_oauthbearer_build_client_first_message( buf = out->ptr; size_written = 0; r = rd_snprintf(buf, out->size + 1 - size_written, - "%s%sauth=Bearer %s%s", gs2_header, kvsep, - state->token_value, kvsep); + "%s%sauth=Bearer %s%s", gs2_header, kvsep, + state->token_value, kvsep); rd_assert(r < out->size + 1 - size_written); size_written += r; buf = out->ptr + size_written; @@ -1253,8 +1256,13 @@ static int rd_kafka_sasl_oauthbearer_client_new(rd_kafka_transport_t *rktrans, return -1; } - state->token_value = rd_strdup(handle->token_value); - state->md_principal_name = rd_strdup(handle->md_principal_name); + state->token_value = rd_strdup(handle->token_value); + if (handle->md_principal_name) + state->md_principal_name = rd_strdup(handle->md_principal_name); + else + state->md_principal_name = NULL; + + rd_list_init_copy(&state->extensions, &handle->extensions); rd_list_copy_to(&state->extensions, &handle->extensions, rd_strtup_list_copy, NULL); @@ -1300,6 +1308,16 @@ static int rd_kafka_sasl_oauthbearer_init(rd_kafka_t *rk, rd_list_init(&handle->extensions, 0, (void (*)(void *))rd_strtup_destroy); + + if (rk->rk_conf.sasl.enable_callback_queue) { + /* SASL specific callback queue enabled */ + rk->rk_sasl.callback_q = rd_kafka_q_new(rk); + handle->callback_q = rd_kafka_q_keep(rk->rk_sasl.callback_q); + } else { + /* Use main queue */ + handle->callback_q = rd_kafka_q_keep(rk->rk_rep); + } + rd_kafka_timer_start( &rk->rk_timers, &handle->token_refresh_tmr, 1 * 1000 * 1000, rd_kafka_sasl_oauthbearer_token_refresh_tmr_cb, rk); @@ -1316,25 +1334,18 @@ static int rd_kafka_sasl_oauthbearer_init(rd_kafka_t *rk, return 0; } - if (rk->rk_conf.sasl.enable_callback_queue) { - /* SASL specific callback queue enabled */ - rk->rk_sasl.callback_q = rd_kafka_q_new(rk); - handle->callback_q = rd_kafka_q_keep(rk->rk_sasl.callback_q); - } else { - /* Use main queue */ - handle->callback_q = rd_kafka_q_keep(rk->rk_rep); - } #if WITH_OAUTHBEARER_OIDC if (rk->rk_conf.sasl.oauthbearer.method == RD_KAFKA_SASL_OAUTHBEARER_METHOD_OIDC && - rk->rk_conf.sasl.oauthbearer.token_refresh_cb == - 
rd_kafka_oidc_token_refresh_cb) { + (rk->rk_conf.sasl.oauthbearer.token_refresh_cb == + rd_kafka_oidc_token_jwt_bearer_refresh_cb || + rk->rk_conf.sasl.oauthbearer.token_refresh_cb == + rd_kafka_oidc_token_client_credentials_refresh_cb)) { handle->internal_refresh = rd_true; rd_kafka_sasl_background_callbacks_enable(rk); } #endif - /* Otherwise enqueue a refresh callback for the application. */ rd_kafka_oauthbearer_enqueue_token_refresh(handle); diff --git a/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer.h b/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer.h index 75ab51d02fb..cdcea0608cd 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer_oidc.c b/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer_oidc.c index 6c2773b027b..b8580b113c4 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer_oidc.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer_oidc.c @@ -1,7 +1,9 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2021 Magnus Edenhill + * Copyright (c) 2021-2022, Magnus Edenhill + * 2023, Confluent Inc. + * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,25 +39,7 @@ #include #include "rdhttp.h" #include "rdkafka_sasl_oauthbearer_oidc.h" - - -/** - * @brief Base64 encode binary input \p in, and write base64-encoded string - * and it's size to \p out - */ -static void rd_base64_encode(const rd_chariov_t *in, rd_chariov_t *out) { - size_t max_len; - - max_len = (((in->size + 2) / 3) * 4) + 1; - out->ptr = rd_malloc(max_len); - rd_assert(out->ptr); - - out->size = EVP_EncodeBlock((uint8_t *)out->ptr, (uint8_t *)in->ptr, - (int)in->size); - - rd_assert(out->size <= max_len); - out->ptr[out->size] = 0; -} +#include "rdbase64.h" /** @@ -67,8 +51,9 @@ static void rd_base64_encode(const rd_chariov_t *in, rd_chariov_t *out) { * * @locality Any thread. */ -static char *rd_kafka_oidc_build_auth_header(const char *client_id, - const char *client_secret) { +static char * +rd_kafka_oidc_client_credentials_build_auth_header(const char *client_id, + const char *client_secret) { rd_chariov_t client_authorization_in; rd_chariov_t client_authorization_out; @@ -84,6 +69,7 @@ static char *rd_kafka_oidc_build_auth_header(const char *client_id, client_authorization_in.size--; rd_base64_encode(&client_authorization_in, &client_authorization_out); + rd_assert(client_authorization_out.ptr); authorization_base64_header_size = strlen("Authorization: Basic ") + client_authorization_out.size + 1; @@ -105,13 +91,15 @@ static char *rd_kafka_oidc_build_auth_header(const char *client_id, * * @locality Any thread. 
*/ -static void rd_kafka_oidc_build_headers(const char *client_id, - const char *client_secret, - struct curl_slist **headersp) { +static void +rd_kafka_oidc_client_credentials_build_headers(const char *client_id, + const char *client_secret, + struct curl_slist **headersp) { char *authorization_base64_header; authorization_base64_header = - rd_kafka_oidc_build_auth_header(client_id, client_secret); + rd_kafka_oidc_client_credentials_build_auth_header(client_id, + client_secret); *headersp = curl_slist_append(*headersp, "Accept: application/json"); *headersp = curl_slist_append(*headersp, authorization_base64_header); @@ -213,9 +201,10 @@ done: * The post_fields_size will be returned in \p post_fields_size. * */ -static void rd_kafka_oidc_build_post_fields(const char *scope, - char **post_fields, - size_t *post_fields_size) { +static void +rd_kafka_oidc_client_credentials_build_post_fields(const char *scope, + char **post_fields, + size_t *post_fields_size) { size_t scope_size = 0; if (scope) @@ -232,6 +221,684 @@ static void rd_kafka_oidc_build_post_fields(const char *scope, } } +/** + * @brief Get JWT algorithm label string for the specified signing algorithm. + * + * @param token_signing_algo The algorithm enum value + * + * @returns String representation of the algorithm. + * + * @locality Any thread. + */ +static char *rd_kafka_oidc_assertion_get_algo_label( + const rd_kafka_oauthbearer_assertion_algorithm_t token_signing_algo) { + switch (token_signing_algo) { + case RD_KAFKA_SASL_OAUTHBEARER_ASSERTION_ALGORITHM_RS256: + return "RS256"; + case RD_KAFKA_SASL_OAUTHBEARER_ASSERTION_ALGORITHM_ES256: + return "ES256"; + default: + rd_assert(!*"Unknown JOSE algorithm"); + return NULL; + } +} + +/** + * @brief Parse a JWT template file and extract header and payload JSON + * objects. + * + * Reads and parses the JWT template file, which should contain a JSON object + * with "header" and "payload" properties. + * + * @param rk + * @param jwt_template_file_path Path to the template file + * @param header Pointer to store the parsed header JSON object + * @param payload Pointer to store the parsed payload JSON object + * + * @returns 0 on success, -1 on failure + * + * @locality Any thread. 
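From the application's side, the client_credentials helpers above are driven purely by configuration. A sketch with placeholder endpoint and credentials (the values are assumptions, and per-call errstr checking is collapsed for brevity):

rd_kafka_conf_t *conf = rd_kafka_conf_new();
char errstr[256];

rd_kafka_conf_set(conf, "security.protocol", "SASL_SSL", errstr,
                  sizeof(errstr));
rd_kafka_conf_set(conf, "sasl.mechanism", "OAUTHBEARER", errstr,
                  sizeof(errstr));
rd_kafka_conf_set(conf, "sasl.oauthbearer.method", "oidc", errstr,
                  sizeof(errstr));
rd_kafka_conf_set(conf, "sasl.oauthbearer.token.endpoint.url",
                  "https://auth.example.com/oauth2/token", errstr,
                  sizeof(errstr));
rd_kafka_conf_set(conf, "sasl.oauthbearer.client.id", "my-client", errstr,
                  sizeof(errstr));
rd_kafka_conf_set(conf, "sasl.oauthbearer.client.secret", "my-secret",
                  errstr, sizeof(errstr));
rd_kafka_conf_set(conf, "sasl.oauthbearer.scope", "kafka", errstr,
                  sizeof(errstr));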
+ */ +static int +rd_kafka_oidc_assertion_parse_template_file(rd_kafka_t *rk, + const char *jwt_template_file_path, + cJSON **header, + cJSON **payload) { + char *template_content = NULL; + cJSON *template_json = NULL; + int ret = -1; + size_t file_size; + + *header = NULL; + *payload = NULL; + + template_content = + rd_file_read(jwt_template_file_path, &file_size, 1024 * 1024); + if (!template_content) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "Failed to open JWT template file: %s", + jwt_template_file_path); + return -1; + } + + if (file_size == 0) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "JWT template file is empty or invalid"); + rd_free(template_content); + return -1; + } + + template_json = cJSON_Parse((char *)template_content); + if (!template_json) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "Failed to parse JWT template JSON"); + goto cleanup; + } + + cJSON *header_item = cJSON_GetObjectItem(template_json, "header"); + cJSON *payload_item = cJSON_GetObjectItem(template_json, "payload"); + + if (!header_item || !payload_item) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "JWT template must contain both 'header' " + "and 'payload' objects"); + goto cleanup; + } + + *header = cJSON_Duplicate(header_item, 1); + *payload = cJSON_Duplicate(payload_item, 1); + + if (!*header || !*payload) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "Failed to duplicate header or payload objects"); + if (*header) { + cJSON_Delete(*header); + *header = NULL; + } + goto cleanup; + } + + ret = 0; + +cleanup: + if (template_content) + rd_free(template_content); + if (template_json) + cJSON_Delete(template_json); + + return ret; +} + +/** + * @brief Create JWT assertion. + * + * Creates a JWT token signed with the specified private key using the + * algorithm specified. The token can be created from a template file or + * will create a minimal default token if no template is provided. + * + * @param rk The rd_kafka_t instance for logging + * @param private_key_pem PEM formatted private key string (mutually exclusive + * with key_file_location) + * @param key_file_location Path to private key file (mutually exclusive with + * private_key_pem) + * @param passphrase Optional passphrase for encrypted private key + * @param token_signing_algo Algorithm to use for signing (RS256 or ES256) + * @param jwt_template_file Optional path to JWT template file + * @param subject Optional subject claim value. + * @param issuer Optional issuer claim value. + * @param audience Optional audience claim value. + * @param nbf `nbf` claim value to express seconds of validity in the past. + * @param exp `exp` claim value to express seconds of validity in the future. + * @param jti_include Whether to include a JTI claim (UUID) + * + * @returns Newly allocated JWT string, caller must free with rd_free(). NULL on + * error. + * + * @locality Any thread. 
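A template file accepted by this parser needs exactly two top-level objects, "header" and "payload"; their contents are carried into the assertion verbatim, except for alg, typ, iat, nbf and exp, which the assertion builder below always overwrites. An assumed example:

{
    "header":  { "kid": "example-key-id" },
    "payload": { "iss": "https://issuer.example.com",
                 "aud": "https://broker.example.com" }
}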
+ */ +static char *rd_kafka_oidc_assertion_create( + rd_kafka_t *rk, + const char *private_key_pem, + const char *key_file_location, + const char *passphrase, + const rd_kafka_oauthbearer_assertion_algorithm_t token_signing_algo, + const char *jwt_template_file, + const char *subject, + const char *issuer, + const char *audience, + const int nbf, + const int exp, + const rd_bool_t jti_include) { + + char *encoded_header = NULL; + char *encoded_payload = NULL; + char *encoded_signature = NULL; + char *unsigned_token = NULL; + char *result = NULL; + char *header_str = NULL; + char *payload_str = NULL; + EVP_PKEY *pkey = NULL; + BIO *bio = NULL; + cJSON *header_json_obj = NULL; + cJSON *payload_json_obj = NULL; + EVP_MD_CTX *mdctx = NULL; + unsigned char *sig = NULL; + rd_chariov_t header_iov; + rd_chariov_t payload_iov; + rd_chariov_t sig_iov; + rd_kafka_Uuid_t jti_uuid; + char *jti_uuid_str = NULL; + + rd_ts_t issued_at = rd_uclock() / 1000000; + rd_ts_t not_before = issued_at - nbf; + rd_ts_t expiration_time = issued_at + exp; + + if (jwt_template_file) { + if (rd_kafka_oidc_assertion_parse_template_file( + rk, jwt_template_file, &header_json_obj, + &payload_json_obj) != 0) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "Failed to process JWT template file %s", + jwt_template_file); + return NULL; + } + } else { + header_json_obj = cJSON_CreateObject(); + payload_json_obj = cJSON_CreateObject(); + } + + /* Add required header fields */ + cJSON_DeleteItemFromObjectCaseSensitive(header_json_obj, "alg"); + cJSON_DeleteItemFromObjectCaseSensitive(header_json_obj, "typ"); + cJSON_DeleteItemFromObjectCaseSensitive(payload_json_obj, "iat"); + cJSON_DeleteItemFromObjectCaseSensitive(payload_json_obj, "exp"); + cJSON_DeleteItemFromObjectCaseSensitive(payload_json_obj, "nbf"); + cJSON_AddStringToObject( + header_json_obj, "alg", + rd_kafka_oidc_assertion_get_algo_label(token_signing_algo)); + cJSON_AddStringToObject(header_json_obj, "typ", "JWT"); + + /* Add required payload fields */ + cJSON_AddNumberToObject(payload_json_obj, "iat", (double)issued_at); + cJSON_AddNumberToObject(payload_json_obj, "exp", + (double)expiration_time); + cJSON_AddNumberToObject(payload_json_obj, "nbf", (double)not_before); + + if (subject) { + cJSON_DeleteItemFromObjectCaseSensitive(payload_json_obj, + "sub"); + cJSON_AddStringToObject(payload_json_obj, "sub", subject); + } + + if (issuer) { + cJSON_DeleteItemFromObjectCaseSensitive(payload_json_obj, + "iss"); + cJSON_AddStringToObject(payload_json_obj, "iss", issuer); + } + + if (audience) { + cJSON_DeleteItemFromObjectCaseSensitive(payload_json_obj, + "aud"); + cJSON_AddStringToObject(payload_json_obj, "aud", audience); + } + + if (jti_include) { + jti_uuid = rd_kafka_Uuid_random(); + jti_uuid_str = rd_kafka_Uuid_str(&jti_uuid); + cJSON_DeleteItemFromObjectCaseSensitive(payload_json_obj, + "jti"); + cJSON_AddStringToObject(payload_json_obj, "jti", jti_uuid_str); + rd_free(jti_uuid_str); + } + + header_str = cJSON_PrintUnformatted(header_json_obj); + payload_str = cJSON_PrintUnformatted(payload_json_obj); + + if (!header_str || !payload_str) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "Failed to convert template objects to JSON"); + goto cleanup; + } + + header_iov.ptr = header_str; + header_iov.size = strlen(header_str); + encoded_header = rd_base64_encode_str_urlsafe(&header_iov); + + payload_iov.ptr = payload_str; + payload_iov.size = strlen(payload_str); + encoded_payload = rd_base64_encode_str_urlsafe(&payload_iov); + if (!encoded_header || !encoded_payload) + goto cleanup; + 
+ size_t unsigned_token_len = + strlen(encoded_header) + strlen(encoded_payload) + 2; + unsigned_token = rd_malloc(unsigned_token_len); + + if (!unsigned_token) + goto cleanup; + rd_snprintf(unsigned_token, unsigned_token_len, "%s.%s", encoded_header, + encoded_payload); + + if (private_key_pem) { + bio = BIO_new_mem_buf((void *)private_key_pem, -1); + } else if (key_file_location) { + bio = BIO_new_file(key_file_location, "r"); + } + + if (!bio) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "Failed to create BIO for private key"); + goto cleanup; + } + + if (passphrase) { + pkey = PEM_read_bio_PrivateKey(bio, NULL, NULL, + (void *)passphrase); + } else { + pkey = PEM_read_bio_PrivateKey(bio, NULL, NULL, NULL); + } + BIO_free(bio); + bio = NULL; + + if (!pkey) { + rd_kafka_log(rk, LOG_ERR, "JWT", "Failed to load private key"); + goto cleanup; + } + + mdctx = EVP_MD_CTX_new(); + if (!mdctx) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "Failed to create message digest context"); + goto cleanup; + } + + const EVP_MD *md = EVP_sha256(); /* Both RS256 and ES256 use SHA-256 */ + + if (EVP_DigestSignInit(mdctx, NULL, md, NULL, pkey) != 1) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "Failed to initialize signing context"); + goto cleanup; + } + + if (EVP_DigestSignUpdate(mdctx, unsigned_token, + strlen(unsigned_token)) != 1) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "Failed to update digest with token data"); + goto cleanup; + } + + size_t siglen = 0; + if (EVP_DigestSignFinal(mdctx, NULL, &siglen) != 1) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "Failed to get signature length"); + goto cleanup; + } + + sig = rd_malloc(siglen); + if (!sig) { + rd_kafka_log(rk, LOG_ERR, "JWT", + "Failed to allocate memory for signature"); + goto cleanup; + } + + if (EVP_DigestSignFinal(mdctx, sig, &siglen) != 1) { + rd_kafka_log(rk, LOG_ERR, "JWT", "Failed to create signature"); + goto cleanup; + } + + sig_iov.ptr = (char *)sig; + sig_iov.size = siglen; + encoded_signature = rd_base64_encode_str_urlsafe(&sig_iov); + + if (!encoded_signature) + goto cleanup; + + size_t jwt_len = strlen(encoded_header) + strlen(encoded_payload) + + strlen(encoded_signature) + 3; + result = rd_malloc(jwt_len); + if (!result) + goto cleanup; + rd_snprintf(result, jwt_len, "%s.%s.%s", encoded_header, + encoded_payload, encoded_signature); + +cleanup: + if (encoded_header) + rd_free(encoded_header); + if (encoded_payload) + rd_free(encoded_payload); + if (encoded_signature) + rd_free(encoded_signature); + if (unsigned_token) + rd_free(unsigned_token); + if (sig) + rd_free(sig); + + if (header_json_obj) { + if (header_str) + free(header_str); /* cJSON_PrintUnformatted uses malloc + */ + cJSON_Delete(header_json_obj); + } else if (header_str) { + rd_free(header_str); /* rd_malloc was used */ + } + + if (payload_json_obj) { + if (payload_str) + free(payload_str); /* cJSON_PrintUnformatted uses malloc + */ + cJSON_Delete(payload_json_obj); + } else if (payload_str) { + rd_free(payload_str); /* rd_malloc was used */ + } + + if (pkey) + EVP_PKEY_free(pkey); + if (mdctx) + EVP_MD_CTX_free(mdctx); + + return result; +} + + +/** + * @brief Build request body for JWT bearer token request. + * + * Creates a URL-encoded request body for token exchange with the JWT assertion + * and optional scope. + * + * @param assertion The JWT assertion to include in the request. + * @param scope Optional scope to include in the request (will be URL encoded). + * + * @returns Newly allocated string with the URL-encoded request body. + * Caller must free with rd_free(). 
NULL on memory allocation failure. + * + * @locality Any thread. + */ +static char *rd_kafka_oidc_jwt_bearer_build_request_body(const char *assertion, + const char *scope) { + const char *assertion_prefix = + "grant_type=urn:ietf:params:oauth:" + "grant-type:jwt-bearer" + "&assertion="; + int assertion_prefix_len = strlen(assertion_prefix) + strlen(assertion); + int body_size = assertion_prefix_len + 1; + char *scope_escaped = NULL; + if (scope) { + scope_escaped = curl_easy_escape(NULL, scope, 0); + body_size += strlen("&scope=") + strlen(scope_escaped); + } + + char *body = rd_malloc(body_size); + + rd_snprintf(body, body_size, "%s%s", assertion_prefix, assertion); + if (scope) { + rd_snprintf(&body[assertion_prefix_len], + body_size - assertion_prefix_len, "&scope=%s", + scope_escaped); + rd_free(scope_escaped); + } + return body; +} + +/** + * @brief Read a JWT assertion from a file. + * + * @param file_path Path to the file containing the JWT assertion. + * + * @returns Newly allocated string with the JWT assertion. + * Caller must free with rd_free(). NULL on error. + */ +static char *rd_kafka_oidc_assertion_read_from_file(const char *file_path) { + if (!file_path) + return NULL; + const size_t max_size = 1024 * 1024; /* 1MB limit */ + return rd_file_read(file_path, NULL, max_size); +} + +/** + * @brief Try to validate a token field from the JSON response. + * Extracts and validates the token, then decodes its payload to get + * subject and expiration. + * + * @param json The JSON response from the token endpoint + * @param field The name of the field to extract (e.g., "access_token" or + * "id_token") + * @param sub Pointer to store the subject from the token + * @param exp Pointer to store the expiration from the token + * @param errstr Buffer to store error message + * @param errstr_size Size of error message buffer + * + * @returns The extracted token or NULL on failure.
+ */ +static char *rd_kafka_oidc_token_try_validate(cJSON *json, + const char *field, + char **sub, + double *exp, + char *errstr, + size_t errstr_size) { + cJSON *access_token_json, *jwt_exp, *jwt_sub, *payloads = NULL; + char *jwt_token = NULL, *decoded_payloads = NULL; + const char *decode_errstr = NULL; + *sub = NULL; + + access_token_json = cJSON_GetObjectItem(json, field); + + if (!access_token_json) { + rd_snprintf(errstr, errstr_size, + "Expected JSON response with \"%s\" field", field); + goto fail; + } + + jwt_token = cJSON_GetStringValue(access_token_json); + if (!jwt_token) { + rd_snprintf(errstr, errstr_size, + "Expected token as a string value"); + goto fail; + } + + decode_errstr = + rd_kafka_jwt_b64_decode_payload(jwt_token, &decoded_payloads); + if (decode_errstr != NULL) { + rd_snprintf(errstr, errstr_size, + "Failed to decode JWT payload: %s", decode_errstr); + goto fail; + } + + payloads = cJSON_Parse(decoded_payloads); + if (payloads == NULL) { + rd_snprintf(errstr, errstr_size, + "Failed to parse JSON JWT payload"); + goto fail; + } + + jwt_exp = cJSON_GetObjectItem(payloads, "exp"); + if (jwt_exp == NULL) { + rd_snprintf(errstr, errstr_size, + "Expected JSON JWT response with " + "\"exp\" field"); + goto fail; + } + + *exp = cJSON_GetNumberValue(jwt_exp); + if (*exp <= 0) { + rd_snprintf(errstr, errstr_size, + "Expected JSON JWT response with " + "valid \"exp\" field"); + goto fail; + } + + jwt_sub = cJSON_GetObjectItem(payloads, "sub"); + if (jwt_sub == NULL) { + rd_snprintf(errstr, errstr_size, + "Expected JSON JWT response with " + "\"sub\" field"); + goto fail; + } + + *sub = cJSON_GetStringValue(jwt_sub); + if (*sub == NULL) { + rd_snprintf(errstr, errstr_size, + "Expected JSON JWT response with " + "valid \"sub\" field"); + goto fail; + } + *sub = rd_strdup(*sub); +done: + if (payloads) + cJSON_Delete(payloads); + if (decoded_payloads) + rd_free(decoded_payloads); + return jwt_token; +fail: + jwt_token = NULL; + goto done; +} + +/** + * @brief Implementation of JWT token refresh callback function. + * Creates a JWT assertion, exchanges it for an access token, + * and sets the token for SASL OAUTHBEARER authentication. + * + * @param rk The rd_kafka_t instance. + * @param oauthbearer_config The OAUTHBEARER configuration. + * @param opaque Opaque pointer passed to the callback. 
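+ *
+ * The token request POSTed to the endpoint is the RFC 7523 section 2.1
+ * JWT bearer grant built by rd_kafka_oidc_jwt_bearer_build_request_body(),
+ * e.g. (assertion and scope values are illustrative):
+ *   grant_type=urn:ietf:params:oauth:grant-type:jwt-bearer
+ *   &assertion=eyJhbGciOi...&scope=api.read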
+ * + * @locality rdkafka main thread + */ +void rd_kafka_oidc_token_jwt_bearer_refresh_cb(rd_kafka_t *rk, + const char *oauthbearer_config, + void *opaque) { + const int timeout_s = 20; + const int retry = 4; + const int retry_ms = 5 * 1000; + + char *jwt_assertion = NULL; + char *request_body = NULL; + struct curl_slist *headers = NULL; + rd_http_error_t *herr = NULL; + cJSON *json = NULL; + char *jwt_token = NULL; + char set_token_errstr[512]; + double exp = 0; + char **extensions = NULL; + char **extension_key_value = NULL; + size_t extension_key_value_cnt = 0; + size_t extension_cnt; + char *sub = NULL; + char validate_errstr[512]; + + if (rd_kafka_terminating(rk)) + return; + + if (rk->rk_conf.sasl.oauthbearer.assertion.file) { + jwt_assertion = rd_kafka_oidc_assertion_read_from_file( + rk->rk_conf.sasl.oauthbearer.assertion.file); + } else { + jwt_assertion = rd_kafka_oidc_assertion_create( + rk, rk->rk_conf.sasl.oauthbearer.assertion.private_key.pem, + rk->rk_conf.sasl.oauthbearer.assertion.private_key.file, + rk->rk_conf.sasl.oauthbearer.assertion.private_key + .passphrase, + rk->rk_conf.sasl.oauthbearer.assertion.algorithm, + rk->rk_conf.sasl.oauthbearer.assertion.jwt_template_file, + rk->rk_conf.sasl.oauthbearer.assertion.claim.subject, + rk->rk_conf.sasl.oauthbearer.assertion.claim.issuer, + rk->rk_conf.sasl.oauthbearer.assertion.claim.audience, + rk->rk_conf.sasl.oauthbearer.assertion.claim.not_before_s, + rk->rk_conf.sasl.oauthbearer.assertion.claim.expiration_s, + rk->rk_conf.sasl.oauthbearer.assertion.claim.jti_include); + } + + if (!jwt_assertion) { + rd_kafka_oauthbearer_set_token_failure( + rk, "Failed to create JWT assertion"); + goto done; + } + + request_body = rd_kafka_oidc_jwt_bearer_build_request_body( + jwt_assertion, rk->rk_conf.sasl.oauthbearer.scope); + + if (!request_body) { + rd_kafka_oauthbearer_set_token_failure( + rk, "Failed to build JWT request body"); + goto done; + } + + headers = curl_slist_append( + headers, "Content-Type: application/x-www-form-urlencoded"); + headers = curl_slist_append(headers, "Accept: application/json"); + + herr = rd_http_post_expect_json( + rk, rk->rk_conf.sasl.oauthbearer.token_endpoint_url, headers, + request_body, strlen(request_body), timeout_s, retry, retry_ms, + &json); + + if (unlikely(herr != NULL)) { + rd_kafka_log( + rk, LOG_ERR, "JWT", + "Failed to retrieve JWT token from \"%s\": %s (%d)", + rk->rk_conf.sasl.oauthbearer.token_endpoint_url, + herr->errstr, herr->code); + rd_kafka_oauthbearer_set_token_failure(rk, herr->errstr); + rd_http_error_destroy(herr); + goto done; + } + + /* + * RFC 7523 Section 1 says that an access token should be returned + * https://datatracker.ietf.org/doc/html/rfc7523#section-1 + * Some providers (e.g. GCP) return an `id_token` instead, depending + * on the configured `target_audience` in the request JWT bearer token. + * This may be because the validation endpoint is not accessible + * for validating the `access_token` while the `id_token` is validated + * through the JWKS URL. + * This function will try to validate the `access_token` and then the + * `id_token`. 
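+         *
+         * Illustrative responses handled below (values are hypothetical):
+         *   { "access_token": "<jwt>", "token_type": "Bearer" }
+         *   { "id_token": "<jwt>" }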
+         */
+        jwt_token = rd_kafka_oidc_token_try_validate(json, "access_token", &sub,
+                                                     &exp, validate_errstr,
+                                                     sizeof(validate_errstr));
+        if (!jwt_token)
+                jwt_token = rd_kafka_oidc_token_try_validate(
+                    json, "id_token", &sub, &exp, validate_errstr,
+                    sizeof(validate_errstr));
+
+        if (!jwt_token) {
+                rd_kafka_oauthbearer_set_token_failure(rk, validate_errstr);
+                goto done;
+        }
+
+
+        if (rk->rk_conf.sasl.oauthbearer.extensions_str) {
+                extensions =
+                    rd_string_split(rk->rk_conf.sasl.oauthbearer.extensions_str,
+                                    ',', rd_true, &extension_cnt);
+
+                extension_key_value = rd_kafka_conf_kv_split(
+                    (const char **)extensions, extension_cnt,
+                    &extension_key_value_cnt);
+        }
+
+        if (rd_kafka_oauthbearer_set_token(
+                rk, jwt_token, (int64_t)exp * 1000, sub,
+                (const char **)extension_key_value, extension_key_value_cnt,
+                set_token_errstr,
+                sizeof(set_token_errstr)) != RD_KAFKA_RESP_ERR_NO_ERROR) {
+                rd_kafka_oauthbearer_set_token_failure(rk, set_token_errstr);
+        }
+
+done:
+        RD_IF_FREE(sub, rd_free);
+        RD_IF_FREE(jwt_assertion, rd_free);
+        RD_IF_FREE(request_body, rd_free);
+        RD_IF_FREE(headers, curl_slist_free_all);
+        RD_IF_FREE(json, cJSON_Delete);
+        RD_IF_FREE(extensions, rd_free);
+        RD_IF_FREE(extension_key_value, rd_free);
+        /* jwt_token is freed as part of the json object */
+}
 
 /**
  * @brief Implementation of Oauth/OIDC token refresh callback function,
@@ -239,37 +906,33 @@ static void rd_kafka_oidc_build_post_fields(const char *scope,
  *        then extract the jwt from the JSON response, and forward it to
  *        the broker.
  */
-void rd_kafka_oidc_token_refresh_cb(rd_kafka_t *rk,
-                                    const char *oauthbearer_config,
-                                    void *opaque) {
+void rd_kafka_oidc_token_client_credentials_refresh_cb(
+    rd_kafka_t *rk,
+    const char *oauthbearer_config,
+    void *opaque) {
         const int timeout_s = 20;
         const int retry     = 4;
         const int retry_ms  = 5 * 1000;
 
         double exp;
 
-        cJSON *json = NULL;
-        cJSON *payloads = NULL;
-        cJSON *parsed_token, *jwt_exp, *jwt_sub;
+        cJSON *json = NULL;
 
         rd_http_error_t *herr;
 
         char *jwt_token;
-        char *post_fields;
-        char *decoded_payloads = NULL;
+        char *post_fields = NULL;
 
         struct curl_slist *headers = NULL;
 
         const char *token_url;
-        const char *sub;
-        const char *errstr;
+        char *sub = NULL;
 
         size_t post_fields_size;
         size_t extension_cnt;
         size_t extension_key_value_cnt = 0;
 
         char set_token_errstr[512];
-        char decode_payload_errstr[512];
 
         char **extensions = NULL;
         char **extension_key_value = NULL;
@@ -277,13 +940,14 @@ void rd_kafka_oidc_token_refresh_cb(rd_kafka_t *rk,
         if (rd_kafka_terminating(rk))
                 return;
 
-        rd_kafka_oidc_build_headers(rk->rk_conf.sasl.oauthbearer.client_id,
-                                    rk->rk_conf.sasl.oauthbearer.client_secret,
-                                    &headers);
+        rd_kafka_oidc_client_credentials_build_headers(
+            rk->rk_conf.sasl.oauthbearer.client_id,
+            rk->rk_conf.sasl.oauthbearer.client_secret, &headers);
 
         /* Build post fields */
-        rd_kafka_oidc_build_post_fields(rk->rk_conf.sasl.oauthbearer.scope,
-                                        &post_fields, &post_fields_size);
+        rd_kafka_oidc_client_credentials_build_post_fields(
+            rk->rk_conf.sasl.oauthbearer.scope, &post_fields,
+            &post_fields_size);
 
         token_url = rk->rk_conf.sasl.oauthbearer.token_endpoint_url;
 
@@ -301,75 +965,11 @@ void rd_kafka_oidc_token_refresh_cb(rd_kafka_t *rk,
                 goto done;
         }
 
-        parsed_token = cJSON_GetObjectItem(json, "access_token");
-
-        if (parsed_token == NULL) {
-                rd_kafka_oauthbearer_set_token_failure(
-                    rk,
-                    "Expected JSON JWT response with "
-                    "\"access_token\" field");
-                goto done;
-        }
-
-        jwt_token = cJSON_GetStringValue(parsed_token);
-        if (jwt_token == NULL) {
-
rd_kafka_oauthbearer_set_token_failure( - rk, - "Expected JSON " - "response as a value string"); - goto done; - } - - errstr = rd_kafka_jwt_b64_decode_payload(jwt_token, &decoded_payloads); - if (errstr != NULL) { - rd_snprintf(decode_payload_errstr, - sizeof(decode_payload_errstr), - "Failed to decode JWT payload: %s", errstr); - rd_kafka_oauthbearer_set_token_failure(rk, - decode_payload_errstr); - goto done; - } - - payloads = cJSON_Parse(decoded_payloads); - if (payloads == NULL) { - rd_kafka_oauthbearer_set_token_failure( - rk, "Failed to parse JSON JWT payload"); - goto done; - } - - jwt_exp = cJSON_GetObjectItem(payloads, "exp"); - if (jwt_exp == NULL) { - rd_kafka_oauthbearer_set_token_failure( - rk, - "Expected JSON JWT response with " - "\"exp\" field"); - goto done; - } - - exp = cJSON_GetNumberValue(jwt_exp); - if (exp <= 0) { - rd_kafka_oauthbearer_set_token_failure( - rk, - "Expected JSON JWT response with " - "valid \"exp\" field"); - goto done; - } - - jwt_sub = cJSON_GetObjectItem(payloads, "sub"); - if (jwt_sub == NULL) { - rd_kafka_oauthbearer_set_token_failure( - rk, - "Expected JSON JWT response with " - "\"sub\" field"); - goto done; - } - - sub = cJSON_GetStringValue(jwt_sub); - if (sub == NULL) { - rd_kafka_oauthbearer_set_token_failure( - rk, - "Expected JSON JWT response with " - "valid \"sub\" field"); + jwt_token = rd_kafka_oidc_token_try_validate(json, "access_token", &sub, + &exp, set_token_errstr, + sizeof(set_token_errstr)); + if (!jwt_token) { + rd_kafka_oauthbearer_set_token_failure(rk, set_token_errstr); goto done; } @@ -391,16 +991,14 @@ void rd_kafka_oidc_token_refresh_cb(rd_kafka_t *rk, rd_kafka_oauthbearer_set_token_failure(rk, set_token_errstr); done: - RD_IF_FREE(decoded_payloads, rd_free); + RD_IF_FREE(sub, rd_free); RD_IF_FREE(post_fields, rd_free); RD_IF_FREE(json, cJSON_Delete); RD_IF_FREE(headers, curl_slist_free_all); RD_IF_FREE(extensions, rd_free); RD_IF_FREE(extension_key_value, rd_free); - RD_IF_FREE(payloads, cJSON_Delete); } - /** * @brief Make sure the jwt is able to be extracted from HTTP(S) response. 
* The JSON response after HTTP(S) call to token provider will be in @@ -436,10 +1034,11 @@ static int ut_sasl_oauthbearer_oidc_should_succeed(void) { cJSON *json = NULL; char *token; cJSON *parsed_token; + rd_kafka_t *rk = rd_calloc(1, sizeof(*rk)); RD_UT_BEGIN(); - herr = rd_http_req_init(&hreq, ""); + herr = rd_http_req_init(rk, &hreq, ""); RD_UT_ASSERT(!herr, "Expected initialize to succeed, " @@ -475,6 +1074,7 @@ static int ut_sasl_oauthbearer_oidc_should_succeed(void) { rd_http_error_destroy(herr); rd_http_req_destroy(&hreq); cJSON_Delete(json); + rd_free(rk); RD_UT_PASS(); } @@ -491,10 +1091,11 @@ static int ut_sasl_oauthbearer_oidc_with_empty_key(void) { rd_http_error_t *herr; cJSON *json = NULL; cJSON *parsed_token; + rd_kafka_t *rk = rd_calloc(1, sizeof(*rk)); RD_UT_BEGIN(); - herr = rd_http_req_init(&hreq, ""); + herr = rd_http_req_init(rk, &hreq, ""); RD_UT_ASSERT(!herr, "Expected initialization to succeed, " "but it failed with error code: %d, error string: %s", @@ -522,6 +1123,7 @@ static int ut_sasl_oauthbearer_oidc_with_empty_key(void) { rd_http_error_destroy(herr); cJSON_Delete(json); cJSON_Delete(parsed_token); + rd_free(rk); RD_UT_PASS(); } @@ -541,7 +1143,8 @@ static int ut_sasl_oauthbearer_oidc_post_fields(void) { RD_UT_BEGIN(); - rd_kafka_oidc_build_post_fields(scope, &post_fields, &post_fields_size); + rd_kafka_oidc_client_credentials_build_post_fields(scope, &post_fields, + &post_fields_size); RD_UT_ASSERT(expected_post_fields_size == post_fields_size, "Expected expected_post_fields_size is %" PRIusz @@ -573,7 +1176,8 @@ static int ut_sasl_oauthbearer_oidc_post_fields_with_empty_scope(void) { RD_UT_BEGIN(); - rd_kafka_oidc_build_post_fields(scope, &post_fields, &post_fields_size); + rd_kafka_oidc_client_credentials_build_post_fields(scope, &post_fields, + &post_fields_size); RD_UT_ASSERT(expected_post_fields_size == post_fields_size, "Expected expected_post_fields_size is %" PRIusz @@ -602,3 +1206,342 @@ int unittest_sasl_oauthbearer_oidc(void) { fails += ut_sasl_oauthbearer_oidc_post_fields_with_empty_scope(); return fails; } + +/** + * @brief Test the Base64Url encoding functionality. + * Verifies that the encoding correctly handles special characters + * and padding removal. + */ +static int ut_sasl_oauthbearer_oidc_jwt_bearer_base64url_encode(void) { + /* Test cases with expected inputs and outputs */ + static const struct { + const char *input; + const char *expected_output; + } test_cases[] = { + /* Regular case */ + {"Hello, world!", "SGVsbG8sIHdvcmxkIQ"}, + /* Case with padding characters that should be removed */ + {"test", "dGVzdA"}, + /* Empty string */ + {"", ""}, + /* Special characters that trigger Base64 padding */ + {"f", "Zg"}, + {"fo", "Zm8"}, + {"foo", "Zm9v"}, + {"foob", "Zm9vYg"}, + {"fooba", "Zm9vYmE"}, + {"foobar", "Zm9vYmFy"}, + /* Characters that produce + and / in standard Base64 */ + {"\x3E\x3F", + "Pj8"}, /* encodes to ">?" 
in standard Base64 with + and / */ + }; + unsigned int i; + + RD_UT_BEGIN(); + + for (i = 0; i < RD_ARRAYSIZE(test_cases); i++) { + rd_chariov_t input_iov; + input_iov.ptr = (char *)test_cases[i].input; + input_iov.size = strlen(test_cases[i].input); + char *output = rd_base64_encode_str_urlsafe(&input_iov); + + RD_UT_ASSERT(output != NULL, + "Expected non-NULL output for input: %s", + test_cases[i].input); + + RD_UT_ASSERT(!strcmp(output, test_cases[i].expected_output), + "Base64Url encoding failed: expected %s, got %s", + test_cases[i].expected_output, output); + + rd_free(output); + } + + RD_UT_PASS(); +} + +/** + * @brief Test JWT request body building. + * Verifies that the request body is correctly formatted with + * the required parameters. + */ +static int ut_sasl_oauthbearer_oidc_jwt_bearer_build_request_body(void) { + const char *assertion = "test.jwt.assertion"; + const char *scope = "test.scope"; + const char *expected = + "grant_type=urn:ietf:params:oauth:grant-type:jwt-bearer&assertion=" + "test.jwt.assertion&scope=test.scope"; + char *body; + + RD_UT_BEGIN(); + + body = rd_kafka_oidc_jwt_bearer_build_request_body(assertion, scope); + + RD_UT_ASSERT(body != NULL, "Expected non-NULL request body"); + + RD_UT_ASSERT(!strcmp(body, expected), + "Request body incorrect: expected '%s', got '%s'", + expected, body); + + rd_free(body); + + RD_UT_PASS(); +} + +/** + * @brief Test JWT assertion file parsing. + * Verifies that the function correctly reads a JWT from a file. + */ +static int ut_sasl_oauthbearer_oidc_assertion_parse_from_file(void) { + + char tempfile_path[512]; + FILE *tempfile; + const char *test_jwt = "header.payload.signature"; + char *result; + + RD_UT_BEGIN(); + + tempfile = rd_file_mkstemp("rdtmp", "wb", tempfile_path, + sizeof(tempfile_path)); + fprintf(tempfile, "%s", test_jwt); + fclose(tempfile); + + /* Test parsing from file */ + result = rd_kafka_oidc_assertion_read_from_file(tempfile_path); + RD_UT_ASSERT(result != NULL, + "Expected non-NULL result from parsing file"); + RD_UT_ASSERT(!strcmp(result, test_jwt), + "Incorrect JWT parsed: expected '%s', got '%s'", test_jwt, + result); + + rd_free(result); + + /* Test with NULL path */ + result = rd_kafka_oidc_assertion_read_from_file(NULL); + RD_UT_ASSERT(result == NULL, "Expected NULL result with NULL path"); + + /* Test with non-existent file */ + result = + rd_kafka_oidc_assertion_read_from_file("/non/existent/file/path"); + RD_UT_ASSERT(result == NULL, + "Expected NULL result with non-existent file"); + + remove(tempfile_path); + + RD_UT_PASS(); +} + +/** + * @brief Mock function for testing JWT template processing. + * Creates a file with valid JWT template JSON. + */ +static char *ut_create_mock_jwt_template_file(void) { + FILE *tempfile; + char tempfile_path[512]; + + const char *template_json = + "{\n" + " \"header\": {\n" + " \"kid\": \"test-key-id\"\n" + " },\n" + " \"payload\": {\n" + " \"sub\": \"test-subject\",\n" + " \"aud\": \"test-audience\"\n" + " }\n" + "}"; + + tempfile = rd_file_mkstemp("rdtmp", "wb", tempfile_path, + sizeof(tempfile_path)); + if (!tempfile) + return NULL; + + fprintf(tempfile, "%s", template_json); + fclose(tempfile); + + return rd_strdup(tempfile_path); +} + +/** + * @brief Test JWT template file processing. + * Verifies that the function correctly parses header and payload from + * template. 
+ */ +static int ut_sasl_oauthbearer_oidc_assertion_process_template_file(void) { + char *template_path; + rd_kafka_t *rk; + cJSON *header = NULL, *payload = NULL; + int result; + + RD_UT_BEGIN(); + + rk = rd_calloc(1, sizeof(*rk)); + + template_path = ut_create_mock_jwt_template_file(); + RD_UT_ASSERT(template_path != NULL, "Failed to create template file"); + + /* Test template processing */ + result = rd_kafka_oidc_assertion_parse_template_file(rk, template_path, + &header, &payload); + RD_UT_ASSERT(result == 0, "Expected success from template processing"); + RD_UT_ASSERT(header != NULL, "Expected non-NULL header JSON"); + RD_UT_ASSERT(payload != NULL, "Expected non-NULL payload JSON"); + + /* Verify header contents */ + cJSON *kid = cJSON_GetObjectItem(header, "kid"); + RD_UT_ASSERT(kid != NULL, "Expected kid in header"); + RD_UT_ASSERT(cJSON_IsString(kid), "Expected kid to be string"); + RD_UT_ASSERT(!strcmp(cJSON_GetStringValue(kid), "test-key-id"), + "Incorrect kid value"); + + /* Verify payload contents */ + cJSON *sub = cJSON_GetObjectItem(payload, "sub"); + RD_UT_ASSERT(sub != NULL, "Expected sub in payload"); + RD_UT_ASSERT(cJSON_IsString(sub), "Expected sub to be string"); + RD_UT_ASSERT(!strcmp(cJSON_GetStringValue(sub), "test-subject"), + "Incorrect sub value"); + + cJSON *aud = cJSON_GetObjectItem(payload, "aud"); + RD_UT_ASSERT(aud != NULL, "Expected aud in payload"); + RD_UT_ASSERT(cJSON_IsString(aud), "Expected aud to be string"); + RD_UT_ASSERT(!strcmp(cJSON_GetStringValue(aud), "test-audience"), + "Incorrect aud value"); + + /* Test with non-existent file */ + cJSON_Delete(header); + cJSON_Delete(payload); + header = NULL; + payload = NULL; + + result = rd_kafka_oidc_assertion_parse_template_file( + rk, "/non/existent/file", &header, &payload); + RD_UT_ASSERT(result == -1, "Expected failure with non-existent file"); + RD_UT_ASSERT(header == NULL, + "Expected NULL header with failed processing"); + RD_UT_ASSERT(payload == NULL, + "Expected NULL payload with failed processing"); + + unlink(template_path); + rd_free(template_path); + rd_free(rk); + if (header) + cJSON_Delete(header); + if (payload) + cJSON_Delete(payload); + + RD_UT_PASS(); +} + +/** + * @brief Test JWT assertion creation with minimal approach. + * Creates a simplified test that validates the format of the created + * JWT. 
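+ *
+ *        A well-formed assertion is a JWS compact serialization (RFC 7515):
+ *          base64url(header).base64url(payload).base64url(signature)
+ *        The base64url alphabet uses '-' and '_' and drops '=' padding,
+ *        which is what the character checks below verify.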
+ */ +static int ut_sasl_oauthbearer_oidc_assertion_create(void) { + rd_kafka_t *rk; + char *private_key_pem; + char *jwt; + char *header_part, *payload_part, *signature_part; + char *dot1, *dot2; + + RD_UT_BEGIN(); + + rk = rd_calloc(1, sizeof(*rk)); + + /* Random key for signing */ + private_key_pem = + "-----BEGIN PRIVATE KEY-----\n" + "MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCuBS7qG5Cd2voa\n" + "7nSU2xaDbe6QOYU2P4bIY58SKHbFyq1iB517r61ImsWD+UfZuVxCqXRaWdxxnG/D\n" + "5VGTQzBOZYlgSYxdJ1KvITXO8kj5i2zBT/LI9R9MTQ7nLFh+vQm1aM8Ts1PmA5t9\n" + "zFtR9B8RfqN9kbt+2LnLY57aJxEkFC3D89D0WWT97UJWKo7/vxMqp9K9uAIL2Efo\n" + "5rp9qwyPbx9LmTbfZ8Vog6mG6tAQQHSUqw0PnfhADCVCkYtkzYcyDZy3qZQFu1bY\n" + "KuuMoMjssyCUL5tTHyNZju0p3Z0bSfOV/nkqHpSSjHKCeQkSKS18/7In6cfY/M4k\n" + "8rM4HWkdAgMBAAECggEAFsTo2YrXxj/Dn8h5ioyMCpBUuZw9GNcBDLE0PAz9VW3q\n" + "d7wlV+ypkKlnlJgGVa+SKcrARZ4iYN8mJIyZutn8tRVF/0pASmP9xppizvwWnkgm\n" + "57hNPQwNl08x1v+PaK3VWl4nUh2RqbPpIXGetT9q3UAjpiduT++Nh9Y2D7cy3/Ro\n" + "ritnpBDs1R6y5J3rxiE1s8kLYwhDRCPsgUg/ZtKPDTTFz42ArrFeqM91FmjHYP3t\n" + "p9Uh6CIZ80D6CsMX/TnZFfhKe6EvKBSl4W6tcdFlnXW52fm/670iKSmcJ09+fzPO\n" + "T1BLrkXGv51bFnlvUyJqQGVEv5+0+HUX/oTpTknMQQKBgQDbYhqip5e8r1f5v32B\n" + "k1r3xtEiWU2mZoTHJu6bVeuigzVhz4pTMVZChElJ4QnhwwO0t5Oe4Su1MZtjMRw7\n" + "qIE+YM2pXss25LRXbmWItuRWINzpe8omlxQSOj2tNO/67l0P4vmmrT5wkU2cG6TR\n" + "ddzorO3NDA4MY4+Xdli+SHXwUQKBgQDLEMqlwyvaGjuZ30l6F13fWnEt9PNCtJsa\n" + "nsdKJKyFMThdysY/PK40o2GTRRhgYa2jigN3OCYSSznRRZRlqznqL1bOLlYV6zS8\n" + "TGhdLXuApyLAjZYIK4RtZJYGR9+yg8rH13uNektgW8KnHh5Ko/ptRVoEukf3SBsh\n" + "f0Fib3ylDQKBgE11Bth0+bMJ6bLpNEPiphSjosVQ6ISe37R8/3Pi0y5uyxM8tqcG\n" + "3WDg2gt2pAmM1CsjQcCv2cHAwQ81kLVTmkZO4W4yZOd9ulrARKMPh/EM61KYfVhA\n" + "sTp6S7py3WQocr0gM2rw8gHGm7NJY1j9F0EjhVaHMhKXuGQOyehtJw7xAoGAPwuA\n" + "jwRQSg+Y74XmbxRwHZcbynPhTpV6DkK7huZp9ZQ5ds0szZdOUqNi+PEbx1isKzj/\n" + "KHVzRHy8f5+FmicV/QIjhjHWokl6/vcN89faHzBE1tleejzgiYIQHfUUm3zVaUQa\n" + "ZOtSGaGDhpUQPIY6itBcSVl4XGqzmavDpgcNAMUCgYBFFGtG+RbSySzKfRUp3vc5\n" + "8YqIdrtXfW9gc9s1+Pw8wfgrY0Rrvy+e3ClSwgGENxgxBvWvhzq2m0S8x2jdLAl1\n" + "b+VLGCOpUvS4iN2yrHkoHS7BSW40wLuVooJUAaNOIEPqiv1JC75q2dhTRrANp6WB\n" + "bm+7yWVTNlXYuKQqtuOkNQ==\n" + "-----END PRIVATE KEY-----\n"; + + jwt = rd_kafka_oidc_assertion_create( + rk, private_key_pem, NULL, NULL, + RD_KAFKA_SASL_OAUTHBEARER_ASSERTION_ALGORITHM_RS256, NULL, + "test-subject", "test-issuer", "test-audience", 2, 300, rd_true); + + RD_UT_ASSERT(jwt != NULL, "Failed to create JWT assertion"); + + dot1 = strchr(jwt, '.'); + RD_UT_ASSERT(dot1 != NULL, "JWT missing first dot separator"); + + dot2 = strchr(dot1 + 1, '.'); + RD_UT_ASSERT(dot2 != NULL, "JWT missing second dot separator"); + + header_part = rd_strndup(jwt, dot1 - jwt); + payload_part = rd_strndup(dot1 + 1, dot2 - (dot1 + 1)); + signature_part = rd_strdup(dot2 + 1); + + RD_UT_ASSERT(strlen(header_part) > 0, "JWT header part is empty"); + RD_UT_ASSERT(strlen(payload_part) > 0, "JWT payload part is empty"); + RD_UT_ASSERT(strlen(signature_part) > 0, "JWT signature part is empty"); + + RD_UT_ASSERT(!strchr(header_part, '='), + "JWT header contains padding character"); + RD_UT_ASSERT(!strchr(payload_part, '='), + "JWT payload contains padding character"); + RD_UT_ASSERT(!strchr(signature_part, '='), + "JWT signature contains padding character"); + + RD_UT_ASSERT(!strchr(header_part, '+'), + "JWT header contains '+' character"); + RD_UT_ASSERT(!strchr(header_part, '/'), + "JWT header contains '/' character"); + RD_UT_ASSERT(!strchr(payload_part, '+'), + "JWT payload contains '+' character"); + 
RD_UT_ASSERT(!strchr(payload_part, '/'), + "JWT payload contains '/' character"); + RD_UT_ASSERT(!strchr(signature_part, '+'), + "JWT signature contains '+' character"); + RD_UT_ASSERT(!strchr(signature_part, '/'), + "JWT signature contains '/' character"); + + rd_free(header_part); + rd_free(payload_part); + rd_free(signature_part); + rd_free(jwt); + rd_free(rk); + + RD_UT_PASS(); +} + +int unittest_sasl_oauthbearer_oidc_jwt_bearer(void) { + int fails = 0; + + fails += ut_sasl_oauthbearer_oidc_jwt_bearer_base64url_encode(); + fails += ut_sasl_oauthbearer_oidc_jwt_bearer_build_request_body(); + + return fails; +} + +int unittest_sasl_oauthbearer_oidc_assertion(void) { + int fails = 0; + + fails += ut_sasl_oauthbearer_oidc_assertion_parse_from_file(); + fails += ut_sasl_oauthbearer_oidc_assertion_process_template_file(); + fails += ut_sasl_oauthbearer_oidc_assertion_create(); + + return fails; +} diff --git a/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer_oidc.h b/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer_oidc.h index a944f2efa10..eed50ba83d2 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer_oidc.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_sasl_oauthbearer_oidc.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2021 Magnus Edenhill + * Copyright (c) 2021-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,9 +28,14 @@ #ifndef _RDKAFKA_SASL_OAUTHBEARER_OIDC_H_ #define _RDKAFKA_SASL_OAUTHBEARER_OIDC_H_ -void rd_kafka_oidc_token_refresh_cb(rd_kafka_t *rk, - const char *oauthbearer_config, - void *opaque); +void rd_kafka_oidc_token_jwt_bearer_refresh_cb(rd_kafka_t *rk, + const char *oauthbearer_config, + void *opaque); + +void rd_kafka_oidc_token_client_credentials_refresh_cb( + rd_kafka_t *rk, + const char *oauthbearer_config, + void *opaque); int unittest_sasl_oauthbearer_oidc(void); diff --git a/src/third_party/librdkafka/dist/src/rdkafka_sasl_plain.c b/src/third_party/librdkafka/dist/src/rdkafka_sasl_plain.c index 1e715cfba22..3c817c64e3e 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_sasl_plain.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_sasl_plain.c @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -81,9 +81,9 @@ int rd_kafka_sasl_plain_client_new(rd_kafka_transport_t *rktrans, cidlen = rk->rk_conf.sasl.username ? (int)strlen(rk->rk_conf.sasl.username) : 0; - pwlen = rk->rk_conf.sasl.password - ? (int)strlen(rk->rk_conf.sasl.password) - : 0; + pwlen = rk->rk_conf.sasl.password + ? (int)strlen(rk->rk_conf.sasl.password) + : 0; buf = rd_alloca(zidlen + 1 + cidlen + 1 + pwlen + 1); diff --git a/src/third_party/librdkafka/dist/src/rdkafka_sasl_scram.c b/src/third_party/librdkafka/dist/src/rdkafka_sasl_scram.c index 7d5db564964..689b91284ef 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_sasl_scram.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_sasl_scram.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -37,6 +38,7 @@ #include "rdkafka_sasl_int.h" #include "rdrand.h" #include "rdunittest.h" +#include "rdbase64.h" #if WITH_SSL @@ -52,9 +54,10 @@ * @brief Per-connection state */ struct rd_kafka_sasl_scram_state { - enum { RD_KAFKA_SASL_SCRAM_STATE_CLIENT_FIRST_MESSAGE, - RD_KAFKA_SASL_SCRAM_STATE_SERVER_FIRST_MESSAGE, - RD_KAFKA_SASL_SCRAM_STATE_CLIENT_FINAL_MESSAGE, + enum { + RD_KAFKA_SASL_SCRAM_STATE_CLIENT_FIRST_MESSAGE, + RD_KAFKA_SASL_SCRAM_STATE_SERVER_FIRST_MESSAGE, + RD_KAFKA_SASL_SCRAM_STATE_CLIENT_FINAL_MESSAGE, } state; rd_chariov_t cnonce; /* client c-nonce */ rd_chariov_t first_msg_bare; /* client-first-message-bare */ @@ -76,6 +79,7 @@ static void rd_kafka_sasl_scram_close(rd_kafka_transport_t *rktrans) { RD_IF_FREE(state->first_msg_bare.ptr, rd_free); RD_IF_FREE(state->ServerSignatureB64, rd_free); rd_free(state); + rktrans->rktrans_sasl.state = NULL; } @@ -140,77 +144,6 @@ static char *rd_kafka_sasl_scram_get_attr(const rd_chariov_t *inbuf, } -/** - * @brief Base64 encode binary input \p in - * @returns a newly allocated, base64-encoded string or NULL on error. - */ -static char *rd_base64_encode(const rd_chariov_t *in) { - char *ret; - size_t ret_len, max_len; - - /* OpenSSL takes an |int| argument so the input cannot exceed that. */ - if (in->size > INT_MAX) { - return NULL; - } - - /* This does not overflow given the |INT_MAX| bound, above. */ - max_len = (((in->size + 2) / 3) * 4) + 1; - ret = rd_malloc(max_len); - if (ret == NULL) { - return NULL; - } - - ret_len = - EVP_EncodeBlock((uint8_t *)ret, (uint8_t *)in->ptr, (int)in->size); - assert(ret_len < max_len); - ret[ret_len] = 0; - - return ret; -} - - -/** - * @brief Base64 decode input string \p in. Ignores leading and trailing - * whitespace. - * @returns -1 on invalid Base64, or 0 on successes in which case a - * newly allocated binary string is set in out (and size). - */ -static int rd_base64_decode(const rd_chariov_t *in, rd_chariov_t *out) { - size_t ret_len; - - /* OpenSSL takes an |int| argument, so |in->size| must not exceed - * that. */ - if (in->size % 4 != 0 || in->size > INT_MAX) { - return -1; - } - - ret_len = ((in->size / 4) * 3); - out->ptr = rd_malloc(ret_len + 1); - - if (EVP_DecodeBlock((uint8_t *)out->ptr, (uint8_t *)in->ptr, - (int)in->size) == -1) { - rd_free(out->ptr); - out->ptr = NULL; - return -1; - } - - /* EVP_DecodeBlock will pad the output with trailing NULs and count - * them in the return value. */ - if (in->size > 1 && in->ptr[in->size - 1] == '=') { - if (in->size > 2 && in->ptr[in->size - 2] == '=') { - ret_len -= 2; - } else { - ret_len -= 1; - } - } - - out->ptr[ret_len] = 0; - out->size = ret_len; - - return 0; -} - - /** * @brief Perform H(str) hash function and stores the result in \p out * which must be at least EVP_MAX_MD_SIZE. 
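/* The SCRAM-local rd_base64_encode()/rd_base64_decode() helpers removed
 * above are superseded by the shared rdbase64 module added elsewhere in
 * this patch. A minimal usage sketch, with signatures inferred from the
 * call sites in this diff (both encoders take a rd_chariov_t and return a
 * newly allocated NUL-terminated string): */
#include "rd.h"
#include "rdbase64.h"

static void example_base64_encode(void) {
        rd_chariov_t in;
        char *b64, *url;

        in.ptr  = (char *)"foobar";
        in.size = 6;

        b64 = rd_base64_encode_str(&in);         /* "Zm9vYmFy" */
        url = rd_base64_encode_str_urlsafe(&in); /* same result here, but the
                                                  * urlsafe variant emits '-'
                                                  * and '_' and no '=' pad */
        rd_free(b64);
        rd_free(url);
}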
@@ -254,8 +187,6 @@ static int rd_kafka_sasl_scram_HMAC(rd_kafka_transport_t *rktrans, return 0; } - - /** * @brief Perform \p itcnt iterations of HMAC() on the given buffer \p in * using \p salt, writing the output into \p out which must be @@ -267,57 +198,14 @@ static int rd_kafka_sasl_scram_Hi(rd_kafka_transport_t *rktrans, const rd_chariov_t *salt, int itcnt, rd_chariov_t *out) { + rd_kafka_broker_t *rkb = rktrans->rktrans_rkb; const EVP_MD *evp = rktrans->rktrans_rkb->rkb_rk->rk_conf.sasl.scram_evp; - unsigned int ressize = 0; - unsigned char tempres[EVP_MAX_MD_SIZE]; - unsigned char *saltplus; - int i; - - /* U1 := HMAC(str, salt + INT(1)) */ - saltplus = rd_alloca(salt->size + 4); - memcpy(saltplus, salt->ptr, salt->size); - saltplus[salt->size] = 0; - saltplus[salt->size + 1] = 0; - saltplus[salt->size + 2] = 0; - saltplus[salt->size + 3] = 1; - - /* U1 := HMAC(str, salt + INT(1)) */ - if (!HMAC(evp, (const unsigned char *)in->ptr, (int)in->size, saltplus, - salt->size + 4, tempres, &ressize)) { - rd_rkb_dbg(rktrans->rktrans_rkb, SECURITY, "SCRAM", - "HMAC priming failed"); - return -1; - } - - memcpy(out->ptr, tempres, ressize); - - /* Ui-1 := HMAC(str, Ui-2) .. */ - for (i = 1; i < itcnt; i++) { - unsigned char tempdest[EVP_MAX_MD_SIZE]; - int j; - - if (unlikely(!HMAC(evp, (const unsigned char *)in->ptr, - (int)in->size, tempres, ressize, tempdest, - NULL))) { - rd_rkb_dbg(rktrans->rktrans_rkb, SECURITY, "SCRAM", - "Hi() HMAC #%d/%d failed", i, itcnt); - return -1; - } - - /* U1 XOR U2 .. */ - for (j = 0; j < (int)ressize; j++) { - out->ptr[j] ^= tempdest[j]; - tempres[j] = tempdest[j]; - } - } - - out->size = ressize; - - return 0; + return rd_kafka_ssl_hmac(rkb, evp, in, salt, itcnt, out); } + /** * @returns a SASL value-safe-char encoded string, replacing "," and "=" * with their escaped counterparts in a newly allocated string. @@ -366,10 +254,9 @@ static char *rd_kafka_sasl_safe_string(const char *str) { * @brief Build client-final-message-without-proof * @remark out->ptr will be allocated and must be freed. 
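+ *
+ * For example, for a combined client+server nonce "abc123" the output is
+ * "c=biws,r=abc123", "biws" being the Base64 encoding of "n,,".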
*/ -static void rd_kafka_sasl_scram_build_client_final_message_wo_proof( - struct rd_kafka_sasl_scram_state *state, - const char *snonce, - rd_chariov_t *out) { +static void +rd_kafka_sasl_scram_build_client_final_message_wo_proof(const char *snonce, + rd_chariov_t *out) { const char *attr_c = "biws"; /* base64 encode of "n,," */ /* @@ -377,11 +264,9 @@ static void rd_kafka_sasl_scram_build_client_final_message_wo_proof( * channel-binding "," nonce ["," * extensions] */ - out->size = strlen("c=,r=") + strlen(attr_c) + state->cnonce.size + - strlen(snonce); - out->ptr = rd_malloc(out->size + 1); - rd_snprintf(out->ptr, out->size + 1, "c=%s,r=%.*s%s", attr_c, - (int)state->cnonce.size, state->cnonce.ptr, snonce); + out->size = strlen("c=,r=") + strlen(attr_c) + strlen(snonce); + out->ptr = rd_malloc(out->size + 1); + rd_snprintf(out->ptr, out->size + 1, "c=%s,r=%s", attr_c, snonce); } @@ -451,7 +336,7 @@ static int rd_kafka_sasl_scram_build_client_final_message( /* client-final-message-without-proof */ rd_kafka_sasl_scram_build_client_final_message_wo_proof( - state, server_nonce, &client_final_msg_wo_proof); + server_nonce, &client_final_msg_wo_proof); /* AuthMessage := client-first-message-bare + "," + * server-first-message + "," + @@ -486,7 +371,7 @@ static int rd_kafka_sasl_scram_build_client_final_message( } /* Store the Base64 encoded ServerSignature for quick comparison */ - state->ServerSignatureB64 = rd_base64_encode(&ServerSignature); + state->ServerSignatureB64 = rd_base64_encode_str(&ServerSignature); if (state->ServerSignatureB64 == NULL) { rd_free(client_final_msg_wo_proof.ptr); return -1; @@ -511,7 +396,7 @@ static int rd_kafka_sasl_scram_build_client_final_message( /* Base64 encoded ClientProof */ - ClientProofB64 = rd_base64_encode(&ClientProof); + ClientProofB64 = rd_base64_encode_str(&ClientProof); if (ClientProofB64 == NULL) { rd_free(client_final_msg_wo_proof.ptr); return -1; diff --git a/src/third_party/librdkafka/dist/src/rdkafka_sasl_win32.c b/src/third_party/librdkafka/dist/src/rdkafka_sasl_win32.c index b07e1808d0b..b968bcece36 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_sasl_win32.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_sasl_win32.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2016 Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -490,6 +491,7 @@ static void rd_kafka_sasl_win32_close(rd_kafka_transport_t *rktrans) { rd_free(state->cred); } rd_free(state); + rktrans->rktrans_sasl.state = NULL; } diff --git a/src/third_party/librdkafka/dist/src/rdkafka_ssl.c b/src/third_party/librdkafka/dist/src/rdkafka_ssl.c index 9961a240f71..dba5273db37 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_ssl.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_ssl.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -134,11 +135,14 @@ const char *rd_kafka_ssl_last_error_str(void) { * * If 'rkb' is non-NULL broker-specific logging will be used, * else it will fall back on global 'rk' debugging. + * + * `ctx_identifier` is a string used to customize the log message. 
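+ *
+ * For example, with `ctx_identifier` "kafka" an OpenSSL error is logged as
+ * "kafka: <error string>"; the rd_kafka_ssl_error() wrapper below passes
+ * exactly that identifier.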
*/ -static char *rd_kafka_ssl_error(rd_kafka_t *rk, - rd_kafka_broker_t *rkb, - char *errstr, - size_t errstr_size) { +char *rd_kafka_ssl_error0(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + const char *ctx_identifier, + char *errstr, + size_t errstr_size) { unsigned long l; const char *file, *data, *func; int line, flags; @@ -165,9 +169,11 @@ static char *rd_kafka_ssl_error(rd_kafka_t *rk, if (cnt++ > 0) { /* Log last message */ if (rkb) - rd_rkb_log(rkb, LOG_ERR, "SSL", "%s", errstr); + rd_rkb_log(rkb, LOG_ERR, "SSL", "%s: %s", + ctx_identifier, errstr); else - rd_kafka_log(rk, LOG_ERR, "SSL", "%s", errstr); + rd_kafka_log(rk, LOG_ERR, "SSL", "%s: %s", + ctx_identifier, errstr); } ERR_error_string_n(l, buf, sizeof(buf)); @@ -187,12 +193,18 @@ static char *rd_kafka_ssl_error(rd_kafka_t *rk, if (cnt == 0) rd_snprintf(errstr, errstr_size, - "No further error information available"); + "%s: No further error information available", + ctx_identifier); return errstr; } - +static char *rd_kafka_ssl_error(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + char *errstr, + size_t errstr_size) { + return rd_kafka_ssl_error0(rk, rkb, "kafka", errstr, errstr_size); +} /** * Set transport IO event polling based on SSL error. @@ -224,15 +236,24 @@ rd_kafka_transport_ssl_io_update(rd_kafka_transport_t *rktrans, if (serr2) rd_kafka_ssl_error(NULL, rktrans->rktrans_rkb, errstr, errstr_size); - else if (!rd_socket_errno || rd_socket_errno == ECONNRESET) + else if (!rd_socket_errno) { + rd_rkb_dbg(rktrans->rktrans_rkb, BROKER, "SOCKET", + "Disconnected: connection closed by " + "peer"); rd_snprintf(errstr, errstr_size, "Disconnected"); - else + } else if (rd_socket_errno == ECONNRESET) { + rd_rkb_dbg(rktrans->rktrans_rkb, BROKER, "SOCKET", + "Disconnected: connection reset by peer"); + rd_snprintf(errstr, errstr_size, "Disconnected"); + } else rd_snprintf(errstr, errstr_size, "SSL transport error: %s", rd_strerror(rd_socket_errno)); return -1; case SSL_ERROR_ZERO_RETURN: + rd_rkb_dbg(rktrans->rktrans_rkb, BROKER, "SOCKET", + "Disconnected: SSL connection closed by peer"); rd_snprintf(errstr, errstr_size, "Disconnected"); return -1; @@ -475,7 +496,8 @@ static int rd_kafka_transport_ssl_set_endpoint_id(rd_kafka_transport_t *rktrans, param = SSL_get0_param(rktrans->rktrans_ssl); - if (!X509_VERIFY_PARAM_set1_host(param, name, 0)) + if (!X509_VERIFY_PARAM_set1_host(param, name, + strnlen(name, sizeof(name)))) goto fail; } #else @@ -696,21 +718,91 @@ static EVP_PKEY *rd_kafka_ssl_PKEY_from_string(rd_kafka_t *rk, } /** - * @brief Parse a PEM-formatted string into an X509 object. + * Read a PEM formatted cert chain from BIO \p in into \p chainp . * - * @param str Input PEM string, nul-terminated + * @param rk rdkafka instance. + * @param in BIO to read from. + * @param chainp Stack to push the certificates to. + * + * @return 0 on success, -1 on error. + */ +int rd_kafka_ssl_read_cert_chain_from_BIO(BIO *in, + STACK_OF(X509) * chainp, + pem_password_cb *password_cb, + void *password_cb_opaque) { + X509 *ca; + int r, ret = 0; + unsigned long err; + while (1) { + ca = X509_new(); + if (ca == NULL) { + rd_assert(!*"X509_new() allocation failed"); + } + if (PEM_read_bio_X509(in, &ca, password_cb, + password_cb_opaque) != NULL) { + r = sk_X509_push(chainp, ca); + if (!r) { + X509_free(ca); + ret = -1; + goto end; + } + } else { + X509_free(ca); + break; + } + } + /* When the while loop ends, it's usually just EOF. 
*/ + err = ERR_peek_last_error(); + if (ERR_GET_LIB(err) == ERR_LIB_PEM && + ERR_GET_REASON(err) == PEM_R_NO_START_LINE) + ret = 0; + else + ret = -1; /* some real error */ + ERR_clear_error(); +end: + return ret; +} + +/** + * @brief Parse a PEM-formatted string into an X509 object. + * Rest of CA chain is pushed to the \p chainp stack. + * + * @param str Input PEM string, nul-terminated. + * @param chainp Stack to push the certificates to. * * @returns a new X509 on success or NULL on error. + * + * @remark When NULL is returned the chainp stack is not modified. */ -static X509 *rd_kafka_ssl_X509_from_string(rd_kafka_t *rk, const char *str) { +static X509 *rd_kafka_ssl_X509_from_string(rd_kafka_t *rk, + const char *str, + STACK_OF(X509) * chainp) { BIO *bio = BIO_new_mem_buf((void *)str, -1); X509 *x509; x509 = PEM_read_bio_X509(bio, NULL, rd_kafka_transport_ssl_passwd_cb, rk); - BIO_free(bio); + if (!x509) { + BIO_free(bio); + return NULL; + } + if (rd_kafka_ssl_read_cert_chain_from_BIO( + bio, chainp, rd_kafka_transport_ssl_passwd_cb, rk) != 0) { + /* Rest of the certificate is present, + * but couldn't be read, + * returning NULL as certificate cannot be verified + * without its chain. */ + rd_kafka_log(rk, LOG_WARNING, "SSL", + "Failed to read certificate chain from PEM. " + "Returning NULL certificate too."); + X509_free(x509); + BIO_free(bio); + return NULL; + } + + BIO_free(bio); return x509; } @@ -721,6 +813,7 @@ static X509 *rd_kafka_ssl_X509_from_string(rd_kafka_t *rk, const char *str) { * @brief Attempt load CA certificates from a Windows Certificate store. */ static int rd_kafka_ssl_win_load_cert_store(rd_kafka_t *rk, + const char *ctx_identifier, SSL_CTX *ctx, const char *store_name) { HCERTSTORE w_store; @@ -735,15 +828,16 @@ static int rd_kafka_ssl_win_load_cert_store(rd_kafka_t *rk, /* Convert store_name to wide-char */ werr = mbstowcs_s(&wsize, NULL, 0, store_name, strlen(store_name)); if (werr || wsize < 2 || wsize > 1000) { - rd_kafka_log(rk, LOG_ERR, "CERTSTORE", - "Invalid Windows certificate store name: %.*s%s", - 30, store_name, - wsize < 2 ? " (empty)" : " (truncated)"); + rd_kafka_log( + rk, LOG_ERR, "CERTSTORE", + "%s: Invalid Windows certificate store name: %.*s%s", + ctx_identifier, 30, store_name, + wsize < 2 ? " (empty)" : " (truncated)"); return -1; } wstore_name = rd_alloca(sizeof(*wstore_name) * wsize); werr = mbstowcs_s(NULL, wstore_name, wsize, store_name, - strlen(store_name)); + strlen(store_name)); rd_assert(!werr); w_store = CertOpenStore(CERT_STORE_PROV_SYSTEM, 0, 0, @@ -754,9 +848,9 @@ static int rd_kafka_ssl_win_load_cert_store(rd_kafka_t *rk, if (!w_store) { rd_kafka_log( rk, LOG_ERR, "CERTSTORE", - "Failed to open Windows certificate " + "%s: Failed to open Windows certificate " "%s store: %s", - store_name, + ctx_identifier, store_name, rd_strerror_w32(GetLastError(), errstr, sizeof(errstr))); return -1; } @@ -792,9 +886,9 @@ static int rd_kafka_ssl_win_load_cert_store(rd_kafka_t *rk, CertCloseStore(w_store, 0); rd_kafka_dbg(rk, SECURITY, "CERTSTORE", - "%d certificate(s) successfully added from " + "%s: %d certificate(s) successfully added from " "Windows Certificate %s store, %d failed", - cnt, store_name, fail_cnt); + ctx_identifier, cnt, store_name, fail_cnt); if (cnt == 0 && fail_cnt > 0) return -1; @@ -807,9 +901,10 @@ static int rd_kafka_ssl_win_load_cert_store(rd_kafka_t *rk, * * @returns the number of successfully loaded certificates, or -1 on error. 
*/ -static int rd_kafka_ssl_win_load_cert_stores(rd_kafka_t *rk, - SSL_CTX *ctx, - const char *store_names) { +int rd_kafka_ssl_win_load_cert_stores(rd_kafka_t *rk, + const char *ctx_identifier, + SSL_CTX *ctx, + const char *store_names) { char *s; int cert_cnt = 0, fail_cnt = 0; @@ -843,7 +938,8 @@ static int rd_kafka_ssl_win_load_cert_stores(rd_kafka_t *rk, s = ""; } - r = rd_kafka_ssl_win_load_cert_store(rk, ctx, store_name); + r = rd_kafka_ssl_win_load_cert_store(rk, ctx_identifier, ctx, + store_name); if (r != -1) cert_cnt += r; else @@ -857,7 +953,32 @@ static int rd_kafka_ssl_win_load_cert_stores(rd_kafka_t *rk, } #endif /* MSC_VER */ +/** + * @brief Probe for a single \p path and if found and not an empty directory, + * set it on the \p ctx. + * + * @returns 0 if CA location was set with an error, 1 if it was set correctly, + * -1 if path should be skipped. + */ +static int rd_kafka_ssl_set_ca_path(rd_kafka_t *rk, + const char *ctx_identifier, + const char *path, + SSL_CTX *ctx, + rd_bool_t *is_dir) { + if (!rd_file_stat(path, is_dir)) + return -1; + if (*is_dir && rd_kafka_dir_is_empty(path)) + return -1; + + rd_kafka_dbg(rk, SECURITY, "CACERTS", + "Setting default CA certificate location for %s " + "to \"%s\"", + ctx_identifier, path); + + return SSL_CTX_load_verify_locations(ctx, *is_dir ? NULL : path, + *is_dir ? path : NULL); +} /** * @brief Probe for the system's CA certificate location and if found set it @@ -865,8 +986,9 @@ static int rd_kafka_ssl_win_load_cert_stores(rd_kafka_t *rk, * * @returns 0 if CA location was set, else -1. */ -static int rd_kafka_ssl_probe_and_set_default_ca_location(rd_kafka_t *rk, - SSL_CTX *ctx) { +int rd_kafka_ssl_probe_and_set_default_ca_location(rd_kafka_t *rk, + const char *ctx_identifier, + SSL_CTX *ctx) { #if _WIN32 /* No standard location on Windows, CA certs are in the ROOT store. */ return -1; @@ -918,34 +1040,21 @@ static int rd_kafka_ssl_probe_and_set_default_ca_location(rd_kafka_t *rk, int i; for (i = 0; (path = paths[i]); i++) { - struct stat st; rd_bool_t is_dir; - int r; - - if (stat(path, &st) != 0) + int r = rd_kafka_ssl_set_ca_path(rk, ctx_identifier, path, ctx, + &is_dir); + if (r == -1) continue; - is_dir = S_ISDIR(st.st_mode); - - if (is_dir && rd_kafka_dir_is_empty(path)) - continue; - - rd_kafka_dbg(rk, SECURITY, "CACERTS", - "Setting default CA certificate location " - "to %s, override with ssl.ca.location", - path); - - r = SSL_CTX_load_verify_locations(ctx, is_dir ? NULL : path, - is_dir ? path : NULL); if (r != 1) { char errstr[512]; /* Read error and clear the error stack */ rd_kafka_ssl_error(rk, NULL, errstr, sizeof(errstr)); rd_kafka_dbg(rk, SECURITY, "CACERTS", "Failed to set default CA certificate " - "location to %s %s: %s: skipping", + "location to %s %s for %s: %s: skipping", is_dir ? "directory" : "file", path, - errstr); + ctx_identifier, errstr); continue; } @@ -954,11 +1063,108 @@ static int rd_kafka_ssl_probe_and_set_default_ca_location(rd_kafka_t *rk, rd_kafka_dbg(rk, SECURITY, "CACERTS", "Unable to find any standard CA certificate" - "paths: is the ca-certificates package installed?"); + "paths for %s: is the ca-certificates package installed?", + ctx_identifier); return -1; #endif } +/** + * @brief Simple utility function to check if \p ca DN is matching + * any of the DNs in the \p ca_dns stack. 
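+ *
+ * For example, given a \p ca_dns stack containing "CN=Intermediate CA" and
+ * "CN=Root CA" (hypothetical names), a \p ca whose issuer DN is
+ * "CN=Intermediate CA" matches: X509_NAME_cmp() returns 0 on equal DNs.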
+ */ +static int rd_kafka_ssl_cert_issuer_match(STACK_OF(X509_NAME) * ca_dns, + X509 *ca) { + X509_NAME *issuer_dn = X509_get_issuer_name(ca); + X509_NAME *dn; + int i; + + for (i = 0; i < sk_X509_NAME_num(ca_dns); i++) { + dn = sk_X509_NAME_value(ca_dns, i); + if (0 == X509_NAME_cmp(dn, issuer_dn)) { + /* match found */ + return 1; + } + } + return 0; +} + +/** + * @brief callback function for SSL_CTX_set_cert_cb, see + * https://docs.openssl.org/master/man3/SSL_CTX_set_cert_cb for details + * of the callback function requirements. + * + * According to section 4.2.4 of RFC 8446: + * The "certificate_authorities" extension is used to indicate the + * certificate authorities (CAs) which an endpoint supports and which + * SHOULD be used by the receiving endpoint to guide certificate + * selection. + * + * We avoid sending a client certificate if the issuer doesn't match any DN + * of server trusted certificate authorities (SSL_get_client_CA_list). + * This is done to avoid sending a client certificate that would almost + * certainly be rejected by the peer and would avoid successful + * SASL_SSL authentication on the same connection in case + * `ssl.client.auth=requested`. + */ +static int rd_kafka_ssl_cert_callback(SSL *ssl, void *arg) { + rd_kafka_t *rk = arg; + STACK_OF(X509_NAME) * ca_list; + STACK_OF(X509) *certs = NULL; + X509 *cert; + int i; + + /* Get client cert from SSL connection */ + cert = SSL_get_certificate(ssl); + if (cert == NULL) { + /* If there's no client certificate, + * skip certificate issuer verification and + * avoid logging a warning. */ + return 1; + } + + /* Get the accepted client CA list from the SSL connection, this + * comes from the `certificate_authorities` field. */ + ca_list = SSL_get_client_CA_list(ssl); + if (sk_X509_NAME_num(ca_list) < 1) { + /* `certificate_authorities` is supported either + * in CertificateRequest (SSL <= 3, TLS <= 1.2) + * or as an extension (TLS >= 1.3). This should be always + * available, but in case it isn't, just send the certificate + * and let the server validate it. */ + return 1; + } + + if (rd_kafka_ssl_cert_issuer_match(ca_list, cert)) { + /* A match is found, use the certificate. */ + return 1; + } + + /* Get client cert chain from SSL connection */ + SSL_get0_chain_certs(ssl, &certs); + + if (certs) { + /* Check if there's a match in the CA list for + * each cert in the chain. */ + for (i = 0; i < sk_X509_num(certs); i++) { + cert = sk_X509_value(certs, i); + if (rd_kafka_ssl_cert_issuer_match(ca_list, cert)) { + /* A match is found, use the certificate. */ + return 1; + } + } + } + + /* No match is found, which means they would almost certainly be + * rejected by the peer. + * We decide to send no certificates. */ + rd_kafka_log(rk, LOG_WARNING, "SSL", + "No matching issuer found in " + "server trusted certificate authorities, " + "not sending any client certificates"); + SSL_certs_clear(ssl); + return 1; +} /** * @brief Registers certificates, keys, etc, on the SSL_CTX @@ -1078,7 +1284,7 @@ static int rd_kafka_ssl_set_certs(rd_kafka_t *rk, /* Attempt to load CA root certificates from the * configured Windows certificate stores. */ r = rd_kafka_ssl_win_load_cert_stores( - rk, ctx, rk->rk_conf.ssl.ca_cert_stores); + rk, "kafka", ctx, rk->rk_conf.ssl.ca_cert_stores); if (r == 0) { rd_kafka_log( rk, LOG_NOTICE, "CERTSTORE", @@ -1109,8 +1315,8 @@ static int rd_kafka_ssl_set_certs(rd_kafka_t *rk, * of standard CA certificate paths and use the * first one that is found. * Ignore failures. 
*/ - r = rd_kafka_ssl_probe_and_set_default_ca_location(rk, - ctx); + r = rd_kafka_ssl_probe_and_set_default_ca_location( + rk, "kafka", ctx); } if (r == -1) { @@ -1167,6 +1373,20 @@ static int rd_kafka_ssl_set_certs(rd_kafka_t *rk, rd_snprintf(errstr, errstr_size, "ssl_cert failed: "); return -1; } + + if (rk->rk_conf.ssl.cert->chain) { + r = SSL_CTX_set0_chain(ctx, + rk->rk_conf.ssl.cert->chain); + if (r != 1) { + rd_snprintf(errstr, errstr_size, + "ssl_cert failed: " + "setting certificate chain: "); + return -1; + } else { + /* The chain is now owned by the CTX */ + rk->rk_conf.ssl.cert->chain = NULL; + } + } } if (rk->rk_conf.ssl.cert_location) { @@ -1186,16 +1406,21 @@ static int rd_kafka_ssl_set_certs(rd_kafka_t *rk, if (rk->rk_conf.ssl.cert_pem) { X509 *x509; + STACK_OF(X509) *ca = sk_X509_new_null(); + if (!ca) { + rd_assert(!*"sk_X509_new_null() allocation failed"); + } rd_kafka_dbg(rk, SECURITY, "SSL", "Loading public key from string"); - x509 = - rd_kafka_ssl_X509_from_string(rk, rk->rk_conf.ssl.cert_pem); + x509 = rd_kafka_ssl_X509_from_string( + rk, rk->rk_conf.ssl.cert_pem, ca); if (!x509) { rd_snprintf(errstr, errstr_size, "ssl.certificate.pem failed: " "not in PEM format?: "); + sk_X509_pop_free(ca, X509_free); return -1; } @@ -1205,11 +1430,25 @@ static int rd_kafka_ssl_set_certs(rd_kafka_t *rk, if (r != 1) { rd_snprintf(errstr, errstr_size, - "ssl.certificate.pem failed: "); + "ssl.certificate.pem failed: " + "setting main certificate: "); + sk_X509_pop_free(ca, X509_free); return -1; } - } + if (sk_X509_num(ca) == 0) { + sk_X509_pop_free(ca, X509_free); + } else { + r = SSL_CTX_set0_chain(ctx, ca); + if (r != 1) { + rd_snprintf(errstr, errstr_size, + "ssl.certificate.pem failed: " + "setting certificate chain: "); + sk_X509_pop_free(ca, X509_free); + return -1; + } + } + } /* * ssl_key, ssl.key.location and ssl.key.pem @@ -1282,8 +1521,8 @@ static int rd_kafka_ssl_set_certs(rd_kafka_t *rk, * ssl.keystore.location */ if (rk->rk_conf.ssl.keystore_location) { - EVP_PKEY *pkey; - X509 *cert; + EVP_PKEY *pkey = NULL; + X509 *cert = NULL; STACK_OF(X509) *ca = NULL; BIO *bio; PKCS12 *p12; @@ -1311,8 +1550,6 @@ static int rd_kafka_ssl_set_certs(rd_kafka_t *rk, return -1; } - pkey = EVP_PKEY_new(); - cert = X509_new(); if (!PKCS12_parse(p12, rk->rk_conf.ssl.keystore_password, &pkey, &cert, &ca)) { EVP_PKEY_free(pkey); @@ -1328,28 +1565,17 @@ static int rd_kafka_ssl_set_certs(rd_kafka_t *rk, return -1; } - if (ca != NULL) - sk_X509_pop_free(ca, X509_free); - PKCS12_free(p12); BIO_free(bio); - r = SSL_CTX_use_certificate(ctx, cert); - X509_free(cert); - if (r != 1) { - EVP_PKEY_free(pkey); - rd_snprintf(errstr, errstr_size, - "Failed to use ssl.keystore.location " - "certificate: "); - return -1; - } - - r = SSL_CTX_use_PrivateKey(ctx, pkey); - EVP_PKEY_free(pkey); + r = SSL_CTX_use_cert_and_key(ctx, cert, pkey, ca, 1); + RD_IF_FREE(cert, X509_free); + RD_IF_FREE(pkey, EVP_PKEY_free); + if (ca != NULL) + sk_X509_pop_free(ca, X509_free); if (r != 1) { rd_snprintf(errstr, errstr_size, - "Failed to use ssl.keystore.location " - "private key: "); + "Failed to use ssl.keystore.location: "); return -1; } @@ -1434,6 +1660,10 @@ static int rd_kafka_ssl_set_certs(rd_kafka_t *rk, return -1; } + /* Set client certificate callback to control the behaviour + * of client certificate selection TLS handshake. 
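+         * The callback runs during the handshake, once the peer's
+         * certificate_authorities list (RFC 8446 section 4.2.4) is known,
+         * and clears the client certificates when no issuer matches.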
*/ + SSL_CTX_set_cert_cb(ctx, rd_kafka_ssl_cert_callback, rk); + return 0; } @@ -1568,8 +1798,8 @@ static rd_bool_t rd_kafka_ssl_ctx_load_providers(rd_kafka_t *rk, OSSL_PROVIDER *prov; const char *buildinfo = NULL; OSSL_PARAM request[] = {{"buildinfo", OSSL_PARAM_UTF8_PTR, - (void *)&buildinfo, 0, 0}, - {NULL, 0, NULL, 0, 0}}; + (void *)&buildinfo, 0, 0}, + {NULL, 0, NULL, 0, 0}}; prov = OSSL_PROVIDER_load(NULL, provider); if (!prov) { @@ -1722,6 +1952,14 @@ int rd_kafka_ssl_ctx_init(rd_kafka_t *rk, char *errstr, size_t errstr_size) { goto fail; +#ifdef SSL_OP_IGNORE_UNEXPECTED_EOF + /* Ignore unexpected EOF error in OpenSSL 3.x, treating + * it like a normal connection close even if + * close_notify wasn't received. + * see issue #4293 */ + SSL_CTX_set_options(ctx, SSL_OP_IGNORE_UNEXPECTED_EOF); +#endif + SSL_CTX_set_mode(ctx, SSL_MODE_ENABLE_PARTIAL_WRITE); rk->rk_conf.ssl.ctx = ctx; @@ -1811,7 +2049,7 @@ void rd_kafka_ssl_init(void) { if (!CRYPTO_get_locking_callback()) { rd_kafka_ssl_locks_cnt = CRYPTO_num_locks(); rd_kafka_ssl_locks = rd_malloc(rd_kafka_ssl_locks_cnt * - sizeof(*rd_kafka_ssl_locks)); + sizeof(*rd_kafka_ssl_locks)); for (i = 0; i < rd_kafka_ssl_locks_cnt; i++) mtx_init(&rd_kafka_ssl_locks[i], mtx_plain); @@ -1839,3 +2077,56 @@ void rd_kafka_ssl_init(void) { OpenSSL_add_all_algorithms(); #endif } + +int rd_kafka_ssl_hmac(rd_kafka_broker_t *rkb, + const EVP_MD *evp, + const rd_chariov_t *in, + const rd_chariov_t *salt, + int itcnt, + rd_chariov_t *out) { + unsigned int ressize = 0; + unsigned char tempres[EVP_MAX_MD_SIZE]; + unsigned char *saltplus; + int i; + + /* U1 := HMAC(str, salt + INT(1)) */ + saltplus = rd_alloca(salt->size + 4); + memcpy(saltplus, salt->ptr, salt->size); + saltplus[salt->size] = 0; + saltplus[salt->size + 1] = 0; + saltplus[salt->size + 2] = 0; + saltplus[salt->size + 3] = 1; + + /* U1 := HMAC(str, salt + INT(1)) */ + if (!HMAC(evp, (const unsigned char *)in->ptr, (int)in->size, saltplus, + salt->size + 4, tempres, &ressize)) { + rd_rkb_dbg(rkb, SECURITY, "SSLHMAC", "HMAC priming failed"); + return -1; + } + + memcpy(out->ptr, tempres, ressize); + + /* Ui-1 := HMAC(str, Ui-2) .. */ + for (i = 1; i < itcnt; i++) { + unsigned char tempdest[EVP_MAX_MD_SIZE]; + int j; + + if (unlikely(!HMAC(evp, (const unsigned char *)in->ptr, + (int)in->size, tempres, ressize, tempdest, + NULL))) { + rd_rkb_dbg(rkb, SECURITY, "SSLHMAC", + "Hi() HMAC #%d/%d failed", i, itcnt); + return -1; + } + + /* U1 XOR U2 .. */ + for (j = 0; j < (int)ressize; j++) { + out->ptr[j] ^= tempdest[j]; + tempres[j] = tempdest[j]; + } + } + + out->size = ressize; + + return 0; +} diff --git a/src/third_party/librdkafka/dist/src/rdkafka_ssl.h b/src/third_party/librdkafka/dist/src/rdkafka_ssl.h index 325abbe1d47..6307857c1d6 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_ssl.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_ssl.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -54,4 +54,33 @@ void rd_kafka_ssl_init(void); const char *rd_kafka_ssl_last_error_str(void); +int rd_kafka_ssl_hmac(rd_kafka_broker_t *rkb, + const EVP_MD *evp, + const rd_chariov_t *in, + const rd_chariov_t *salt, + int itcnt, + rd_chariov_t *out); + +int rd_kafka_ssl_read_cert_chain_from_BIO(BIO *in, + STACK_OF(X509) * chainp, + pem_password_cb *password_cb, + void *password_cb_opaque); + +int rd_kafka_ssl_probe_and_set_default_ca_location(rd_kafka_t *rk, + const char *ctx_identifier, + SSL_CTX *ctx); + +char *rd_kafka_ssl_error0(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + const char *ctx_identifier, + char *errstr, + size_t errstr_size); + +#ifdef _WIN32 +int rd_kafka_ssl_win_load_cert_stores(rd_kafka_t *rk, + const char *ctx_identifier, + SSL_CTX *ctx, + const char *store_names); +#endif + #endif /* _RDKAFKA_SSL_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_sticky_assignor.c b/src/third_party/librdkafka/dist/src/rdkafka_sticky_assignor.c index 3f5d91cf002..6141a23a7d6 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_sticky_assignor.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_sticky_assignor.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2020 Magnus Edenhill + * Copyright (c) 2020-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -160,6 +161,9 @@ typedef RD_MAP_TYPE(const rd_kafka_topic_partition_t *, typedef RD_MAP_TYPE(const rd_kafka_topic_partition_t *, rd_list_t *) map_toppar_list_t; +typedef RD_MAP_TYPE(const rd_kafka_topic_partition_t *, + rd_kafka_metadata_partition_internal_t *) map_toppar_mdpi_t; + typedef RD_MAP_TYPE(const rd_kafka_topic_partition_t *, ConsumerGenerationPair_t *) map_toppar_cgpair_t; @@ -173,6 +177,7 @@ typedef RD_MAP_TYPE(const ConsumerPair_t *, typedef RD_MAP_TYPE(const char *, map_cpair_toppar_list_t *) map_str_map_cpair_toppar_list_t; +typedef RD_MAP_TYPE(const char *, const char *) map_str_str_t; /** Glue type helpers */ @@ -193,6 +198,121 @@ static void map_cpair_toppar_list_t_free(void *ptr) { } +/** @struct Convenience struct for storing consumer/rack and toppar/rack + * mappings. */ +typedef struct { + /** A map of member_id -> rack_id pairs. */ + map_str_str_t member_id_to_rack_id; + /* A map of topic partition to rd_kafka_metadata_partition_internal_t */ + map_toppar_mdpi_t toppar_to_mdpi; +} rd_kafka_rack_info_t; + +/** + * @brief Initialize a rd_kafka_rack_info_t. + * + * @param topics + * @param topic_cnt + * @param mdi + * + * This struct is for convenience/easy grouping, and as a consequence, we avoid + * copying values. Thus, it is intended to be used within the lifetime of this + * function's arguments. + * + * @return rd_kafka_rack_info_t* + */ +static rd_kafka_rack_info_t * +rd_kafka_rack_info_new(rd_kafka_assignor_topic_t **topics, + size_t topic_cnt, + const rd_kafka_metadata_internal_t *mdi) { + int i; + size_t t; + rd_kafka_group_member_t *rkgm; + rd_kafka_rack_info_t *rkri = rd_calloc(1, sizeof(rd_kafka_rack_info_t)); + + if (!rd_kafka_use_rack_aware_assignment(topics, topic_cnt, mdi)) { + /* Free everything immediately, we aren't using rack aware + assignment, this struct is not applicable. 
*/ + rd_free(rkri); + return NULL; + } + + rkri->member_id_to_rack_id = (map_str_str_t)RD_MAP_INITIALIZER( + 0, rd_map_str_cmp, rd_map_str_hash, + NULL /* refs members.rkgm_member_id */, + NULL /* refs members.rkgm_rack_id */); + rkri->toppar_to_mdpi = (map_toppar_mdpi_t)RD_MAP_INITIALIZER( + 0, rd_kafka_topic_partition_cmp, rd_kafka_topic_partition_hash, + rd_kafka_topic_partition_destroy_free, NULL); + + for (t = 0; t < topic_cnt; t++) { + RD_LIST_FOREACH(rkgm, &topics[t]->members, i) { + RD_MAP_SET(&rkri->member_id_to_rack_id, + rkgm->rkgm_member_id->str, + rkgm->rkgm_rack_id->str); + } + + for (i = 0; i < topics[t]->metadata->partition_cnt; i++) { + rd_kafka_topic_partition_t *rkpart = + rd_kafka_topic_partition_new( + topics[t]->metadata->topic, i); + RD_MAP_SET( + &rkri->toppar_to_mdpi, rkpart, + &topics[t]->metadata_internal->partitions[i]); + } + } + + return rkri; +} + +/* Destroy a rd_kafka_rack_info_t. */ +static void rd_kafka_rack_info_destroy(rd_kafka_rack_info_t *rkri) { + if (!rkri) + return; + + RD_MAP_DESTROY(&rkri->member_id_to_rack_id); + RD_MAP_DESTROY(&rkri->toppar_to_mdpi); + + rd_free(rkri); +} + + +/* Convenience function to bsearch inside the racks of a + * rd_kafka_metadata_partition_internal_t. */ +static char *rd_kafka_partition_internal_find_rack( + rd_kafka_metadata_partition_internal_t *mdpi, + const char *rack) { + char **partition_racks = mdpi->racks; + size_t cnt = mdpi->racks_cnt; + + void *res = + bsearch(&rack, partition_racks, cnt, sizeof(char *), rd_strcmp3); + + if (res) + return *(char **)res; + return NULL; +} + + +/* Computes whether there is a rack mismatch between the rack of the consumer + * and the topic partition/any of its replicas. */ +static rd_bool_t +rd_kafka_racks_mismatch(rd_kafka_rack_info_t *rkri, + const char *consumer, + const rd_kafka_topic_partition_t *topic_partition) { + const char *consumer_rack; + rd_kafka_metadata_partition_internal_t *mdpi; + + if (rkri == NULL) /* Not using rack aware assignment */ + return rd_false; + + consumer_rack = RD_MAP_GET(&rkri->member_id_to_rack_id, consumer); + + mdpi = RD_MAP_GET(&rkri->toppar_to_mdpi, topic_partition); + + return consumer_rack != NULL && + (mdpi == NULL || + !rd_kafka_partition_internal_find_rack(mdpi, consumer_rack)); +} /** * @struct Provides current state of partition movements between consumers @@ -399,13 +519,15 @@ static int sort_by_map_elem_val_toppar_list_cnt(const void *_a, * * The assignment should improve the overall balance of the partition * assignments to consumers. + * @returns true if partition was assigned, false otherwise. 
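
/* [Editor's note] rd_kafka_partition_internal_find_rack() above leans on
 * bsearch() over the partition's sorted racks array, using a comparator over
 * (char **) elements. A self-contained sketch of the same lookup; cmp_rack()
 * is a stand-in for librdkafka's rd_strcmp3: */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int cmp_rack(const void *a, const void *b) {
    return strcmp(*(const char *const *)a, *(const char *const *)b);
}

int main(void) {
    /* Replica racks must already be sorted for bsearch() to be valid. */
    const char *racks[]       = {"rack-a", "rack-b", "rack-c"};
    const char *consumer_rack = "rack-b";

    const char **hit = bsearch(&consumer_rack, racks,
                               sizeof(racks) / sizeof(*racks),
                               sizeof(*racks), cmp_rack);
    printf(hit ? "no mismatch: a replica shares the consumer's rack\n"
               : "rack mismatch: no replica on the consumer's rack\n");
    return 0;
}
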
*/ -static void -assignPartition(const rd_kafka_topic_partition_t *partition, - rd_list_t *sortedCurrentSubscriptions /*rd_map_elem_t*/, - map_str_toppar_list_t *currentAssignment, - map_str_toppar_list_t *consumer2AllPotentialPartitions, - map_toppar_str_t *currentPartitionConsumer) { +static rd_bool_t +maybeAssignPartition(const rd_kafka_topic_partition_t *partition, + rd_list_t *sortedCurrentSubscriptions /*rd_map_elem_t*/, + map_str_toppar_list_t *currentAssignment, + map_str_toppar_list_t *consumer2AllPotentialPartitions, + map_toppar_str_t *currentPartitionConsumer, + rd_kafka_rack_info_t *rkri) { const rd_map_elem_t *elem; int i; @@ -418,6 +540,9 @@ assignPartition(const rd_kafka_topic_partition_t *partition, if (!rd_kafka_topic_partition_list_find( partitions, partition->topic, partition->partition)) continue; + if (rkri != NULL && + rd_kafka_racks_mismatch(rkri, consumer, partition)) + continue; rd_kafka_topic_partition_list_add( RD_MAP_GET(currentAssignment, consumer), partition->topic, @@ -431,8 +556,9 @@ assignPartition(const rd_kafka_topic_partition_t *partition, * This is an O(N) operation since it is a single shuffle. */ rd_list_sort(sortedCurrentSubscriptions, sort_by_map_elem_val_toppar_list_cnt); - return; + return rd_true; } + return rd_false; } /** @@ -639,14 +765,6 @@ isBalanced(rd_kafka_t *rk, ->value) ->cnt; - /* Mapping from partitions to the consumer assigned to them */ - // FIXME: don't create prior to min/max check below */ - map_toppar_str_t allPartitions = RD_MAP_INITIALIZER( - RD_MAP_CNT(partition2AllPotentialConsumers), - rd_kafka_topic_partition_cmp, rd_kafka_topic_partition_hash, - NULL /* references currentAssignment */, - NULL /* references currentAssignment */); - /* Iterators */ const rd_kafka_topic_partition_list_t *partitions; const char *consumer; @@ -661,10 +779,16 @@ isBalanced(rd_kafka_t *rk, "minimum %d and maximum %d partitions assigned " "to each consumer", minimum, maximum); - RD_MAP_DESTROY(&allPartitions); return rd_true; } + /* Mapping from partitions to the consumer assigned to them */ + map_toppar_str_t allPartitions = RD_MAP_INITIALIZER( + RD_MAP_CNT(partition2AllPotentialConsumers), + rd_kafka_topic_partition_cmp, rd_kafka_topic_partition_hash, + NULL /* references currentAssignment */, + NULL /* references currentAssignment */); + /* Create a mapping from partitions to the consumer assigned to them */ RD_MAP_FOREACH(consumer, partitions, currentAssignment) { @@ -695,6 +819,7 @@ isBalanced(rd_kafka_t *rk, * currentAssignment's element we get both the consumer * and partition list in elem here. 
*/ RD_LIST_FOREACH(elem, sortedCurrentSubscriptions, i) { + int j; const char *consumer = (const char *)elem->key; const rd_kafka_topic_partition_list_t *potentialTopicPartitions; const rd_kafka_topic_partition_list_t *consumerPartitions; @@ -712,9 +837,9 @@ isBalanced(rd_kafka_t *rk, /* Otherwise make sure it can't get any more partitions */ - for (i = 0; i < potentialTopicPartitions->cnt; i++) { + for (j = 0; j < potentialTopicPartitions->cnt; j++) { const rd_kafka_topic_partition_t *partition = - &potentialTopicPartitions->elems[i]; + &potentialTopicPartitions->elems[j]; const char *otherConsumer; int otherConsumerPartitionCount; @@ -764,7 +889,8 @@ performReassignments(rd_kafka_t *rk, rd_list_t *sortedCurrentSubscriptions /*rd_map_elem_t*/, map_str_toppar_list_t *consumer2AllPotentialPartitions, map_toppar_list_t *partition2AllPotentialConsumers, - map_toppar_str_t *currentPartitionConsumer) { + map_toppar_str_t *currentPartitionConsumer, + rd_kafka_rack_info_t *rkri) { rd_bool_t reassignmentPerformed = rd_false; rd_bool_t modified, saveIsBalanced = rd_false; int iterations = 0; @@ -796,6 +922,9 @@ performReassignments(rd_kafka_t *rk, const ConsumerGenerationPair_t *prevcgp; const rd_kafka_topic_partition_list_t *currAssignment; int j; + rd_bool_t found_rack; + const char *consumer_rack = NULL; + rd_kafka_metadata_partition_internal_t *mdpi = NULL; /* FIXME: Is this a local error/bug? If so, assert */ if (rd_list_cnt(consumers) <= 1) @@ -832,7 +961,59 @@ performReassignments(rd_kafka_t *rk, } /* Check if a better-suited consumer exists for the - * partition; if so, reassign it. */ + * partition; if so, reassign it. Use consumer within + * rack if possible. */ + if (rkri) { + consumer_rack = RD_MAP_GET( + &rkri->member_id_to_rack_id, consumer); + mdpi = RD_MAP_GET(&rkri->toppar_to_mdpi, + partition); + } + found_rack = rd_false; + + if (consumer_rack != NULL && mdpi != NULL && + mdpi->racks_cnt > 0 && + rd_kafka_partition_internal_find_rack( + mdpi, consumer_rack)) { + RD_LIST_FOREACH(otherConsumer, consumers, j) { + /* No need for rkri == NULL check, that + * is guaranteed if we're inside this if + * block. 
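
/* [Editor's note] The isBalanced() hunk above fixes a reused loop index: the
 * inner scan over potentialTopicPartitions previously reused the outer
 * RD_LIST_FOREACH variable i, so the outer loop could terminate early. A
 * minimal reproduction of that bug class: */
#include <stdio.h>

int main(void) {
    int outer_iterations = 0;
    int i;

    for (i = 0; i < 3; i++) {
        outer_iterations++;
        /* BUG: the inner loop clobbers the outer index; the fixed code
         * above introduces a separate j for the inner loop. */
        for (i = 0; i < 2; i++)
            ;
    }
    /* Prints 1: after the inner loop i == 2, i++ makes it 3, and the
     * outer loop exits after a single pass instead of three. */
    printf("outer body ran %d time(s), expected 3\n", outer_iterations);
    return 0;
}
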
*/ + const char *other_consumer_rack = + RD_MAP_GET( + &rkri->member_id_to_rack_id, + otherConsumer); + + if (other_consumer_rack == NULL || + !rd_kafka_partition_internal_find_rack( + mdpi, other_consumer_rack)) + continue; + + if (currAssignment->cnt <= + RD_MAP_GET(currentAssignment, + otherConsumer) + ->cnt + + 1) + continue; + + reassignPartition( + rk, partitionMovements, partition, + currentAssignment, + sortedCurrentSubscriptions, + currentPartitionConsumer, + consumer2AllPotentialPartitions); + + reassignmentPerformed = rd_true; + modified = rd_true; + found_rack = rd_true; + break; + } + } + + if (found_rack) { + continue; + } + RD_LIST_FOREACH(otherConsumer, consumers, j) { if (consumer == otherConsumer) continue; @@ -911,7 +1092,43 @@ static int getBalanceScore(map_str_toppar_list_t *assignment) { return score; } +static void maybeAssign(rd_kafka_topic_partition_list_t *unassignedPartitions, + map_toppar_list_t *partition2AllPotentialConsumers, + rd_list_t *sortedCurrentSubscriptions /*rd_map_elem_t*/, + map_str_toppar_list_t *currentAssignment, + map_str_toppar_list_t *consumer2AllPotentialPartitions, + map_toppar_str_t *currentPartitionConsumer, + rd_bool_t removeAssigned, + rd_kafka_rack_info_t *rkri) { + int i; + const rd_kafka_topic_partition_t *partition; + for (i = 0; i < unassignedPartitions->cnt; i++) { + partition = &unassignedPartitions->elems[i]; + rd_bool_t assigned; + + /* Skip if there is no potential consumer for the partition. + * FIXME: How could this be? */ + if (rd_list_empty(RD_MAP_GET(partition2AllPotentialConsumers, + partition))) { + rd_dassert(!*"sticky assignor bug"); + continue; + } + + assigned = maybeAssignPartition( + partition, sortedCurrentSubscriptions, currentAssignment, + consumer2AllPotentialPartitions, currentPartitionConsumer, + rkri); + if (assigned && removeAssigned) { + rd_kafka_topic_partition_list_del_by_idx( + unassignedPartitions, i); + i--; /* Since the current element was + * removed we need the next for + * loop iteration to stay at the + * same index. */ + } + } +} /** * @brief Balance the current assignment using the data structures @@ -926,7 +1143,8 @@ static void balance(rd_kafka_t *rk, map_str_toppar_list_t *consumer2AllPotentialPartitions, map_toppar_list_t *partition2AllPotentialConsumers, map_toppar_str_t *currentPartitionConsumer, - rd_bool_t revocationRequired) { + rd_bool_t revocationRequired, + rd_kafka_rack_info_t *rkri) { /* If the consumer with most assignments (thus the last element * in the ascendingly ordered sortedCurrentSubscriptions list) has @@ -964,23 +1182,34 @@ static void balance(rd_kafka_t *rk, const void *ignore; const rd_map_elem_t *elem; int i; + rd_kafka_topic_partition_list_t *leftoverUnassignedPartitions; + rd_bool_t leftoverUnassignedPartitions_allocated = rd_false; - /* Assign all unassigned partitions */ - for (i = 0; i < unassignedPartitions->cnt; i++) { - partition = &unassignedPartitions->elems[i]; + leftoverUnassignedPartitions = + unassignedPartitions; /* copy on write. */ - /* Skip if there is no potential consumer for the partition. - * FIXME: How could this be? 
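
/* [Editor's note] maybeAssign() above deletes a just-assigned partition from
 * unassignedPartitions by index and then decrements i so the elements
 * shifted left are not skipped. The idiom in isolation, over a plain array: */
#include <stdio.h>

int main(void) {
    int elems[] = {1, 2, 2, 3, 2, 4};
    int cnt = 6, i;

    for (i = 0; i < cnt; i++) {
        if (elems[i] == 2) { /* "assigned": remove it */
            int j;
            for (j = i; j < cnt - 1; j++)
                elems[j] = elems[j + 1]; /* shift the tail left */
            cnt--;
            i--; /* revisit this index: a new element now lives here */
        }
    }

    for (i = 0; i < cnt; i++)
        printf("%d ", elems[i]); /* prints: 1 3 4 */
    printf("\n");
    return 0;
}
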
*/ - if (rd_list_empty(RD_MAP_GET(partition2AllPotentialConsumers, - partition))) { - rd_dassert(!*"sticky assignor bug"); - continue; - } - - assignPartition( - partition, sortedCurrentSubscriptions, currentAssignment, - consumer2AllPotentialPartitions, currentPartitionConsumer); + if (rkri != NULL && RD_MAP_CNT(&rkri->member_id_to_rack_id) != 0) { + leftoverUnassignedPartitions_allocated = rd_true; + /* Since maybeAssign is called twice, we keep track of those + * partitions which the first call has taken care of already, + * but we don't want to modify the original + * unassignedPartitions. */ + leftoverUnassignedPartitions = + rd_kafka_topic_partition_list_copy(unassignedPartitions); + maybeAssign(leftoverUnassignedPartitions, + partition2AllPotentialConsumers, + sortedCurrentSubscriptions, currentAssignment, + consumer2AllPotentialPartitions, + currentPartitionConsumer, rd_true, rkri); } + maybeAssign(leftoverUnassignedPartitions, + partition2AllPotentialConsumers, sortedCurrentSubscriptions, + currentAssignment, consumer2AllPotentialPartitions, + currentPartitionConsumer, rd_false, NULL); + + if (leftoverUnassignedPartitions_allocated) + rd_kafka_topic_partition_list_destroy( + leftoverUnassignedPartitions); /* Narrow down the reassignment scope to only those partitions that can @@ -1050,17 +1279,18 @@ static void balance(rd_kafka_t *rk, * changes, first try to balance by only moving newly added partitions. */ if (!revocationRequired && unassignedPartitions->cnt > 0) - performReassignments( - rk, partitionMovements, unassignedPartitions, - currentAssignment, prevAssignment, - sortedCurrentSubscriptions, consumer2AllPotentialPartitions, - partition2AllPotentialConsumers, currentPartitionConsumer); + performReassignments(rk, partitionMovements, + unassignedPartitions, currentAssignment, + prevAssignment, sortedCurrentSubscriptions, + consumer2AllPotentialPartitions, + partition2AllPotentialConsumers, + currentPartitionConsumer, rkri); reassignmentPerformed = performReassignments( rk, partitionMovements, sortedPartitions, currentAssignment, prevAssignment, sortedCurrentSubscriptions, consumer2AllPotentialPartitions, partition2AllPotentialConsumers, - currentPartitionConsumer); + currentPartitionConsumer, rkri); /* If we are not preserving existing assignments and we have made * changes to the current assignment make sure we are getting a more @@ -1180,24 +1410,6 @@ static void prepopulateCurrentAssignments( &sortedPartitionConsumersByGeneration, partition, rd_list_new(10, ConsumerGenerationPair_destroy)); - if (consumer->rkgm_generation != -1 && - rd_list_find( - consumers, &consumer->rkgm_generation, - ConsumerGenerationPair_cmp_generation)) { - rd_kafka_log( - rk, LOG_WARNING, "STICKY", - "Sticky assignor: " - "%s [%" PRId32 - "] is assigned to " - "multiple consumers with same " - "generation %d: " - "skipping member %.*s", - partition->topic, partition->partition, - consumer->rkgm_generation, - RD_KAFKAP_STR_PR(consumer->rkgm_member_id)); - continue; - } - rd_list_add(consumers, ConsumerGenerationPair_new( consumer->rkgm_member_id->str, @@ -1215,24 +1427,55 @@ static void prepopulateCurrentAssignments( RD_MAP_FOREACH(partition, consumers, &sortedPartitionConsumersByGeneration) { /* current and previous are the last two consumers - * of each partition. */ - ConsumerGenerationPair_t *current, *previous; + * of each partition, and found is used to check for duplicate + * consumers of same generation. 
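
/* [Editor's note] balance() above now assigns in two passes: a
 * rack-constrained pass over a copy of unassignedPartitions (removing
 * whatever it places), then an unconstrained pass for the leftovers. A toy
 * model of that shape; all data here is invented and no librdkafka API is
 * used: */
#include <stdio.h>
#include <string.h>

#define NPART 4
#define NCONS 2

int main(void) {
    const char *part_rack[NPART] = {"A", "B", "A", NULL}; /* NULL: no info */
    const char *cons_rack[NCONS] = {"A", "B"};
    int owner[NPART] = {-1, -1, -1, -1};
    int load[NCONS]  = {0, 0};
    int p, c;

    /* Pass 1: only place a partition on a consumer in a replica rack. */
    for (p = 0; p < NPART; p++)
        for (c = 0; c < NCONS; c++)
            if (part_rack[p] && !strcmp(part_rack[p], cons_rack[c])) {
                owner[p] = c;
                load[c]++;
                break;
            }

    /* Pass 2: leftovers go to the least-loaded consumer, rack ignored. */
    for (p = 0; p < NPART; p++) {
        int best = 0;
        if (owner[p] != -1)
            continue;
        for (c = 1; c < NCONS; c++)
            if (load[c] < load[best])
                best = c;
        owner[p] = best;
        load[best]++;
    }

    for (p = 0; p < NPART; p++)
        printf("partition %d -> consumer %d\n", p, owner[p]);
    return 0;
}
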
*/ + ConsumerGenerationPair_t *current, *previous, *found; rd_kafka_topic_partition_list_t *partitions; /* Sort the per-partition consumers list by generation */ rd_list_sort(consumers, ConsumerGenerationPair_cmp_generation); + /* In case a partition is claimed by multiple consumers with the + * same generation, invalidate it for all such consumers, and + * log an error for this situation. */ + if ((found = rd_list_find_duplicate( + consumers, ConsumerGenerationPair_cmp_generation))) { + const char *consumer1, *consumer2; + int idx = rd_list_index( + consumers, found, + ConsumerGenerationPair_cmp_generation); + consumer1 = ((ConsumerGenerationPair_t *)rd_list_elem( + consumers, idx)) + ->consumer; + consumer2 = ((ConsumerGenerationPair_t *)rd_list_elem( + consumers, idx + 1)) + ->consumer; + + RD_MAP_DELETE(currentPartitionConsumer, partition); + + rd_kafka_log( + rk, LOG_ERR, "STICKY", + "Sticky assignor: Found multiple consumers %s and " + "%s claiming the same topic partition %s:%d in the " + "same generation %d, this will be invalidated and " + "removed from their previous assignment.", + consumer1, consumer2, partition->topic, + partition->partition, found->generation); + continue; + } + /* Add current (highest generation) consumer * to currentAssignment. */ - current = rd_list_elem(consumers, 0); + current = rd_list_last(consumers); partitions = RD_MAP_GET(currentAssignment, current->consumer); rd_kafka_topic_partition_list_add(partitions, partition->topic, partition->partition); /* Add previous (next highest generation) consumer, if any, * to prevAssignment. */ - previous = rd_list_elem(consumers, 1); - if (previous) + if (rd_list_cnt(consumers) >= 2 && + (previous = + rd_list_elem(consumers, rd_list_cnt(consumers) - 2))) RD_MAP_SET( prevAssignment, rd_kafka_topic_partition_copy(partition), @@ -1590,6 +1833,11 @@ rd_kafka_sticky_assignor_assign_cb(rd_kafka_t *rk, void *opaque) { /* FIXME: Let the cgrp pass the actual eligible partition count */ size_t partition_cnt = member_cnt * 10; /* FIXME */ + const rd_kafka_metadata_internal_t *mdi = + rd_kafka_metadata_get_internal(metadata); + + rd_kafka_rack_info_t *rkri = + rd_kafka_rack_info_new(eligible_topics, eligible_topic_cnt, mdi); /* Map of subscriptions. This is \p member turned into a map. */ map_str_toppar_list_t subscriptions = @@ -1680,6 +1928,10 @@ rd_kafka_sticky_assignor_assign_cb(rd_kafka_t *rk, unassignedPartitions = rd_kafka_topic_partition_list_copy(sortedPartitions); + if (rkri) + rd_kafka_dbg(rk, CGRP, "STICKY", + "Sticky assignor: using rack aware assignment."); + RD_MAP_FOREACH(consumer, partitions, ¤tAssignment) { if (!RD_MAP_GET(&subscriptions, consumer)) { /* If a consumer that existed before @@ -1726,13 +1978,16 @@ rd_kafka_sticky_assignor_assign_cb(rd_kafka_t *rk, RD_MAP_GET(&subscriptions, consumer), partition->topic, - RD_KAFKA_PARTITION_UA)) { + RD_KAFKA_PARTITION_UA) || + rd_kafka_racks_mismatch( + rkri, consumer, partition)) { /* If this partition cannot remain * assigned to its current consumer * because the consumer is no longer - * subscribed to its topic, remove it - * from the currentAssignment of the - * consumer. */ + * subscribed to its topic, or racks + * don't match for rack-aware + * assignment, remove it from the + * currentAssignment of the consumer. 
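
/* [Editor's note] prepopulateCurrentAssignments() above sorts each
 * partition's claimants by generation and then looks for a duplicate, which
 * after sorting must occupy adjacent slots; rd_list_find_duplicate()
 * presumably exploits exactly that. The adjacency scan in isolation: */
#include <stdio.h>
#include <stdlib.h>

static int cmp_gen(const void *a, const void *b) {
    return *(const int *)a - *(const int *)b;
}

int main(void) {
    int generations[] = {5, 3, 5, 1}; /* two claimants in generation 5 */
    size_t n = sizeof(generations) / sizeof(*generations), i;

    qsort(generations, n, sizeof(*generations), cmp_gen);
    for (i = 1; i < n; i++)
        if (generations[i] == generations[i - 1])
            printf("duplicate generation %d: invalidate the claim\n",
                   generations[i]);
    return 0;
}
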
*/ remove_part = rd_true; revocationRequired = rd_true; } else { @@ -1785,7 +2040,7 @@ rd_kafka_sticky_assignor_assign_cb(rd_kafka_t *rk, sortedPartitions, unassignedPartitions, &sortedCurrentSubscriptions, &consumer2AllPotentialPartitions, &partition2AllPotentialConsumers, ¤tPartitionConsumer, - revocationRequired); + revocationRequired, rkri); /* Transfer currentAssignment (now updated) to each member's * assignment. */ @@ -1798,6 +2053,7 @@ rd_kafka_sticky_assignor_assign_cb(rd_kafka_t *rk, rd_kafka_topic_partition_list_destroy(unassignedPartitions); rd_kafka_topic_partition_list_destroy(sortedPartitions); + rd_kafka_rack_info_destroy(rkri); RD_MAP_DESTROY(¤tPartitionConsumer); RD_MAP_DESTROY(&consumer2AllPotentialPartitions); @@ -1837,7 +2093,8 @@ static rd_kafkap_bytes_t *rd_kafka_sticky_assignor_get_metadata( const rd_kafka_assignor_t *rkas, void *assignor_state, const rd_list_t *topics, - const rd_kafka_topic_partition_list_t *owned_partitions) { + const rd_kafka_topic_partition_list_t *owned_partitions, + const rd_kafkap_str_t *rack_id) { rd_kafka_sticky_assignor_state_t *state; rd_kafka_buf_t *rkbuf; rd_kafkap_bytes_t *metadata; @@ -1855,19 +2112,24 @@ static rd_kafkap_bytes_t *rd_kafka_sticky_assignor_get_metadata( * If there is no previous assignment, UserData is NULL. */ + if (!assignor_state) { return rd_kafka_consumer_protocol_member_metadata_new( - topics, NULL, 0, owned_partitions); + topics, NULL, 0, owned_partitions, -1 /* generation */, + rack_id); } state = (rd_kafka_sticky_assignor_state_t *)assignor_state; rkbuf = rd_kafka_buf_new(1, 100); rd_assert(state->prev_assignment != NULL); + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; rd_kafka_buf_write_topic_partitions( rkbuf, state->prev_assignment, rd_false /*skip invalid offsets*/, - rd_false /*any offset*/, rd_false /*write offsets*/, - rd_false /*write epoch*/, rd_false /*write metadata*/); + rd_false /*any offset*/, rd_false /*don't use topic id*/, + rd_true /*use topic name*/, fields); rd_kafka_buf_write_i32(rkbuf, state->generation_id); /* Get binary buffer and allocate a new Kafka Bytes with a copy. */ @@ -1878,7 +2140,8 @@ static rd_kafkap_bytes_t *rd_kafka_sticky_assignor_get_metadata( rd_kafka_buf_destroy(rkbuf); metadata = rd_kafka_consumer_protocol_member_metadata_new( - topics, kbytes->data, kbytes->len, owned_partitions); + topics, kbytes->data, kbytes->len, owned_partitions, + state->generation_id, rack_id); rd_kafkap_bytes_destroy(kbytes); @@ -1911,296 +2174,97 @@ static void rd_kafka_sticky_assignor_state_destroy(void *assignor_state) { * */ +/* All possible racks used in tests, as well as several common rack configs used + * by consumers */ +static rd_kafkap_str_t + *ALL_RACKS[7]; /* initialized before starting the unit tests. */ +static int RACKS_INITIAL[] = {0, 1, 2}; +static int RACKS_NULL[] = {6, 6, 6}; +static int RACKS_FINAL[] = {4, 5, 6}; +static int RACKS_ONE_NULL[] = {6, 4, 5}; - -/** - * @brief Set a member's owned partitions based on its assignment. - * - * For use between assignor_run(). This is mimicing a consumer receiving - * its new assignment and including it in the next rebalance as its - * owned-partitions. 
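
/* [Editor's note] rd_kafka_sticky_assignor_get_metadata() above serializes
 * the previous assignment plus the generation id into the UserData blob via
 * librdkafka's rd_kafka_buf, with big-endian (Kafka wire format) integers.
 * A rough standalone sketch of that layout, simplified to one topic with one
 * partition; the field order and types here are illustrative, not a
 * wire-compatible encoder: */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t put_i32(unsigned char *p, int32_t v) {
    uint32_t be = htonl((uint32_t)v);
    memcpy(p, &be, sizeof(be));
    return sizeof(be);
}

static size_t put_i16(unsigned char *p, int16_t v) {
    uint16_t be = htons((uint16_t)v);
    memcpy(p, &be, sizeof(be));
    return sizeof(be);
}

int main(void) {
    unsigned char buf[64];
    size_t off         = 0;
    const char *topic  = "topic1";

    off += put_i32(buf + off, 1);                      /* topic array count */
    off += put_i16(buf + off, (int16_t)strlen(topic)); /* topic name length */
    memcpy(buf + off, topic, strlen(topic));
    off += strlen(topic);
    off += put_i32(buf + off, 1);                      /* partition count   */
    off += put_i32(buf + off, 0);                      /* partition id 0    */
    off += put_i32(buf + off, 5);                      /* generation id     */

    printf("encoded %zu bytes of UserData\n", off);
    return 0;
}
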
- */ -static void ut_set_owned(rd_kafka_group_member_t *rkgm) { - if (rkgm->rkgm_owned) - rd_kafka_topic_partition_list_destroy(rkgm->rkgm_owned); - - rkgm->rkgm_owned = - rd_kafka_topic_partition_list_copy(rkgm->rkgm_assignment); +/* Helper to get consumer rack based on the index of the consumer. */ +static rd_kafkap_str_t * +ut_get_consumer_rack(int idx, + rd_kafka_assignor_ut_rack_config_t parametrization) { + const int cycle_size = + (parametrization == RD_KAFKA_RANGE_ASSIGNOR_UT_NO_BROKER_RACK + ? RD_ARRAYSIZE(ALL_RACKS) + : 3); + return (ALL_RACKS[idx % cycle_size]); } - -/** - * @brief Verify assignment validity and balance. - * - * @remark Also updates the members owned partitions to the assignment. - */ - -static int verifyValidityAndBalance0(const char *func, - int line, - rd_kafka_group_member_t *members, - size_t member_cnt, - const rd_kafka_metadata_t *metadata) { - int fails = 0; - int i; - rd_bool_t verbose = rd_false; /* Enable for troubleshooting */ - - RD_UT_SAY("%s:%d: verifying assignment for %d member(s):", func, line, - (int)member_cnt); - - for (i = 0; i < (int)member_cnt; i++) { - const char *consumer = members[i].rkgm_member_id->str; - const rd_kafka_topic_partition_list_t *partitions = - members[i].rkgm_assignment; - int p, j; - - if (verbose) - RD_UT_SAY( - "%s:%d: " - "consumer \"%s\", %d subscribed topic(s), " - "%d assigned partition(s):", - func, line, consumer, - members[i].rkgm_subscription->cnt, partitions->cnt); - - for (p = 0; p < partitions->cnt; p++) { - const rd_kafka_topic_partition_t *partition = - &partitions->elems[p]; - - if (verbose) - RD_UT_SAY("%s:%d: %s [%" PRId32 "]", func, - line, partition->topic, - partition->partition); - - if (!rd_kafka_topic_partition_list_find( - members[i].rkgm_subscription, partition->topic, - RD_KAFKA_PARTITION_UA)) { - RD_UT_WARN("%s [%" PRId32 - "] is assigned to " - "%s but it is not subscribed to " - "that topic", - partition->topic, - partition->partition, consumer); - fails++; - } - } - - /* Update the member's owned partitions to match - * the assignment. */ - ut_set_owned(&members[i]); - - if (i == (int)member_cnt - 1) - continue; - - for (j = i + 1; j < (int)member_cnt; j++) { - const char *otherConsumer = - members[j].rkgm_member_id->str; - const rd_kafka_topic_partition_list_t *otherPartitions = - members[j].rkgm_assignment; - rd_bool_t balanced = - abs(partitions->cnt - otherPartitions->cnt) <= 1; - - for (p = 0; p < partitions->cnt; p++) { - const rd_kafka_topic_partition_t *partition = - &partitions->elems[p]; - - if (rd_kafka_topic_partition_list_find( - otherPartitions, partition->topic, - partition->partition)) { - RD_UT_WARN( - "Consumer %s and %s are both " - "assigned %s [%" PRId32 "]", - consumer, otherConsumer, - partition->topic, - partition->partition); - fails++; - } - - - /* If assignment is imbalanced and this topic - * is also subscribed by the other consumer - * it means the assignment strategy failed to - * properly balance the partitions. 
*/ - if (!balanced && - rd_kafka_topic_partition_list_find_topic( - otherPartitions, partition->topic)) { - RD_UT_WARN( - "Some %s partition(s) can be " - "moved from " - "%s (%d partition(s)) to " - "%s (%d partition(s)) to " - "achieve a better balance", - partition->topic, consumer, - partitions->cnt, otherConsumer, - otherPartitions->cnt); - fails++; - } - } - } - } - - RD_UT_ASSERT(!fails, "%s:%d: See %d previous errors", func, line, - fails); - - return 0; -} - - -#define verifyValidityAndBalance(members, member_cnt, metadata) \ - do { \ - if (verifyValidityAndBalance0(__FUNCTION__, __LINE__, members, \ - member_cnt, metadata)) \ - return 1; \ - } while (0) - - -/** - * @brief Checks that all assigned partitions are fully balanced. - * - * Only works for symmetrical subscriptions. - */ -static int isFullyBalanced0(const char *function, - int line, - const rd_kafka_group_member_t *members, - size_t member_cnt) { - int min_assignment = INT_MAX; - int max_assignment = -1; +/* Helper to populate a member's owned partitions (accepted as variadic), and + * generation. */ +static void +ut_populate_member_owned_partitions_generation(rd_kafka_group_member_t *rkgm, + int generation, + size_t partition_cnt, + ...) { + va_list ap; size_t i; - for (i = 0; i < member_cnt; i++) { - int size = members[i].rkgm_assignment->cnt; - if (size < min_assignment) - min_assignment = size; - if (size > max_assignment) - max_assignment = size; - } + if (rkgm->rkgm_owned) + rd_kafka_topic_partition_list_destroy(rkgm->rkgm_owned); + rkgm->rkgm_owned = rd_kafka_topic_partition_list_new(partition_cnt); - RD_UT_ASSERT(max_assignment - min_assignment <= 1, - "%s:%d: Assignment not balanced: min %d, max %d", function, - line, min_assignment, max_assignment); - - return 0; -} - -#define isFullyBalanced(members, member_cnt) \ - do { \ - if (isFullyBalanced0(__FUNCTION__, __LINE__, members, \ - member_cnt)) \ - return 1; \ - } while (0) - - -static void -ut_print_toppar_list(const rd_kafka_topic_partition_list_t *partitions) { - int i; - - for (i = 0; i < partitions->cnt; i++) - RD_UT_SAY(" %s [%" PRId32 "]", partitions->elems[i].topic, - partitions->elems[i].partition); -} - - - -/** - * @brief Verify that member's assignment matches the expected partitions. - * - * The va-list is a NULL-terminated list of (const char *topic, int partition) - * tuples. - * - * @returns 0 on success, else raises a unittest error and returns 1. - */ -static int verifyAssignment0(const char *function, - int line, - rd_kafka_group_member_t *rkgm, - ...) { - va_list ap; - int cnt = 0; - const char *topic; - int fails = 0; - - va_start(ap, rkgm); - while ((topic = va_arg(ap, const char *))) { + va_start(ap, partition_cnt); + for (i = 0; i < partition_cnt; i++) { + char *topic = va_arg(ap, char *); int partition = va_arg(ap, int); - cnt++; + rd_kafka_topic_partition_list_add(rkgm->rkgm_owned, topic, + partition); + } + va_end(ap); - if (!rd_kafka_topic_partition_list_find(rkgm->rkgm_assignment, - topic, partition)) { - RD_UT_WARN( - "%s:%d: Expected %s [%d] not found in %s's " - "assignment (%d partition(s))", - function, line, topic, partition, - rkgm->rkgm_member_id->str, - rkgm->rkgm_assignment->cnt); - fails++; + rkgm->rkgm_generation = generation; +} + +/* Helper to create topic partition list from a variadic list of topic, + * partition pairs. */ +static rd_kafka_topic_partition_list_t ** +ut_create_topic_partition_lists(size_t list_cnt, ...) 
{ + va_list ap; + size_t i; + rd_kafka_topic_partition_list_t **lists = + rd_calloc(list_cnt, sizeof(rd_kafka_topic_partition_list_t *)); + + va_start(ap, list_cnt); + for (i = 0; i < list_cnt; i++) { + const char *topic; + lists[i] = rd_kafka_topic_partition_list_new(0); + while ((topic = va_arg(ap, const char *))) { + int partition = va_arg(ap, int); + rd_kafka_topic_partition_list_add(lists[i], topic, + partition); } } va_end(ap); - if (cnt != rkgm->rkgm_assignment->cnt) { - RD_UT_WARN( - "%s:%d: " - "Expected %d assigned partition(s) for %s, not %d", - function, line, cnt, rkgm->rkgm_member_id->str, - rkgm->rkgm_assignment->cnt); - fails++; + return lists; +} + +static int +ut_testOneConsumerNoTopic(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[1]; + + if (parametrization == RD_KAFKA_RANGE_ASSIGNOR_UT_NO_BROKER_RACK) { + RD_UT_PASS(); } - if (fails) - ut_print_toppar_list(rkgm->rkgm_assignment); - RD_UT_ASSERT(!fails, "%s:%d: See previous errors", function, line); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 0); - return 0; -} - -#define verifyAssignment(rkgm, ...) \ - do { \ - if (verifyAssignment0(__FUNCTION__, __LINE__, rkgm, \ - __VA_ARGS__)) \ - return 1; \ - } while (0) - - - -/** - * @brief Initialize group member struct for testing. - * - * va-args is a NULL-terminated list of (const char *) topics. - * - * Use rd_kafka_group_member_clear() to free fields. - */ -static void -ut_init_member(rd_kafka_group_member_t *rkgm, const char *member_id, ...) { - va_list ap; - const char *topic; - - memset(rkgm, 0, sizeof(*rkgm)); - - rkgm->rkgm_member_id = rd_kafkap_str_new(member_id, -1); - rkgm->rkgm_group_instance_id = rd_kafkap_str_new(member_id, -1); - rd_list_init(&rkgm->rkgm_eligible, 0, NULL); - - rkgm->rkgm_subscription = rd_kafka_topic_partition_list_new(4); - - va_start(ap, member_id); - while ((topic = va_arg(ap, const char *))) - rd_kafka_topic_partition_list_add(rkgm->rkgm_subscription, - topic, RD_KAFKA_PARTITION_UA); - va_end(ap); - - rkgm->rkgm_assignment = - rd_kafka_topic_partition_list_new(rkgm->rkgm_subscription->size); -} - - - -static int ut_testOneConsumerNoTopic(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { - rd_kafka_resp_err_t err; - char errstr[512]; - rd_kafka_metadata_t *metadata; - rd_kafka_group_member_t members[1]; - - metadata = rd_kafka_metadata_new_topic_mock(NULL, 0); - ut_init_member(&members[0], "consumer1", "topic1", NULL); + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2212,21 +2276,32 @@ static int ut_testOneConsumerNoTopic(rd_kafka_t *rk, isFullyBalanced(members, RD_ARRAYSIZE(members)); rd_kafka_group_member_clear(&members[0]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int ut_testOneConsumerNonexistentTopic(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testOneConsumerNonexistentTopic( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[1]; - metadata = 
rd_kafka_metadata_new_topic_mockv(1, "topic1", 0); - ut_init_member(&members[0], "consumer1", "topic1", NULL); + if (parametrization == RD_KAFKA_RANGE_ASSIGNOR_UT_NO_BROKER_RACK) { + RD_UT_PASS(); + } + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 0); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2238,22 +2313,29 @@ static int ut_testOneConsumerNonexistentTopic(rd_kafka_t *rk, isFullyBalanced(members, RD_ARRAYSIZE(members)); rd_kafka_group_member_clear(&members[0]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int ut_testOneConsumerOneTopic(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int +ut_testOneConsumerOneTopic(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[1]; - metadata = rd_kafka_metadata_new_topic_mockv(1, "topic1", 3); - ut_init_member(&members[0], "consumer1", "topic1", NULL); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2270,7 +2352,7 @@ static int ut_testOneConsumerOneTopic(rd_kafka_t *rk, isFullyBalanced(members, RD_ARRAYSIZE(members)); rd_kafka_group_member_clear(&members[0]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } @@ -2278,16 +2360,20 @@ static int ut_testOneConsumerOneTopic(rd_kafka_t *rk, static int ut_testOnlyAssignsPartitionsFromSubscribedTopics( rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { - + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[1]; - metadata = - rd_kafka_metadata_new_topic_mockv(2, "topic1", 3, "topic2", 3); - ut_init_member(&members[0], "consumer1", "topic1", NULL); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "topic1", 3, "topic2", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2301,22 +2387,28 @@ static int ut_testOnlyAssignsPartitionsFromSubscribedTopics( isFullyBalanced(members, RD_ARRAYSIZE(members)); rd_kafka_group_member_clear(&members[0]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int ut_testOneConsumerMultipleTopics(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testOneConsumerMultipleTopics( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[1]; - metadata = - rd_kafka_metadata_new_topic_mockv(2, "topic1", 1, "topic2", 2); - 
ut_init_member(&members[0], "consumer1", "topic1", "topic2", NULL); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "topic1", 1, "topic2", 2); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", "topic2", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2330,22 +2422,30 @@ static int ut_testOneConsumerMultipleTopics(rd_kafka_t *rk, isFullyBalanced(members, RD_ARRAYSIZE(members)); rd_kafka_group_member_clear(&members[0]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int -ut_testTwoConsumersOneTopicOnePartition(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testTwoConsumersOneTopicOnePartition( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[2]; - metadata = rd_kafka_metadata_new_topic_mockv(1, "topic1", 1); - ut_init_member(&members[0], "consumer1", "topic1", NULL); - ut_init_member(&members[1], "consumer2", "topic1", NULL); + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 1); + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2360,23 +2460,31 @@ ut_testTwoConsumersOneTopicOnePartition(rd_kafka_t *rk, rd_kafka_group_member_clear(&members[0]); rd_kafka_group_member_clear(&members[1]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int -ut_testTwoConsumersOneTopicTwoPartitions(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testTwoConsumersOneTopicTwoPartitions( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[2]; - metadata = rd_kafka_metadata_new_topic_mockv(1, "topic1", 2); - ut_init_member(&members[0], "consumer1", "topic1", NULL); - ut_init_member(&members[1], "consumer2", "topic1", NULL); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 2); + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", NULL); + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2391,7 +2499,7 @@ ut_testTwoConsumersOneTopicTwoPartitions(rd_kafka_t *rk, rd_kafka_group_member_clear(&members[0]); rd_kafka_group_member_clear(&members[1]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } @@ -2399,18 +2507,27 @@ ut_testTwoConsumersOneTopicTwoPartitions(rd_kafka_t *rk, static int ut_testMultipleConsumersMixedTopicSubscriptions( rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { + const 
rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[3]; - metadata = - rd_kafka_metadata_new_topic_mockv(2, "topic1", 3, "topic2", 2); - ut_init_member(&members[0], "consumer1", "topic1", NULL); - ut_init_member(&members[1], "consumer2", "topic1", "topic2", NULL); - ut_init_member(&members[2], "consumer3", "topic1", NULL); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "topic1", 3, "topic2", 2); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", "topic2", NULL); + ut_initMemberConditionalRack(&members[2], "consumer3", + ut_get_consumer_rack(2, parametrization), + parametrization, "topic1", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2427,24 +2544,31 @@ static int ut_testMultipleConsumersMixedTopicSubscriptions( rd_kafka_group_member_clear(&members[0]); rd_kafka_group_member_clear(&members[1]); rd_kafka_group_member_clear(&members[2]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int -ut_testTwoConsumersTwoTopicsSixPartitions(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testTwoConsumersTwoTopicsSixPartitions( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[2]; - metadata = - rd_kafka_metadata_new_topic_mockv(2, "topic1", 3, "topic2", 3); - ut_init_member(&members[0], "consumer1", "topic1", "topic2", NULL); - ut_init_member(&members[1], "consumer2", "topic1", "topic2", NULL); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "topic1", 3, "topic2", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", "topic2", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", "topic2", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2461,21 +2585,29 @@ ut_testTwoConsumersTwoTopicsSixPartitions(rd_kafka_t *rk, rd_kafka_group_member_clear(&members[0]); rd_kafka_group_member_clear(&members[1]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int ut_testAddRemoveConsumerOneTopic(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testAddRemoveConsumerOneTopic( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[2]; - metadata = rd_kafka_metadata_new_topic_mockv(1, "topic1", 3); - ut_init_member(&members[0], "consumer1", "topic1", NULL); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); + 
err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, 1, errstr, sizeof(errstr)); @@ -2488,7 +2620,9 @@ static int ut_testAddRemoveConsumerOneTopic(rd_kafka_t *rk, isFullyBalanced(members, 1); /* Add consumer2 */ - ut_init_member(&members[1], "consumer2", "topic1", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2517,7 +2651,7 @@ static int ut_testAddRemoveConsumerOneTopic(rd_kafka_t *rk, rd_kafka_group_member_clear(&members[0]); rd_kafka_group_member_clear(&members[1]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } @@ -2543,25 +2677,35 @@ static int ut_testAddRemoveConsumerOneTopic(rd_kafka_t *rk, * - consumer3: topic1-1, topic5-0 * - consumer4: topic4-0, topic5-1 */ -static int -ut_testPoorRoundRobinAssignmentScenario(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testPoorRoundRobinAssignmentScenario( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[4]; - metadata = rd_kafka_metadata_new_topic_mockv( - 5, "topic1", 2, "topic2", 1, "topic3", 2, "topic4", 1, "topic5", 2); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 5, "topic1", 2, "topic2", 1, "topic3", 2, + "topic4", 1, "topic5", 2); - ut_init_member(&members[0], "consumer1", "topic1", "topic2", "topic3", - "topic4", "topic5", NULL); - ut_init_member(&members[1], "consumer2", "topic1", "topic3", "topic5", - NULL); - ut_init_member(&members[2], "consumer3", "topic1", "topic3", "topic5", - NULL); - ut_init_member(&members[3], "consumer4", "topic1", "topic2", "topic3", - "topic4", "topic5", NULL); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", "topic2", + "topic3", "topic4", "topic5", NULL); + ut_initMemberConditionalRack( + &members[1], "consumer2", ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", "topic3", "topic5", NULL); + ut_initMemberConditionalRack( + &members[2], "consumer3", ut_get_consumer_rack(2, parametrization), + parametrization, "topic1", "topic3", "topic5", NULL); + ut_initMemberConditionalRack(&members[3], "consumer4", + ut_get_consumer_rack(3, parametrization), + parametrization, "topic1", "topic2", + "topic3", "topic4", "topic5", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2580,23 +2724,32 @@ ut_testPoorRoundRobinAssignmentScenario(rd_kafka_t *rk, rd_kafka_group_member_clear(&members[1]); rd_kafka_group_member_clear(&members[2]); rd_kafka_group_member_clear(&members[3]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int ut_testAddRemoveTopicTwoConsumers(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testAddRemoveTopicTwoConsumers( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[2]; - metadata = rd_kafka_metadata_new_topic_mockv(1, "topic1", 3); - ut_init_member(&members[0], "consumer1", "topic1", "topic2", NULL); - 
ut_init_member(&members[1], "consumer2", "topic1", "topic2", NULL); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", "topic2", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", "topic2", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2613,9 +2766,11 @@ static int ut_testAddRemoveTopicTwoConsumers(rd_kafka_t *rk, * Add topic2 */ RD_UT_SAY("Adding topic2"); - rd_kafka_metadata_destroy(metadata); - metadata = - rd_kafka_metadata_new_topic_mockv(2, "topic1", 3, "topic2", 3); + ut_destroy_metadata(metadata); + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "topic1", 3, "topic2", 3); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2636,8 +2791,11 @@ static int ut_testAddRemoveTopicTwoConsumers(rd_kafka_t *rk, * Remove topic1 */ RD_UT_SAY("Removing topic1"); - rd_kafka_metadata_destroy(metadata); - metadata = rd_kafka_metadata_new_topic_mockv(1, "topic2", 3); + ut_destroy_metadata(metadata); + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic2", 3); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -2653,15 +2811,16 @@ static int ut_testAddRemoveTopicTwoConsumers(rd_kafka_t *rk, rd_kafka_group_member_clear(&members[0]); rd_kafka_group_member_clear(&members[1]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int -ut_testReassignmentAfterOneConsumerLeaves(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testReassignmentAfterOneConsumerLeaves( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; @@ -2678,8 +2837,9 @@ ut_testReassignmentAfterOneConsumerLeaves(rd_kafka_t *rk, mt[i].partition_cnt = i + 1; } - metadata = rd_kafka_metadata_new_topic_mock(mt, topic_cnt); - + ut_initMetadataConditionalRack0(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), + parametrization, mt, topic_cnt); for (i = 1; i <= member_cnt; i++) { char name[20]; @@ -2693,7 +2853,12 @@ ut_testReassignmentAfterOneConsumerLeaves(rd_kafka_t *rk, subscription, topic, RD_KAFKA_PARTITION_UA); } rd_snprintf(name, sizeof(name), "consumer%d", i); - ut_init_member(&members[i - 1], name, NULL); + + ut_initMemberConditionalRack( + &members[i - 1], name, + ut_get_consumer_rack(i, parametrization), parametrization, + NULL); + rd_kafka_topic_partition_list_destroy( members[i - 1].rkgm_subscription); members[i - 1].rkgm_subscription = subscription; @@ -2723,15 +2888,16 @@ ut_testReassignmentAfterOneConsumerLeaves(rd_kafka_t *rk, for (i = 0; i < member_cnt; i++) rd_kafka_group_member_clear(&members[i]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int -ut_testReassignmentAfterOneConsumerAdded(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testReassignmentAfterOneConsumerAdded( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { 
rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; @@ -2739,7 +2905,9 @@ ut_testReassignmentAfterOneConsumerAdded(rd_kafka_t *rk, int member_cnt = RD_ARRAYSIZE(members); int i; - metadata = rd_kafka_metadata_new_topic_mockv(1, "topic1", 20); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 20); for (i = 1; i <= member_cnt; i++) { char name[20]; @@ -2748,7 +2916,10 @@ ut_testReassignmentAfterOneConsumerAdded(rd_kafka_t *rk, rd_kafka_topic_partition_list_add(subscription, "topic1", RD_KAFKA_PARTITION_UA); rd_snprintf(name, sizeof(name), "consumer%d", i); - ut_init_member(&members[i - 1], name, NULL); + ut_initMemberConditionalRack( + &members[i - 1], name, + ut_get_consumer_rack(i, parametrization), parametrization, + NULL); rd_kafka_topic_partition_list_destroy( members[i - 1].rkgm_subscription); members[i - 1].rkgm_subscription = subscription; @@ -2776,14 +2947,16 @@ ut_testReassignmentAfterOneConsumerAdded(rd_kafka_t *rk, for (i = 0; i < member_cnt; i++) rd_kafka_group_member_clear(&members[i]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int ut_testSameSubscriptions(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int +ut_testSameSubscriptions(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; @@ -2804,12 +2977,17 @@ static int ut_testSameSubscriptions(rd_kafka_t *rk, RD_KAFKA_PARTITION_UA); } - metadata = rd_kafka_metadata_new_topic_mock(mt, topic_cnt); + ut_initMetadataConditionalRack0(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), + parametrization, mt, topic_cnt); for (i = 1; i <= member_cnt; i++) { char name[16]; rd_snprintf(name, sizeof(name), "consumer%d", i); - ut_init_member(&members[i - 1], name, NULL); + ut_initMemberConditionalRack( + &members[i - 1], name, + ut_get_consumer_rack(i, parametrization), parametrization, + NULL); rd_kafka_topic_partition_list_destroy( members[i - 1].rkgm_subscription); members[i - 1].rkgm_subscription = @@ -2838,7 +3016,7 @@ static int ut_testSameSubscriptions(rd_kafka_t *rk, for (i = 0; i < member_cnt; i++) rd_kafka_group_member_clear(&members[i]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); rd_kafka_topic_partition_list_destroy(subscription); RD_UT_PASS(); @@ -2847,7 +3025,11 @@ static int ut_testSameSubscriptions(rd_kafka_t *rk, static int ut_testLargeAssignmentWithMultipleConsumersLeaving( rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + if (rd_unittest_with_valgrind) + RD_UT_SKIP( + "Skipping large assignment test when using Valgrind"); rd_kafka_resp_err_t err; char errstr[512]; @@ -2865,7 +3047,9 @@ static int ut_testLargeAssignmentWithMultipleConsumersLeaving( mt[i].partition_cnt = i + 1; } - metadata = rd_kafka_metadata_new_topic_mock(mt, topic_cnt); + ut_initMetadataConditionalRack0(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), + parametrization, mt, topic_cnt); for (i = 0; i < member_cnt; i++) { /* Java tests use a random set, this is more deterministic. 
*/ @@ -2882,7 +3066,10 @@ static int ut_testLargeAssignmentWithMultipleConsumersLeaving( RD_KAFKA_PARTITION_UA); rd_snprintf(name, sizeof(name), "consumer%d", i + 1); - ut_init_member(&members[i], name, NULL); + ut_initMemberConditionalRack( + &members[i], name, ut_get_consumer_rack(i, parametrization), + parametrization, NULL); + rd_kafka_topic_partition_list_destroy( members[i].rkgm_subscription); members[i].rkgm_subscription = subscription; @@ -2913,14 +3100,16 @@ static int ut_testLargeAssignmentWithMultipleConsumersLeaving( for (i = 0; i < member_cnt; i++) rd_kafka_group_member_clear(&members[i]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int ut_testNewSubscription(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int +ut_testNewSubscription(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; @@ -2928,15 +3117,19 @@ static int ut_testNewSubscription(rd_kafka_t *rk, int member_cnt = RD_ARRAYSIZE(members); int i; - metadata = rd_kafka_metadata_new_topic_mockv( - 5, "topic1", 1, "topic2", 2, "topic3", 3, "topic4", 4, "topic5", 5); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 5, "topic1", 1, "topic2", 2, "topic3", 3, + "topic4", 4, "topic5", 5); for (i = 0; i < member_cnt; i++) { char name[16]; int j; rd_snprintf(name, sizeof(name), "consumer%d", i); - ut_init_member(&members[i], name, NULL); + ut_initMemberConditionalRack( + &members[i], name, ut_get_consumer_rack(i, parametrization), + parametrization, NULL); rd_kafka_topic_partition_list_destroy( members[i].rkgm_subscription); @@ -2975,14 +3168,16 @@ static int ut_testNewSubscription(rd_kafka_t *rk, for (i = 0; i < member_cnt; i++) rd_kafka_group_member_clear(&members[i]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int ut_testMoveExistingAssignments(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testMoveExistingAssignments( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; @@ -2992,12 +3187,22 @@ static int ut_testMoveExistingAssignments(rd_kafka_t *rk, int i; int fails = 0; - metadata = rd_kafka_metadata_new_topic_mockv(1, "topic1", 3); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 3); - ut_init_member(&members[0], "consumer1", "topic1", NULL); - ut_init_member(&members[1], "consumer2", "topic1", NULL); - ut_init_member(&members[2], "consumer3", "topic1", NULL); - ut_init_member(&members[3], "consumer4", "topic1", NULL); + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", NULL); + ut_initMemberConditionalRack(&members[2], "consumer3", + ut_get_consumer_rack(2, parametrization), + parametrization, "topic1", NULL); + ut_initMemberConditionalRack(&members[3], "consumer4", + ut_get_consumer_rack(3, parametrization), + parametrization, "topic1", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, member_cnt, errstr, sizeof(errstr)); @@ -3058,14 +3263,75 @@ static int 
ut_testMoveExistingAssignments(rd_kafka_t *rk, if (assignments[i]) rd_kafka_topic_partition_list_destroy(assignments[i]); } - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } +/* The original version of this test diverged from the Java implementation in + * what it was testing. It's not certain whether this was by mistake or by + * design, but the new version matches the Java implementation, and the old one + * is retained as well, since it provides extra coverage. + */ +static int ut_testMoveExistingAssignments_j( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[3]; + int member_cnt = RD_ARRAYSIZE(members); + rd_kafka_topic_partition_list_t *assignments[4] = RD_ZERO_INIT; + int i; -static int ut_testStickiness(rd_kafka_t *rk, const rd_kafka_assignor_t *rkas) { + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 6, "topic1", 1, "topic2", 1, "topic3", 1, + "topic4", 1, "topic5", 1, "topic6", 1); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", "topic2", NULL); + ut_populate_member_owned_partitions_generation( + &members[0], 1 /* generation */, 1, "topic1", 0); + + ut_initMemberConditionalRack( + &members[1], "consumer2", ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", "topic2", "topic3", "topic4", NULL); + ut_populate_member_owned_partitions_generation( + &members[1], 1 /* generation */, 2, "topic2", 0, "topic3", 0); + + ut_initMemberConditionalRack(&members[2], "consumer3", + ut_get_consumer_rack(2, parametrization), + parametrization, "topic2", "topic3", + "topic4", "topic5", "topic6", NULL); + ut_populate_member_owned_partitions_generation( + &members[2], 1 /* generation */, 3, "topic4", 0, "topic5", 0, + "topic6", 0); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + member_cnt, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyValidityAndBalance(members, member_cnt, metadata); + + for (i = 0; i < member_cnt; i++) { + rd_kafka_group_member_clear(&members[i]); + if (assignments[i]) + rd_kafka_topic_partition_list_destroy(assignments[i]); + } + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + + +static int +ut_testStickiness(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; @@ -3073,18 +3339,22 @@ static int ut_testStickiness(rd_kafka_t *rk, const rd_kafka_assignor_t *rkas) { int member_cnt = RD_ARRAYSIZE(members); int i; - metadata = rd_kafka_metadata_new_topic_mockv( - 6, "topic1", 1, "topic2", 1, "topic3", 1, "topic4", 1, "topic5", 1, - "topic6", 1); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 6, "topic1", 1, "topic2", 1, "topic3", 1, + "topic4", 1, "topic5", 1, "topic6", 1); - ut_init_member(&members[0], "consumer1", "topic1", "topic2", NULL); + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", "topic2", NULL); rd_kafka_topic_partition_list_destroy(members[0].rkgm_assignment); members[0].rkgm_assignment = rd_kafka_topic_partition_list_new(1);
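
/* [Editor's note] The *_j tests above seed prior ownership through
 * ut_populate_member_owned_partitions_generation(), which takes an explicit
 * pair count followed by that many (topic, partition) varargs, unlike the
 * NULL-terminated style used elsewhere in this file. The counted-varargs
 * pattern in isolation: */
#include <stdarg.h>
#include <stdio.h>

static void print_owned(int generation, size_t pair_cnt, ...) {
    va_list ap;
    size_t i;

    va_start(ap, pair_cnt);
    for (i = 0; i < pair_cnt; i++) {
        const char *topic = va_arg(ap, const char *);
        int partition     = va_arg(ap, int);
        printf("gen %d owns %s [%d]\n", generation, topic, partition);
    }
    va_end(ap);
}

int main(void) {
    print_owned(1, 2, "topic2", 0, "topic3", 0);
    return 0;
}
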
rd_kafka_topic_partition_list_add(members[0].rkgm_assignment, "topic1", 0); - ut_init_member(&members[1], "consumer2", "topic1", "topic2", "topic3", - "topic4", NULL); + ut_initMemberConditionalRack( + &members[1], "consumer2", ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", "topic2", "topic3", "topic4", NULL); rd_kafka_topic_partition_list_destroy(members[1].rkgm_assignment); members[1].rkgm_assignment = rd_kafka_topic_partition_list_new(2); rd_kafka_topic_partition_list_add(members[1].rkgm_assignment, "topic2", @@ -3092,8 +3362,9 @@ static int ut_testStickiness(rd_kafka_t *rk, const rd_kafka_assignor_t *rkas) { rd_kafka_topic_partition_list_add(members[1].rkgm_assignment, "topic3", 0); - ut_init_member(&members[2], "consumer3", "topic4", "topic5", "topic6", - NULL); + ut_initMemberConditionalRack( + &members[2], "consumer3", ut_get_consumer_rack(1, parametrization), + parametrization, "topic4", "topic5", "topic6", NULL); rd_kafka_topic_partition_list_destroy(members[2].rkgm_assignment); members[2].rkgm_assignment = rd_kafka_topic_partition_list_new(3); rd_kafka_topic_partition_list_add(members[2].rkgm_assignment, "topic4", @@ -3110,10 +3381,113 @@ static int ut_testStickiness(rd_kafka_t *rk, const rd_kafka_assignor_t *rkas) { verifyValidityAndBalance(members, RD_ARRAYSIZE(members), metadata); - for (i = 0; i < member_cnt; i++) rd_kafka_group_member_clear(&members[i]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + + +/* The original version of this test diverged from the Java implementation in + * what it was testing. It's not certain whether it was by mistake, or by + * design, but the new version matches the Java implementation, and the old one + * is retained as well, for extra coverage. + */ +static int +ut_testStickiness_j(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[4]; + int member_cnt = RD_ARRAYSIZE(members); + int i; + rd_kafka_topic_partition_list_t *assignments[4] = RD_ZERO_INIT; + int fails = 0; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", NULL); + ut_initMemberConditionalRack(&members[2], "consumer3", + ut_get_consumer_rack(2, parametrization), + parametrization, "topic1", NULL); + ut_initMemberConditionalRack(&members[3], "consumer4", + ut_get_consumer_rack(3, parametrization), + parametrization, "topic1", NULL); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + member_cnt, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyValidityAndBalance(members, member_cnt, metadata); + + for (i = 0; i < member_cnt; i++) { + if (members[i].rkgm_assignment->cnt > 1) { + RD_UT_WARN("%s assigned %d partitions, expected <= 1", + members[i].rkgm_member_id->str, + members[i].rkgm_assignment->cnt); + fails++; + } else if (members[i].rkgm_assignment->cnt == 1) { + assignments[i] = rd_kafka_topic_partition_list_copy( + members[i].rkgm_assignment); + } + } + + /* + * Remove potential group leader consumer1, by starting members at + * index 1.
+ * Owned partitions of the members are already set to the assignment by + * verifyValidityAndBalance above to simulate the fact that the assignor + * has already run once. + */ + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, &members[1], + member_cnt - 1, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyValidityAndBalance(&members[1], member_cnt - 1, metadata); + // FIXME: isSticky() + + for (i = 1; i < member_cnt; i++) { + if (members[i].rkgm_assignment->cnt != 1) { + RD_UT_WARN("%s assigned %d partitions, expected 1", + members[i].rkgm_member_id->str, + members[i].rkgm_assignment->cnt); + fails++; + } else if (assignments[i] && + !rd_kafka_topic_partition_list_find( + assignments[i], + members[i].rkgm_assignment->elems[0].topic, + members[i] + .rkgm_assignment->elems[0] + .partition)) { + RD_UT_WARN( + "Stickiness was not honored for %s, " + "%s [%" PRId32 "] not in previous assignment", + members[i].rkgm_member_id->str, + members[i].rkgm_assignment->elems[0].topic, + members[i].rkgm_assignment->elems[0].partition); + fails++; + } + } + + RD_UT_ASSERT(!fails, "See previous errors"); + + + for (i = 0; i < member_cnt; i++) { + rd_kafka_group_member_clear(&members[i]); + if (assignments[i]) + rd_kafka_topic_partition_list_destroy(assignments[i]); + } + ut_destroy_metadata(metadata); RD_UT_PASS(); } @@ -3122,7 +3496,10 @@ static int ut_testStickiness(rd_kafka_t *rk, const rd_kafka_assignor_t *rkas) { /** * @brief Verify stickiness across three rebalances. */ -static int ut_testStickiness2(rd_kafka_t *rk, const rd_kafka_assignor_t *rkas) { +static int +ut_testStickiness2(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; @@ -3130,11 +3507,19 @@ static int ut_testStickiness2(rd_kafka_t *rk, const rd_kafka_assignor_t *rkas) { int member_cnt = RD_ARRAYSIZE(members); int i; - metadata = rd_kafka_metadata_new_topic_mockv(1, "topic1", 6); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 6); - ut_init_member(&members[0], "consumer1", "topic1", NULL); - ut_init_member(&members[1], "consumer2", "topic1", NULL); - ut_init_member(&members[2], "consumer3", "topic1", NULL); + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", NULL); + ut_initMemberConditionalRack(&members[2], "consumer3", + ut_get_consumer_rack(2, parametrization), + parametrization, "topic1", NULL); /* Just consumer1 */ err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, 1, @@ -3196,24 +3581,28 @@ static int ut_testStickiness2(rd_kafka_t *rk, const rd_kafka_assignor_t *rkas) { for (i = 0; i < member_cnt; i++) rd_kafka_group_member_clear(&members[i]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int -ut_testAssignmentUpdatedForDeletedTopic(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testAssignmentUpdatedForDeletedTopic( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[1]; - metadata = - rd_kafka_metadata_new_topic_mockv(2, 
"topic1", 1, "topic3", 100); - ut_init_member(&members[0], "consumer1", "topic1", "topic2", "topic3", - NULL); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "topic1", 1, "topic3", 100); + + ut_initMemberConditionalRack( + &members[0], "consumer1", ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", "topic2", "topic3", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -3228,7 +3617,7 @@ ut_testAssignmentUpdatedForDeletedTopic(rd_kafka_t *rk, members[0].rkgm_assignment->cnt); rd_kafka_group_member_clear(&members[0]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } @@ -3236,16 +3625,21 @@ ut_testAssignmentUpdatedForDeletedTopic(rd_kafka_t *rk, static int ut_testNoExceptionThrownWhenOnlySubscribedTopicDeleted( rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { - + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; rd_kafka_group_member_t members[1]; - metadata = rd_kafka_metadata_new_topic_mockv(1, "topic1", 3); + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); - ut_init_member(&members[0], "consumer1", "topic", NULL); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -3258,8 +3652,8 @@ static int ut_testNoExceptionThrownWhenOnlySubscribedTopicDeleted( /* * Remove topic */ - rd_kafka_metadata_destroy(metadata); - metadata = rd_kafka_metadata_new_topic_mock(NULL, 0); + ut_destroy_metadata(metadata); + metadata = rd_kafka_metadata_new_topic_mock(NULL, 0, -1, 0); err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, RD_ARRAYSIZE(members), errstr, @@ -3270,15 +3664,16 @@ static int ut_testNoExceptionThrownWhenOnlySubscribedTopicDeleted( isFullyBalanced(members, RD_ARRAYSIZE(members)); rd_kafka_group_member_clear(&members[0]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } -static int -ut_testConflictingPreviousAssignments(rd_kafka_t *rk, - const rd_kafka_assignor_t *rkas) { +static int ut_testConflictingPreviousAssignments( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { rd_kafka_resp_err_t err; char errstr[512]; rd_kafka_metadata_t *metadata; @@ -3287,6 +3682,8 @@ ut_testConflictingPreviousAssignments(rd_kafka_t *rk, int i; // FIXME: removed from Java test suite, and fails for us, why, why? + // NOTE: rack-awareness changes aren't made to this test because of + // the FIXME above. RD_UT_PASS(); metadata = rd_kafka_metadata_new_topic_mockv(1, "topic1", 2); @@ -3331,7 +3728,7 @@ ut_testConflictingPreviousAssignments(rd_kafka_t *rk, for (i = 0; i < member_cnt; i++) rd_kafka_group_member_clear(&members[i]); - rd_kafka_metadata_destroy(metadata); + ut_destroy_metadata(metadata); RD_UT_PASS(); } @@ -3340,13 +3737,947 @@ ut_testConflictingPreviousAssignments(rd_kafka_t *rk, * from Java since random tests don't provide meaningful test coverage. 
*/ +static int ut_testAllConsumersReachExpectedQuotaAndAreConsideredFilled( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[3]; + int member_cnt = RD_ARRAYSIZE(members); + int i; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 4); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); + ut_populate_member_owned_partitions_generation( + &members[0], 1 /* generation */, 2, "topic1", 0, "topic1", 1); + + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", NULL); + ut_populate_member_owned_partitions_generation( + &members[1], 1 /* generation */, 1, "topic1", 2); + + ut_initMemberConditionalRack(&members[2], "consumer3", + ut_get_consumer_rack(2, parametrization), + parametrization, "topic1", NULL); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + member_cnt, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyValidityAndBalance(members, RD_ARRAYSIZE(members), metadata); + verifyAssignment(&members[0], "topic1", 0, "topic1", 1, NULL); + verifyAssignment(&members[1], "topic1", 2, NULL); + verifyAssignment(&members[2], "topic1", 3, NULL); + + for (i = 0; i < member_cnt; i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + + +static int ut_testOwnedPartitionsAreInvalidatedForConsumerWithStaleGeneration( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[2]; + int member_cnt = RD_ARRAYSIZE(members); + int i; + int current_generation = 10; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "topic1", 3, "topic2", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", "topic2", NULL); + ut_populate_member_owned_partitions_generation( + &members[0], current_generation, 3, "topic1", 0, "topic1", 2, + "topic2", 1); + + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", "topic2", NULL); + ut_populate_member_owned_partitions_generation( + &members[1], current_generation - 1, 3, "topic1", 0, "topic1", 2, + "topic2", 1); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + member_cnt, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyValidityAndBalance(members, RD_ARRAYSIZE(members), metadata); + verifyAssignment(&members[0], "topic1", 0, "topic1", 2, "topic2", 1, + NULL); + verifyAssignment(&members[1], "topic1", 1, "topic2", 0, "topic2", 2, + NULL); + + + for (i = 0; i < member_cnt; i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int ut_testOwnedPartitionsAreInvalidatedForConsumerWithNoGeneration( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + 
rd_kafka_group_member_t members[2]; + int member_cnt = RD_ARRAYSIZE(members); + int i; + int current_generation = 10; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "topic1", 3, "topic2", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", "topic2", NULL); + ut_populate_member_owned_partitions_generation( + &members[0], current_generation, 3, "topic1", 0, "topic1", 2, + "topic2", 1); + + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", "topic2", NULL); + ut_populate_member_owned_partitions_generation( + &members[1], -1 /* default generation*/, 3, "topic1", 0, "topic1", + 2, "topic2", 1); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + member_cnt, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyValidityAndBalance(members, RD_ARRAYSIZE(members), metadata); + verifyAssignment(&members[0], "topic1", 0, "topic1", 2, "topic2", 1, + NULL); + verifyAssignment(&members[1], "topic1", 1, "topic2", 0, "topic2", 2, + NULL); + + + for (i = 0; i < member_cnt; i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +static int +ut_testPartitionsTransferringOwnershipIncludeThePartitionClaimedByMultipleConsumersInSameGeneration( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[3]; + int member_cnt = RD_ARRAYSIZE(members); + int i; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 3); + + // partition topic-0 is owned by multiple consumers + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); + ut_populate_member_owned_partitions_generation( + &members[0], 1 /* generation */, 2, "topic1", 0, "topic1", 1); + + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", NULL); + ut_populate_member_owned_partitions_generation( + &members[1], 1 /* generation */, 2, "topic1", 0, "topic1", 2); + + ut_initMemberConditionalRack(&members[2], "consumer3", + ut_get_consumer_rack(2, parametrization), + parametrization, "topic1", NULL); + + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + member_cnt, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyValidityAndBalance(members, RD_ARRAYSIZE(members), metadata); + verifyAssignment(&members[0], "topic1", 1, NULL); + verifyAssignment(&members[1], "topic1", 2, NULL); + verifyAssignment(&members[2], "topic1", 0, NULL); + + for (i = 0; i < member_cnt; i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + + +/* In Java, there is a way to check what partition transferred ownership. + * We don't have anything like that for our UTs, so in lieu of that, this + * test is added along with the previous test to make sure that we move the + * right partition. Our solution in case of two consumers owning the same + * partition with the same generation id differed from the Java + * implementation earlier. (Check #4252.)
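Stated as code, the ownership-conflict rule that this pair of tests (and the generation tests nearby) pins down is roughly the following; this is a sketch of the expected behavior, not the assignor's actual internals:

/* Sketch: when two members' owned lists claim the same partition, the
 * higher generation keeps it; on a tie neither claim is honored and the
 * partition is reassigned as if unowned, which is why consumer3 ends up
 * with topic1 [0] in the test above. */
static rd_kafka_group_member_t *
ut_resolve_claim_sketch(rd_kafka_group_member_t *a,
                        rd_kafka_group_member_t *b) {
        if (a->rkgm_generation > b->rkgm_generation)
                return a;
        if (b->rkgm_generation > a->rkgm_generation)
                return b;
        return NULL; /* equal generations: invalidated for both */
}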
*/ +static int +ut_testPartitionsTransferringOwnershipIncludeThePartitionClaimedByMultipleConsumersInSameGeneration2( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[3]; + int member_cnt = RD_ARRAYSIZE(members); + int i; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 1, "topic1", 3); + + // partition topic-0 is owned by multiple consumers + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", NULL); + ut_populate_member_owned_partitions_generation( + &members[0], 1 /* generation */, 2, "topic1", 0, "topic1", 1); + + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", NULL); + ut_populate_member_owned_partitions_generation( + &members[1], 1 /* generation */, 2, "topic1", 1, "topic1", 2); + + ut_initMemberConditionalRack(&members[2], "consumer3", + ut_get_consumer_rack(2, parametrization), + parametrization, "topic1", NULL); + + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + member_cnt, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyValidityAndBalance(members, RD_ARRAYSIZE(members), metadata); + verifyAssignment(&members[0], "topic1", 0, NULL); + verifyAssignment(&members[1], "topic1", 2, NULL); + verifyAssignment(&members[2], "topic1", 1, NULL); + + for (i = 0; i < member_cnt; i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + + +static int ut_testEnsurePartitionsAssignedToHighestGeneration( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[3]; + int member_cnt = RD_ARRAYSIZE(members); + int i; + int currentGeneration = 10; + + ut_initMetadataConditionalRack( + &metadata, 3, 3, ALL_RACKS, RD_ARRAYSIZE(ALL_RACKS), + parametrization, 3, "topic1", 3, "topic2", 3, "topic3", 3); + + ut_initMemberConditionalRack( + &members[0], "consumer1", ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", "topic2", "topic3", NULL); + ut_populate_member_owned_partitions_generation( + &members[0], currentGeneration, 3, "topic1", 0, "topic2", 0, + "topic3", 0); + + + ut_initMemberConditionalRack( + &members[1], "consumer2", ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", "topic2", "topic3", NULL); + ut_populate_member_owned_partitions_generation( + &members[1], currentGeneration - 1, 3, "topic1", 1, "topic2", 1, + "topic3", 1); + + + ut_initMemberConditionalRack( + &members[2], "consumer3", ut_get_consumer_rack(2, parametrization), + parametrization, "topic1", "topic2", "topic3", NULL); + ut_populate_member_owned_partitions_generation( + &members[2], currentGeneration - 2, 3, "topic2", 1, "topic3", 0, + "topic3", 2); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + member_cnt, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + verifyAssignment(&members[0], "topic1", 0, "topic2", 0, "topic3", 0, + NULL); + verifyAssignment(&members[1], "topic1", 1, "topic2", 1, "topic3", 1, + NULL); + verifyAssignment(&members[2], "topic1", 2, "topic2", 2, 
"topic3", 2, + NULL); + + verifyValidityAndBalance(members, RD_ARRAYSIZE(members), metadata); + + for (i = 0; i < member_cnt; i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + + +static int ut_testNoReassignmentOnCurrentMembers( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[4]; + int member_cnt = RD_ARRAYSIZE(members); + int i; + int currentGeneration = 10; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 4, "topic0", 3, "topic1", 3, "topic2", 3, + "topic3", 3); + + ut_initMemberConditionalRack( + &members[0], "consumer1", ut_get_consumer_rack(0, parametrization), + parametrization, "topic0", "topic1", "topic2", "topic3", NULL); + ut_populate_member_owned_partitions_generation( + &members[0], -1 /* default generation */, 0); + + ut_initMemberConditionalRack( + &members[1], "consumer2", ut_get_consumer_rack(1, parametrization), + parametrization, "topic0", "topic1", "topic2", "topic3", NULL); + ut_populate_member_owned_partitions_generation( + &members[1], currentGeneration - 1, 3, "topic0", 0, "topic2", 0, + "topic1", 0); + + ut_initMemberConditionalRack( + &members[2], "consumer3", ut_get_consumer_rack(2, parametrization), + parametrization, "topic0", "topic1", "topic2", "topic3", NULL); + ut_populate_member_owned_partitions_generation( + &members[2], currentGeneration - 2, 3, "topic3", 2, "topic2", 2, + "topic1", 1); + + ut_initMemberConditionalRack( + &members[3], "consumer4", ut_get_consumer_rack(3, parametrization), + parametrization, "topic0", "topic1", "topic2", "topic3", NULL); + ut_populate_member_owned_partitions_generation( + &members[3], currentGeneration - 3, 3, "topic3", 1, "topic0", 1, + "topic0", 2); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + member_cnt, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyValidityAndBalance(members, member_cnt, metadata); + verifyAssignment(&members[0], "topic1", 2, "topic2", 1, "topic3", 0, + NULL); + + for (i = 0; i < member_cnt; i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + + +static int +ut_testOwnedPartitionsAreInvalidatedForConsumerWithMultipleGeneration( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata; + rd_kafka_group_member_t members[2]; + int member_cnt = RD_ARRAYSIZE(members); + int i; + int currentGeneration = 10; + + ut_initMetadataConditionalRack(&metadata, 3, 3, ALL_RACKS, + RD_ARRAYSIZE(ALL_RACKS), parametrization, + 2, "topic1", 3, "topic2", 3); + + ut_initMemberConditionalRack(&members[0], "consumer1", + ut_get_consumer_rack(0, parametrization), + parametrization, "topic1", "topic2", NULL); + ut_populate_member_owned_partitions_generation( + &members[0], currentGeneration, 3, "topic1", 0, "topic2", 1, + "topic1", 1); + + ut_initMemberConditionalRack(&members[1], "consumer2", + ut_get_consumer_rack(1, parametrization), + parametrization, "topic1", "topic2", NULL); + ut_populate_member_owned_partitions_generation( + &members[1], currentGeneration - 2, 3, "topic1", 0, "topic2", 1, + "topic2", 2); + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, metadata, members, + 
member_cnt, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + verifyValidityAndBalance(members, member_cnt, metadata); + verifyAssignment(&members[0], "topic1", 0, "topic2", 1, "topic1", 1, + NULL); + verifyAssignment(&members[1], "topic1", 2, "topic2", 2, "topic2", 0, + NULL); + + for (i = 0; i < member_cnt; i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + + RD_UT_PASS(); +} + +/* Helper for setting up metadata and members, running the assignor, and + * verifying validity and balance of the assignment. Does not check the results + * of the assignment on a per-member basis. + */ +static int +setupRackAwareAssignment0(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_group_member_t *members, + size_t member_cnt, + int replication_factor, + int num_broker_racks, + size_t topic_cnt, + char *topics[], + int *partitions, + int *subscriptions_count, + char **subscriptions[], + int *consumer_racks, + rd_kafka_topic_partition_list_t **owned_tp_list, + rd_bool_t initialize_members, + rd_kafka_metadata_t **metadata) { + rd_kafka_resp_err_t err; + char errstr[512]; + rd_kafka_metadata_t *metadata_local = NULL; + + size_t i = 0; + const int num_brokers = num_broker_racks > 0 + ? replication_factor * num_broker_racks + : replication_factor; + if (!metadata) + metadata = &metadata_local; + + /* The member naming for tests is consumerN where N is a single + * character. */ + rd_assert(member_cnt <= 9); + + *metadata = rd_kafka_metadata_new_topic_with_partition_replicas_mock( + replication_factor, num_brokers, topics, partitions, topic_cnt); + ut_populate_internal_broker_metadata( + rd_kafka_metadata_get_internal(*metadata), num_broker_racks, + ALL_RACKS, RD_ARRAYSIZE(ALL_RACKS)); + ut_populate_internal_topic_metadata( + rd_kafka_metadata_get_internal(*metadata)); + + for (i = 0; initialize_members && i < member_cnt; i++) { + char member_id[10]; + snprintf(member_id, 10, "consumer%d", (int)(i + 1)); + ut_init_member_with_rack( + &members[i], member_id, ALL_RACKS[consumer_racks[i]], + subscriptions[i], subscriptions_count[i]); + + if (!owned_tp_list || !owned_tp_list[i]) + continue; + + if (members[i].rkgm_owned) + rd_kafka_topic_partition_list_destroy( + members[i].rkgm_owned); + + members[i].rkgm_owned = + rd_kafka_topic_partition_list_copy(owned_tp_list[i]); + } + + err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, *metadata, members, + member_cnt, errstr, sizeof(errstr)); + RD_UT_ASSERT(!err, "assignor run failed: %s", errstr); + + /* Note that verifyValidityAndBalance also sets rkgm_owned for each + * member to rkgm_assignment, so if the members are used without + * clearing in another assignor_run, the result should be stable.
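Spelled out, the double-run stability pattern this note enables looks roughly like the following sketch, using only names already present in this helper (the assertion messages are illustrative):

/* Sketch: a second assignor_run over un-cleared members starts from the
 * first run's assignment (copied into rkgm_owned by
 * verifyValidityAndBalance), so a sticky assignor should reproduce it. */
err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, *metadata, members, member_cnt,
                            errstr, sizeof(errstr));
RD_UT_ASSERT(!err, "first run failed: %s", errstr);
verifyValidityAndBalance(members, member_cnt, *metadata); /* sets rkgm_owned */

err = rd_kafka_assignor_run(rk->rk_cgrp, rkas, *metadata, members, member_cnt,
                            errstr, sizeof(errstr));
RD_UT_ASSERT(!err, "second run failed: %s", errstr);
verifyValidityAndBalance(members, member_cnt, *metadata); /* stable result */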
*/ + verifyValidityAndBalance(members, member_cnt, *metadata); + + if (metadata_local) + ut_destroy_metadata(metadata_local); + return 0; +} + +static int +setupRackAwareAssignment(rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_group_member_t *members, + size_t member_cnt, + int replication_factor, + int num_broker_racks, + size_t topic_cnt, + char *topics[], + int *partitions, + int *subscriptions_count, + char **subscriptions[], + int *consumer_racks, + rd_kafka_topic_partition_list_t **owned_tp_list, + rd_bool_t initialize_members) { + return setupRackAwareAssignment0( + rk, rkas, members, member_cnt, replication_factor, num_broker_racks, + topic_cnt, topics, partitions, subscriptions_count, subscriptions, + consumer_racks, owned_tp_list, initialize_members, NULL); +} + +/* Helper for testing cases where rack-aware assignment should not be triggered, + * and assignment should be the same as the pre-rack-aware assignor. Each case + * is run twice, once with owned partitions set to empty, and in the second + * case, with owned partitions set to the result of the previous run, to check + * that the assignment is stable. */ +#define verifyNonRackAwareAssignment(rk, rkas, members, member_cnt, topic_cnt, \ + topics, partitions, subscriptions_count, \ + subscriptions, ...) \ + do { \ + size_t idx = 0; \ + int init_members = 1; \ + rd_kafka_metadata_t *metadata; \ + \ + /* num_broker_racks = 0, implies that brokers have no \ + * configured racks. */ \ + for (init_members = 1; init_members >= 0; init_members--) { \ + setupRackAwareAssignment( \ + rk, rkas, members, member_cnt, 3, 0, topic_cnt, \ + topics, partitions, subscriptions_count, \ + subscriptions, RACKS_INITIAL, NULL, init_members); \ + verifyMultipleAssignment(members, member_cnt, \ + __VA_ARGS__); \ + } \ + for (idx = 0; idx < member_cnt; idx++) \ + rd_kafka_group_member_clear(&members[idx]); \ + /* consumer_racks = RACKS_NULL implies that consumers have no \ + * racks. */ \ + for (init_members = 1; init_members >= 0; init_members--) { \ + setupRackAwareAssignment( \ + rk, rkas, members, member_cnt, 3, 3, topic_cnt, \ + topics, partitions, subscriptions_count, \ + subscriptions, RACKS_NULL, NULL, init_members); \ + verifyMultipleAssignment(members, member_cnt, \ + __VA_ARGS__); \ + } \ + for (idx = 0; idx < member_cnt; idx++) \ + rd_kafka_group_member_clear(&members[idx]); \ + /* replication_factor = 3 and num_broker_racks = 3 means that \ + * all partitions are replicated on all racks.*/ \ + for (init_members = 1; init_members >= 0; init_members--) { \ + setupRackAwareAssignment0( \ + rk, rkas, members, member_cnt, 3, 3, topic_cnt, \ + topics, partitions, subscriptions_count, \ + subscriptions, RACKS_INITIAL, NULL, init_members, \ + &metadata); \ + verifyMultipleAssignment(members, member_cnt, \ + __VA_ARGS__); \ + verifyNumPartitionsWithRackMismatch( \ + metadata, members, RD_ARRAYSIZE(members), 0); \ + ut_destroy_metadata(metadata); \ + } \ + for (idx = 0; idx < member_cnt; idx++) \ + rd_kafka_group_member_clear(&members[idx]); \ + /* replication_factor = 4 and num_broker_racks = 4 means that \ + * all partitions are replicated on all racks. 
*/ \ + for (init_members = 1; init_members >= 0; init_members--) { \ + setupRackAwareAssignment0( \ + rk, rkas, members, member_cnt, 4, 4, topic_cnt, \ + topics, partitions, subscriptions_count, \ + subscriptions, RACKS_INITIAL, NULL, init_members, \ + &metadata); \ + verifyMultipleAssignment(members, member_cnt, \ + __VA_ARGS__); \ + verifyNumPartitionsWithRackMismatch( \ + metadata, members, RD_ARRAYSIZE(members), 0); \ + ut_destroy_metadata(metadata); \ + } \ + for (idx = 0; idx < member_cnt; idx++) \ + rd_kafka_group_member_clear(&members[idx]); \ + /* There's no overlap between broker racks and consumer racks, \ + * since num_broker_racks = 3, they'll be picked from a,b,c \ + * and consumer racks are d,e,f. */ \ + for (init_members = 1; init_members >= 0; init_members--) { \ + setupRackAwareAssignment( \ + rk, rkas, members, member_cnt, 3, 3, topic_cnt, \ + topics, partitions, subscriptions_count, \ + subscriptions, RACKS_FINAL, NULL, init_members); \ + verifyMultipleAssignment(members, member_cnt, \ + __VA_ARGS__); \ + } \ + for (idx = 0; idx < member_cnt; idx++) \ + rd_kafka_group_member_clear(&members[idx]); \ + /* There's no overlap between broker racks and consumer racks, \ + * since num_broker_racks = 3, they'll be picked from a,b,c \ + * and consumer racks are d,e,NULL. */ \ + for (init_members = 1; init_members >= 0; init_members--) { \ + setupRackAwareAssignment( \ + rk, rkas, members, member_cnt, 3, 3, topic_cnt, \ + topics, partitions, subscriptions_count, \ + subscriptions, RACKS_ONE_NULL, NULL, \ + init_members); \ + verifyMultipleAssignment(members, member_cnt, \ + __VA_ARGS__); \ + } \ + for (idx = 0; idx < member_cnt; idx++) \ + rd_kafka_group_member_clear(&members[idx]); \ + } while (0) + + +static int ut_testRackAwareAssignmentWithUniformSubscription( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + char *topics[] = {"t1", "t2", "t3"}; + int partitions[] = {6, 7, 2}; + rd_kafka_group_member_t members[3]; + size_t member_cnt = RD_ARRAYSIZE(members); + size_t i = 0; + int subscriptions_count[] = {3, 3, 3}; + char **subscriptions[] = {topics, topics, topics}; + int init_members = 0; + rd_kafka_topic_partition_list_t **owned; + rd_kafka_metadata_t *metadata; + + if (parametrization != + RD_KAFKA_RANGE_ASSIGNOR_UT_BROKER_AND_CONSUMER_RACK) { + RD_UT_PASS(); + } + + verifyNonRackAwareAssignment( + rk, rkas, members, RD_ARRAYSIZE(members), RD_ARRAYSIZE(topics), + topics, partitions, subscriptions_count, subscriptions, + /* consumer1 */ + "t1", 0, "t1", 3, "t2", 0, "t2", 3, "t2", 6, NULL, + /* consumer2 */ + "t1", 1, "t1", 4, "t2", 1, "t2", 4, "t3", 0, NULL, + /* consumer3 */ + "t1", 2, "t1", 5, "t2", 2, "t2", 5, "t3", 1, NULL); + + /* Verify assignment is rack-aligned for lower replication factor where + * brokers have a subset of partitions */ + for (init_members = 1; init_members >= 0; init_members--) { + setupRackAwareAssignment0( + rk, rkas, members, RD_ARRAYSIZE(members), 1, 3, + RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, RACKS_INITIAL, NULL, + init_members, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 3, "t2", 0, "t2", 3, "t2", 6, NULL, + /* consumer2 */ + "t1", 1, "t1", 4, "t2", 1, "t2", 4, "t3", 0, NULL, + /* consumer3 */ + "t1", 2, "t1", 5, "t2", 2, "t2", 5, "t3", 1, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 0); + ut_destroy_metadata(metadata); + } + for (i =
0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + + + for (init_members = 1; init_members >= 0; init_members--) { + setupRackAwareAssignment0( + rk, rkas, members, RD_ARRAYSIZE(members), 2, 3, + RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, RACKS_INITIAL, NULL, + init_members, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 3, "t2", 0, "t2", 3, "t2", 6, NULL, + /* consumer2 */ + "t1", 1, "t1", 4, "t2", 1, "t2", 4, "t3", 0, NULL, + /* consumer3 */ + "t1", 2, "t1", 5, "t2", 2, "t2", 5, "t3", 1, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 0); + ut_destroy_metadata(metadata); + } + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + + /* One consumer on a rack with no partitions. We allocate with + * misaligned rack to this consumer to maintain balance. */ + for (init_members = 1; init_members >= 0; init_members--) { + setupRackAwareAssignment0( + rk, rkas, members, RD_ARRAYSIZE(members), 3, 2, + RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, RACKS_INITIAL, NULL, + init_members, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 3, "t2", 0, "t2", 3, "t2", 6, NULL, + /* consumer2 */ + "t1", 1, "t1", 4, "t2", 1, "t2", 4, "t3", 0, NULL, + /* consumer3 */ + "t1", 2, "t1", 5, "t2", 2, "t2", 5, "t3", 1, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 5); + ut_destroy_metadata(metadata); + } + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + + /* Verify that rack-awareness is improved if already owned partitions + * are misaligned */ + owned = ut_create_topic_partition_lists( + 3, + /* consumer1 */ + "t1", 0, "t1", 1, "t1", 2, "t1", 3, "t1", 4, NULL, + /* consumer2 */ + "t1", 5, "t2", 0, "t2", 1, "t2", 2, "t2", 3, NULL, + /* consumer3 */ + "t2", 4, "t2", 5, "t2", 6, "t3", 0, "t3", 1, NULL); + + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 1, + 3, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, + RACKS_INITIAL, owned, rd_true, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 3, "t2", 0, "t2", 3, "t2", 6, NULL, + /* consumer2 */ + "t1", 1, "t1", 4, "t2", 1, "t2", 4, "t3", 0, NULL, + /* consumer3 */ + "t1", 2, "t1", 5, "t2", 2, "t2", 5, "t3", 1, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 0); + ut_destroy_metadata(metadata); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + for (i = 0; i < member_cnt; i++) + rd_kafka_topic_partition_list_destroy(owned[i]); + rd_free(owned); + + + /* Verify that stickiness is retained when racks match */ + owned = ut_create_topic_partition_lists( + 3, + /* consumer1 */ + "t1", 0, "t1", 3, "t2", 0, "t2", 3, "t2", 6, NULL, + /* consumer2 */ + "t1", 1, "t1", 4, "t2", 1, "t2", 4, "t3", 0, NULL, + /* consumer3 */ + "t1", 2, "t1", 5, "t2", 2, "t2", 5, "t3", 1, NULL); + + /* This test deviates slightly from Java, in that we test with two + * additional replication factors, 1 and 2, which are not tested in + * Java. This is because in Java, there is a way to turn rack aware + * logic on or off for tests. 
We don't have that, and to test with rack + * aware logic, we need to change something, in this case, the + * replication factor. */ + for (i = 1; i <= 3; i++) { + setupRackAwareAssignment0( + rk, rkas, members, RD_ARRAYSIZE(members), + i /* replication factor */, 3, RD_ARRAYSIZE(topics), topics, + partitions, subscriptions_count, subscriptions, + RACKS_INITIAL, owned, rd_true, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 0, "t1", 3, "t2", 0, "t2", 3, "t2", 6, NULL, + /* consumer2 */ + "t1", 1, "t1", 4, "t2", 1, "t2", 4, "t3", 0, NULL, + /* consumer3 */ + "t1", 2, "t1", 5, "t2", 2, "t2", 5, "t3", 1, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 0); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + ut_destroy_metadata(metadata); + } + + for (i = 0; i < member_cnt; i++) + rd_kafka_topic_partition_list_destroy(owned[i]); + rd_free(owned); + + RD_UT_PASS(); +} + + +static int ut_testRackAwareAssignmentWithNonEqualSubscription( + rd_kafka_t *rk, + const rd_kafka_assignor_t *rkas, + rd_kafka_assignor_ut_rack_config_t parametrization) { + char *topics[] = {"t1", "t2", "t3"}; + char *topics0[] = {"t1", "t3"}; + int partitions[] = {6, 7, 2}; + rd_kafka_group_member_t members[3]; + size_t member_cnt = RD_ARRAYSIZE(members); + size_t i = 0; + int subscriptions_count[] = {3, 3, 2}; + char **subscriptions[] = {topics, topics, topics0}; + int with_owned = 0; + rd_kafka_topic_partition_list_t **owned; + rd_kafka_metadata_t *metadata; + + if (parametrization != + RD_KAFKA_RANGE_ASSIGNOR_UT_BROKER_AND_CONSUMER_RACK) { + RD_UT_PASS(); + } + + verifyNonRackAwareAssignment( + rk, rkas, members, RD_ARRAYSIZE(members), RD_ARRAYSIZE(topics), + topics, partitions, subscriptions_count, subscriptions, "t1", 5, + "t2", 0, "t2", 2, "t2", 4, "t2", 6, NULL, + /* consumer2 */ + "t1", 3, "t2", 1, "t2", 3, "t2", 5, "t3", 0, NULL, + /* consumer3 */ + "t1", 0, "t1", 1, "t1", 2, "t1", 4, "t3", 1, NULL); + + // Verify assignment is rack-aligned for lower replication factor where + // brokers have a subset of partitions + for (with_owned = 0; with_owned <= 1; with_owned++) { + setupRackAwareAssignment0( + rk, rkas, members, RD_ARRAYSIZE(members), 1, 3, + RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, RACKS_INITIAL, NULL, + !with_owned, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 3, "t2", 0, "t2", 2, "t2", 3, "t2", 6, NULL, + /* consumer2 */ + "t1", 4, "t2", 1, "t2", 4, "t2", 5, "t3", 0, NULL, + /* consumer3 */ + "t1", 0, "t1", 1, "t1", 2, "t1", 5, "t3", 1, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 4); + ut_destroy_metadata(metadata); + } + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + + + + for (with_owned = 0; with_owned <= 1; with_owned++) { + setupRackAwareAssignment0( + rk, rkas, members, RD_ARRAYSIZE(members), 2, 3, + RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, RACKS_INITIAL, NULL, + !with_owned, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 3, "t2", 0, "t2", 2, "t2", 5, "t2", 6, NULL, + /* consumer2 */ + "t1", 0, "t2", 1, "t2", 3, "t2", 4, "t3", 0, NULL, + /* consumer3 */ + "t1", 1, "t1", 2, "t1", 4, "t1", 5, "t3", 1, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 0); + 
ut_destroy_metadata(metadata); + } + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + + /* One consumer on a rack with no partitions. We allocate with + * misaligned rack to this consumer to maintain balance. */ + for (with_owned = 0; with_owned <= 1; with_owned++) { + setupRackAwareAssignment0( + rk, rkas, members, RD_ARRAYSIZE(members), 3, 2, + RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, RACKS_INITIAL, NULL, + !with_owned, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 5, "t2", 0, "t2", 2, "t2", 4, "t2", 6, NULL, + /* consumer2 */ + "t1", 3, "t2", 1, "t2", 3, "t2", 5, "t3", 0, NULL, + /* consumer3 */ + "t1", 0, "t1", 1, "t1", 2, "t1", 4, "t3", 1, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 5); + ut_destroy_metadata(metadata); + } + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + + /* Verify that rack-awareness is improved if already owned partitions + * are misaligned. */ + owned = ut_create_topic_partition_lists( + 3, + /* consumer1 */ + "t1", 0, "t1", 1, "t1", 2, "t1", 3, "t1", 4, NULL, + /* consumer2 */ + "t1", 5, "t2", 0, "t2", 1, "t2", 2, "t2", 3, NULL, + /* consumer3 */ + "t2", 4, "t2", 5, "t2", 6, "t3", 0, "t3", 1, NULL); + + setupRackAwareAssignment0(rk, rkas, members, RD_ARRAYSIZE(members), 1, + 3, RD_ARRAYSIZE(topics), topics, partitions, + subscriptions_count, subscriptions, + RACKS_INITIAL, owned, rd_true, &metadata); + verifyMultipleAssignment( + members, RD_ARRAYSIZE(members), + /* consumer1 */ + "t1", 3, "t2", 0, "t2", 2, "t2", 3, "t2", 6, NULL, + /* consumer2 */ + "t1", 4, "t2", 1, "t2", 4, "t2", 5, "t3", 0, NULL, + /* consumer3 */ + "t1", 0, "t1", 1, "t1", 2, "t1", 5, "t3", 1, NULL); + verifyNumPartitionsWithRackMismatch(metadata, members, + RD_ARRAYSIZE(members), 4); + ut_destroy_metadata(metadata); + + for (i = 0; i < RD_ARRAYSIZE(members); i++) + rd_kafka_group_member_clear(&members[i]); + for (i = 0; i < member_cnt; i++) + rd_kafka_topic_partition_list_destroy(owned[i]); + rd_free(owned); + + /* One of the Java tests is skipped here, which tests if the rack-aware + * logic assigns the same partitions as non-rack aware logic. This is + * because we don't have a way to force rack-aware logic like the Java + * assignor. 
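Why the replication factor doubles as the rack-awareness switch in these tests: setupRackAwareAssignment0() above computes num_brokers = replication_factor * num_broker_racks, so whether every rack holds a replica of every partition falls out of those two numbers. A sketch of that arithmetic, assuming the mock spreads brokers round-robin across racks (that mapping is an assumption about the mock metadata, not documented in this hunk):

/* Sketch: with broker b on rack (b % num_broker_racks), a replication
 * factor equal to num_broker_racks puts a replica of every partition on
 * every rack, so rack-aware placement cannot differ from rack-unaware
 * placement; a smaller replication factor leaves some racks without a
 * local replica, which is what the rack-alignment assertions exercise. */
static int ut_every_rack_has_replica_sketch(int replication_factor,
                                            int num_broker_racks) {
        return replication_factor >= num_broker_racks;
}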
*/ + RD_UT_PASS(); +} + static int rd_kafka_sticky_assignor_unittest(void) { rd_kafka_conf_t *conf; rd_kafka_t *rk; int fails = 0; char errstr[256]; rd_kafka_assignor_t *rkas; - static int (*tests[])(rd_kafka_t *, const rd_kafka_assignor_t *) = { + static int (*tests[])( + rd_kafka_t *, const rd_kafka_assignor_t *, + rd_kafka_assignor_ut_rack_config_t parametrization) = { ut_testOneConsumerNoTopic, ut_testOneConsumerNonexistentTopic, ut_testOneConsumerOneTopic, @@ -3365,14 +4696,26 @@ static int rd_kafka_sticky_assignor_unittest(void) { ut_testLargeAssignmentWithMultipleConsumersLeaving, ut_testNewSubscription, ut_testMoveExistingAssignments, + ut_testMoveExistingAssignments_j, ut_testStickiness, + ut_testStickiness_j, ut_testStickiness2, ut_testAssignmentUpdatedForDeletedTopic, ut_testNoExceptionThrownWhenOnlySubscribedTopicDeleted, ut_testConflictingPreviousAssignments, + ut_testAllConsumersReachExpectedQuotaAndAreConsideredFilled, + ut_testOwnedPartitionsAreInvalidatedForConsumerWithStaleGeneration, + ut_testOwnedPartitionsAreInvalidatedForConsumerWithNoGeneration, + ut_testPartitionsTransferringOwnershipIncludeThePartitionClaimedByMultipleConsumersInSameGeneration, + ut_testPartitionsTransferringOwnershipIncludeThePartitionClaimedByMultipleConsumersInSameGeneration2, + ut_testEnsurePartitionsAssignedToHighestGeneration, + ut_testNoReassignmentOnCurrentMembers, + ut_testOwnedPartitionsAreInvalidatedForConsumerWithMultipleGeneration, + ut_testRackAwareAssignmentWithUniformSubscription, + ut_testRackAwareAssignmentWithNonEqualSubscription, NULL, }; - int i; + size_t i; conf = rd_kafka_conf_new(); @@ -3392,13 +4735,25 @@ static int rd_kafka_sticky_assignor_unittest(void) { rkas = rd_kafka_assignor_find(rk, "cooperative-sticky"); RD_UT_ASSERT(rkas, "sticky assignor not found"); + for (i = 0; i < RD_ARRAY_SIZE(ALL_RACKS) - 1; i++) { + char c = 'a' + i; + ALL_RACKS[i] = rd_kafkap_str_new(&c, 1); + } + ALL_RACKS[i] = NULL; + for (i = 0; tests[i]; i++) { rd_ts_t ts = rd_clock(); - int r; + int r = 0; + rd_kafka_assignor_ut_rack_config_t j; - RD_UT_SAY("[ Test #%d ]", i); - r = tests[i](rk, rkas); - RD_UT_SAY("[ Test #%d ran for %.3fms ]", i, + RD_UT_SAY("[ Test #%" PRIusz " ]", i); + for (j = RD_KAFKA_RANGE_ASSIGNOR_UT_NO_BROKER_RACK; + j != RD_KAFKA_RANGE_ASSIGNOR_UT_CONFIG_CNT; j++) { + RD_UT_SAY("[ Test #%" PRIusz ", RackConfig = %d ]", i, + j); + r += tests[i](rk, rkas, j); + } + RD_UT_SAY("[ Test #%" PRIusz " ran for %.3fms ]", i, (double)(rd_clock() - ts) / 1000.0); RD_UT_ASSERT(!r, "^ failed"); @@ -3406,6 +4761,10 @@ static int rd_kafka_sticky_assignor_unittest(void) { fails += r; } + for (i = 0; i < RD_ARRAY_SIZE(ALL_RACKS) - 1; i++) { + rd_kafkap_str_destroy(ALL_RACKS[i]); + } + rd_kafka_destroy(rk); return fails; diff --git a/src/third_party/librdkafka/dist/src/rdkafka_subscription.c b/src/third_party/librdkafka/dist/src/rdkafka_subscription.c index 08058935876..46ab544ee20 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_subscription.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_subscription.c @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_telemetry.c b/src/third_party/librdkafka/dist/src/rdkafka_telemetry.c new file mode 100644 index 00000000000..60ba0ff886e --- /dev/null +++ b/src/third_party/librdkafka/dist/src/rdkafka_telemetry.c @@ -0,0 +1,762 @@ +/* + * librdkafka - Apache Kafka C library + * + * Copyright (c) 2023, Confluent Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "rdkafka_telemetry.h" +#include "rdkafka_msgset.h" +#include "rdkafka_telemetry_encode.h" +#include "rdkafka_request.h" +#include "nanopb/pb.h" +#include "rdkafka_lz4.h" +#include "snappy.h" +#include "rdunittest.h" + +#if WITH_ZSTD +#include "rdkafka_zstd.h" +#endif + + +#define RD_KAFKA_TELEMETRY_PUSH_JITTER 20 + +/** + * @brief Filters broker by availability of GetTelemetrySubscription. + * + * @return 0 if GetTelemetrySubscription is supported, 1 otherwise. + * + * @locks rd_kafka_broker_lock() + */ +static int +rd_kafka_filter_broker_by_GetTelemetrySubscription(rd_kafka_broker_t *rkb, + void *opaque) { + int features; + if (rd_kafka_broker_ApiVersion_supported0( + rkb, RD_KAFKAP_GetTelemetrySubscriptions, 0, 0, &features, + rd_false) != -1) + return 0; + return 1; +} + +/** + * @brief Returns the preferred metrics broker or NULL if unavailable. + * + * @locks none + * @locks_acquired rk_telemetry.lock, rd_kafka_wrlock() + * @locality main thread + */ +static rd_kafka_broker_t *rd_kafka_get_preferred_broker(rd_kafka_t *rk) { + rd_kafka_broker_t *rkb = NULL; + + mtx_lock(&rk->rk_telemetry.lock); + if (rk->rk_telemetry.preferred_broker) + rkb = rk->rk_telemetry.preferred_broker; + else { + /* If there is no preferred broker, that means that our previous + * one failed. Iterate through all available brokers to find + * one. */ + rd_kafka_wrlock(rk); + rkb = rd_kafka_broker_random_up( + rk, rd_kafka_filter_broker_by_GetTelemetrySubscription, + NULL); + rd_kafka_wrunlock(rk); + + /* No need to increase refcnt as broker_random_up does it + * already. */ + rk->rk_telemetry.preferred_broker = rkb; + + rd_kafka_dbg(rk, TELEMETRY, "SETBROKER", + "Lost preferred broker, switching to new " + "preferred broker %" PRId32 "\n", + rkb ? 
rd_kafka_broker_id(rkb) : -1); + } + mtx_unlock(&rk->rk_telemetry.lock); + + return rkb; +} + +/** + * @brief Cleans up the rk.rk_telemetry struct and frees any allocations. + * + * @param clear_control_flow_fields This determines if the control flow fields + * need to be cleared. This should only be set + * to true if the rk is terminating. + * @locality main thread + * @locks none + * @locks_acquired rk_telemetry.lock + */ +void rd_kafka_telemetry_clear(rd_kafka_t *rk, + rd_bool_t clear_control_flow_fields) { + if (clear_control_flow_fields) { + mtx_lock(&rk->rk_telemetry.lock); + if (rk->rk_telemetry.preferred_broker) { + rd_kafka_broker_destroy( + rk->rk_telemetry.preferred_broker); + rk->rk_telemetry.preferred_broker = NULL; + } + mtx_unlock(&rk->rk_telemetry.lock); + mtx_destroy(&rk->rk_telemetry.lock); + cnd_destroy(&rk->rk_telemetry.termination_cnd); + } + + if (rk->rk_telemetry.accepted_compression_types_cnt) { + rd_free(rk->rk_telemetry.accepted_compression_types); + rk->rk_telemetry.accepted_compression_types = NULL; + rk->rk_telemetry.accepted_compression_types_cnt = 0; + } + + if (rk->rk_telemetry.requested_metrics_cnt) { + size_t i; + for (i = 0; i < rk->rk_telemetry.requested_metrics_cnt; i++) + rd_free(rk->rk_telemetry.requested_metrics[i]); + rd_free(rk->rk_telemetry.requested_metrics); + rd_free(rk->rk_telemetry.matched_metrics); + rk->rk_telemetry.requested_metrics = NULL; + rk->rk_telemetry.requested_metrics_cnt = 0; + rk->rk_telemetry.matched_metrics = NULL; + rk->rk_telemetry.matched_metrics_cnt = 0; + } + rk->rk_telemetry.telemetry_max_bytes = 0; +} + +/** + * @brief Sets the telemetry state to TERMINATED and signals the conditional + * variable + * + * @locality main thread + * @locks none + * @locks_acquired rk_telemetry.lock + */ +static void rd_kafka_telemetry_set_terminated(rd_kafka_t *rk) { + rd_dassert(thrd_is_current(rk->rk_thread)); + + rd_kafka_dbg(rk, TELEMETRY, "TERM", + "Setting state to TERMINATED and signalling"); + + rk->rk_telemetry.state = RD_KAFKA_TELEMETRY_TERMINATED; + rd_kafka_timer_stop(&rk->rk_timers, &rk->rk_telemetry.request_timer, + 1 /*lock*/); + mtx_lock(&rk->rk_telemetry.lock); + cnd_signal(&rk->rk_telemetry.termination_cnd); + mtx_unlock(&rk->rk_telemetry.lock); +} + +static void update_matched_metrics(rd_kafka_t *rk, size_t j) { + rk->rk_telemetry.matched_metrics_cnt++; + rk->rk_telemetry.matched_metrics = + rd_realloc(rk->rk_telemetry.matched_metrics, + sizeof(int) * rk->rk_telemetry.matched_metrics_cnt); + rk->rk_telemetry + .matched_metrics[rk->rk_telemetry.matched_metrics_cnt - 1] = j; +} + +static void rd_kafka_match_requested_metrics(rd_kafka_t *rk) { + size_t metrics_cnt = RD_KAFKA_TELEMETRY_METRIC_CNT(rk), i; + rd_bool_t is_metric_included[RD_MAX( + (int)RD_KAFKA_TELEMETRY_PRODUCER_METRIC__CNT, + (int)RD_KAFKA_TELEMETRY_CONSUMER_METRIC__CNT)] = {0}; + const rd_kafka_telemetry_metric_info_t *info = + RD_KAFKA_TELEMETRY_METRIC_INFO(rk); + + if (rk->rk_telemetry.requested_metrics_cnt == 1 && + !strcmp(rk->rk_telemetry.requested_metrics[0], + RD_KAFKA_TELEMETRY_METRICS_ALL_METRICS_SUBSCRIPTION)) { + size_t j; + rd_kafka_dbg(rk, TELEMETRY, "GETSUBSCRIPTIONS", + "All metrics subscribed"); + + for (j = 0; j < metrics_cnt; j++) + update_matched_metrics(rk, j); + return; + } + + for (i = 0; i < rk->rk_telemetry.requested_metrics_cnt; i++) { + size_t name_len = strlen(rk->rk_telemetry.requested_metrics[i]), + j; + + for (j = 0; j < metrics_cnt; j++) { + if (is_metric_included[j]) + continue; + + /* Prefix matching the requested metrics 
with the + * available metrics. */ + char full_metric_name + [RD_KAFKA_TELEMETRY_METRIC_NAME_MAX_LEN]; + rd_snprintf(full_metric_name, sizeof(full_metric_name), + "%s%s", RD_KAFKA_TELEMETRY_METRIC_PREFIX, + info[j].name); + bool name_matches = + strncmp(full_metric_name, + rk->rk_telemetry.requested_metrics[i], + name_len) == 0; + + if (name_matches) { + update_matched_metrics(rk, j); + is_metric_included[j] = rd_true; + } + } + } + + rd_kafka_dbg(rk, TELEMETRY, "GETSUBSCRIPTIONS", + "Matched metrics: %" PRIusz, + rk->rk_telemetry.matched_metrics_cnt); +} + +/** + * @brief Enqueues a GetTelemetrySubscriptionsRequest. + * + * @locks none + * @locks_acquired none + * @locality main thread + */ +static void rd_kafka_send_get_telemetry_subscriptions(rd_kafka_t *rk, + rd_kafka_broker_t *rkb) { + /* Clear out the telemetry struct, free anything that is malloc'd. */ + rd_kafka_telemetry_clear(rk, rd_false /* clear_control_flow_fields */); + + /* Enqueue on broker transmit queue. + * The preferred broker might change in the meanwhile but let it fail. + */ + rd_kafka_dbg(rk, TELEMETRY, "GETSUBSCRIPTIONS", + "Sending GetTelemetryRequest"); + rd_kafka_GetTelemetrySubscriptionsRequest( + rkb, NULL, 0, RD_KAFKA_REPLYQ(rk->rk_ops, 0), + rd_kafka_handle_GetTelemetrySubscriptions, NULL); + + /* Change state */ + rk->rk_telemetry.state = RD_KAFKA_TELEMETRY_GET_SUBSCRIPTIONS_SENT; +} + +/** + * @brief Compresses the telemetry payload using the available compression + * types. + * + * @param rk The rdkafka instance. + * @param rkb The broker to which the payload is being sent. + * @param payload The payload to be compressed. + * @param compressed_payload The compressed payload. + * @param compressed_payload_size The size of the compressed payload. + * + * @return The compression type used. + * + * @locks none + * @locks_acquired none + * @locality main thread + */ +static rd_kafka_compression_t +rd_kafka_push_telemetry_payload_compress(rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + rd_buf_t *payload, + void **compressed_payload, + size_t *compressed_payload_size) { + rd_kafka_compression_t compression_used = RD_KAFKA_COMPRESSION_NONE; + rd_slice_t payload_slice; + size_t i; + rd_kafka_resp_err_t r = RD_KAFKA_RESP_ERR_NO_ERROR; + + if (payload->rbuf_len == 0) { + /* We can only initialize the slice to compress + * if not empty. */ + rd_kafka_dbg(rk, TELEMETRY, "PUSH", + "Empty payload. " + "Sending uncompressed payload"); + + /* It's not important the payload isn't actually a segment + * inside the buffer, as size is 0, we can send any allocated + * memory here, but we chose the buffer because it's + * freed like the other COMPRESSION_NONE case, without + * memory leaks. */ + *compressed_payload = payload; + *compressed_payload_size = 0; + return RD_KAFKA_COMPRESSION_NONE; + } + + rd_slice_init_full(&payload_slice, payload); + for (i = 0; i < rk->rk_telemetry.accepted_compression_types_cnt; i++) { + rd_kafka_compression_t compression_type = + rk->rk_telemetry.accepted_compression_types[i]; + switch (compression_type) { +#if WITH_ZLIB + case RD_KAFKA_COMPRESSION_GZIP: + /* TODO: Using 0 for compression level for now. */ + r = rd_kafka_gzip_compress(rkb, 0, &payload_slice, + compressed_payload, + compressed_payload_size); + compression_used = RD_KAFKA_COMPRESSION_GZIP; + break; +#endif + case RD_KAFKA_COMPRESSION_KLZ4: + /* TODO: Using 0 for compression level for now. 
+/**
+ * @brief Compresses the telemetry payload using the available compression
+ *        types.
+ *
+ * @param rk The rdkafka instance.
+ * @param rkb The broker to which the payload is being sent.
+ * @param payload The payload to be compressed.
+ * @param compressed_payload The compressed payload.
+ * @param compressed_payload_size The size of the compressed payload.
+ *
+ * @return The compression type used.
+ *
+ * @locks none
+ * @locks_acquired none
+ * @locality main thread
+ */
+static rd_kafka_compression_t
+rd_kafka_push_telemetry_payload_compress(rd_kafka_t *rk,
+                                         rd_kafka_broker_t *rkb,
+                                         rd_buf_t *payload,
+                                         void **compressed_payload,
+                                         size_t *compressed_payload_size) {
+        rd_kafka_compression_t compression_used = RD_KAFKA_COMPRESSION_NONE;
+        rd_slice_t payload_slice;
+        size_t i;
+        rd_kafka_resp_err_t r = RD_KAFKA_RESP_ERR_NO_ERROR;
+
+        if (payload->rbuf_len == 0) {
+                /* We can only initialize the slice to compress
+                 * if not empty. */
+                rd_kafka_dbg(rk, TELEMETRY, "PUSH",
+                             "Empty payload. "
+                             "Sending uncompressed payload");
+
+                /* Since the size is 0 it doesn't matter that this pointer
+                 * isn't an actual payload segment: any allocated memory
+                 * would do, but the buffer itself is used because it is
+                 * freed the same way as the other COMPRESSION_NONE case,
+                 * without memory leaks. */
+                *compressed_payload = payload;
+                *compressed_payload_size = 0;
+                return RD_KAFKA_COMPRESSION_NONE;
+        }
+
+        rd_slice_init_full(&payload_slice, payload);
+        for (i = 0; i < rk->rk_telemetry.accepted_compression_types_cnt; i++) {
+                rd_kafka_compression_t compression_type =
+                    rk->rk_telemetry.accepted_compression_types[i];
+                switch (compression_type) {
+#if WITH_ZLIB
+                case RD_KAFKA_COMPRESSION_GZIP:
+                        /* TODO: Using 0 for compression level for now. */
+                        r = rd_kafka_gzip_compress(rkb, 0, &payload_slice,
+                                                   compressed_payload,
+                                                   compressed_payload_size);
+                        compression_used = RD_KAFKA_COMPRESSION_GZIP;
+                        break;
+#endif
+                case RD_KAFKA_COMPRESSION_KLZ4:
+                        /* TODO: Using 0 for compression level for now. */
+                        r = rd_kafka_lz4_compress(
+                            rkb, rd_true, 0, &payload_slice,
+                            compressed_payload, compressed_payload_size);
+                        compression_used = RD_KAFKA_COMPRESSION_KLZ4;
+                        break;
+#if WITH_ZSTD
+                case RD_KAFKA_COMPRESSION_ZSTD:
+                        /* TODO: Using 0 for compression level for now. */
+                        r = rd_kafka_zstd_compress(rkb, 0, &payload_slice,
+                                                   compressed_payload,
+                                                   compressed_payload_size);
+                        compression_used = RD_KAFKA_COMPRESSION_ZSTD;
+                        break;
+#endif
+#if WITH_SNAPPY
+                case RD_KAFKA_COMPRESSION_SNAPPY:
+                        r = rd_kafka_snappy_compress_slice(
+                            rkb, &payload_slice, compressed_payload,
+                            compressed_payload_size);
+                        compression_used = RD_KAFKA_COMPRESSION_SNAPPY;
+                        break;
+#endif
+                default:
+                        break;
+                }
+                if (compression_used != RD_KAFKA_COMPRESSION_NONE &&
+                    r == RD_KAFKA_RESP_ERR_NO_ERROR) {
+                        rd_kafka_dbg(
+                            rk, TELEMETRY, "PUSH",
+                            "Compressed payload of size %" PRIusz
+                            " to %" PRIusz " using compression type %s",
+                            payload->rbuf_size, *compressed_payload_size,
+                            rd_kafka_compression2str(compression_used));
+                        return compression_used;
+                }
+        }
+        if (compression_used != RD_KAFKA_COMPRESSION_NONE &&
+            r != RD_KAFKA_RESP_ERR_NO_ERROR) {
+                rd_kafka_dbg(rk, TELEMETRY, "PUSH",
+                             "Failed to compress payload with available "
+                             "compression types");
+        }
+        rd_kafka_dbg(rk, TELEMETRY, "PUSH", "Sending uncompressed payload");
+        *compressed_payload = payload->rbuf_wpos->seg_p;
+        *compressed_payload_size = payload->rbuf_wpos->seg_of;
+        return RD_KAFKA_COMPRESSION_NONE;
+}
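The selection loop above tries each broker-accepted codec in preference order and silently falls back to the uncompressed buffer when every attempt fails. A compilable sketch of that fallback pattern, with stubbed codecs standing in for the real gzip/lz4/zstd/snappy calls (the function-pointer table and names are invented for illustration):

```c
#include <stddef.h>
#include <stdio.h>

typedef int (*compress_fn_t)(const void *in, size_t in_sz, void **out,
                             size_t *out_sz);

static int codec_unavailable(const void *in, size_t in_sz, void **out,
                             size_t *out_sz) {
        (void)in;
        (void)in_sz;
        (void)out;
        (void)out_sz;
        return -1; /* simulate a failing or compiled-out codec */
}

int main(void) {
        /* Broker-accepted codecs in preference order (both stubbed). */
        compress_fn_t accepted[] = {codec_unavailable, codec_unavailable};
        const char payload[]     = "telemetry-metrics";
        void *out                = NULL;
        size_t out_sz            = 0, i;

        for (i = 0; i < sizeof(accepted) / sizeof(accepted[0]); i++) {
                if (accepted[i](payload, sizeof(payload), &out, &out_sz) ==
                    0) {
                        printf("compressed with codec #%zu to %zu bytes\n", i,
                               out_sz);
                        return 0;
                }
        }
        /* No codec succeeded: fall back to the uncompressed payload. */
        printf("sending uncompressed (%zu bytes)\n", sizeof(payload));
        return 0;
}
```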
+/**
+ * @brief Enqueues a PushTelemetryRequest.
+ *
+ * @locks none
+ * @locks_acquired none
+ * @locality main thread
+ */
+static void rd_kafka_send_push_telemetry(rd_kafka_t *rk,
+                                         rd_kafka_broker_t *rkb,
+                                         rd_bool_t terminating) {
+
+        rd_buf_t *metrics_payload = rd_kafka_telemetry_encode_metrics(rk);
+        size_t compressed_metrics_payload_size = 0;
+        void *compressed_metrics_payload = NULL;
+        rd_kafka_compression_t compression_used = RD_KAFKA_COMPRESSION_NONE;
+        if (metrics_payload) {
+                compression_used = rd_kafka_push_telemetry_payload_compress(
+                    rk, rkb, metrics_payload, &compressed_metrics_payload,
+                    &compressed_metrics_payload_size);
+                if (compressed_metrics_payload_size >
+                    (size_t)rk->rk_telemetry.telemetry_max_bytes) {
+                        rd_kafka_log(rk, LOG_WARNING, "TELEMETRY",
+                                     "Metrics payload size %" PRIusz
+                                     " exceeds telemetry_max_bytes %" PRId32
+                                     " specified by the broker.",
+                                     compressed_metrics_payload_size,
+                                     rk->rk_telemetry.telemetry_max_bytes);
+                }
+
+                rd_kafka_dbg(
+                    rk, TELEMETRY, "PUSH",
+                    "Sending PushTelemetryRequest with terminating = %s",
+                    RD_STR_ToF(terminating));
+                rd_kafka_PushTelemetryRequest(
+                    rkb, &rk->rk_telemetry.client_instance_id,
+                    rk->rk_telemetry.subscription_id, terminating,
+                    compression_used, compressed_metrics_payload,
+                    compressed_metrics_payload_size, NULL, 0,
+                    RD_KAFKA_REPLYQ(rk->rk_ops, 0),
+                    rd_kafka_handle_PushTelemetry, NULL);
+        } else {
+                rd_kafka_log(rk, LOG_WARNING, "PUSH",
+                             "Telemetry metrics encode error, not sending "
+                             "metrics");
+        }
+
+        if (metrics_payload)
+                rd_buf_destroy_free(metrics_payload);
+        if (compression_used != RD_KAFKA_COMPRESSION_NONE)
+                rd_free(compressed_metrics_payload);
+
+        rk->rk_telemetry.state = terminating
+                                     ? RD_KAFKA_TELEMETRY_TERMINATING_PUSH_SENT
+                                     : RD_KAFKA_TELEMETRY_PUSH_SENT;
+}
+
+/**
+ * @brief Progress the telemetry state machine.
+ *
+ * @locks none
+ * @locks_acquired rd_kafka_rdlock()
+ * @locality main thread
+ */
+static void rd_kafka_telemetry_fsm(rd_kafka_t *rk) {
+        rd_kafka_broker_t *preferred_broker = NULL;
+
+        rd_dassert(rk);
+        rd_dassert(thrd_is_current(rk->rk_thread));
+
+        switch (rk->rk_telemetry.state) {
+        case RD_KAFKA_TELEMETRY_AWAIT_BROKER:
+                rd_dassert(!*"Should never be awaiting a broker when the telemetry fsm is called.");
+                break;
+
+        case RD_KAFKA_TELEMETRY_GET_SUBSCRIPTIONS_SCHEDULED:
+                preferred_broker = rd_kafka_get_preferred_broker(rk);
+                if (!preferred_broker) {
+                        rk->rk_telemetry.state =
+                            RD_KAFKA_TELEMETRY_AWAIT_BROKER;
+                        break;
+                }
+                rd_kafka_send_get_telemetry_subscriptions(rk,
+                                                          preferred_broker);
+                break;
+
+        case RD_KAFKA_TELEMETRY_PUSH_SCHEDULED:
+                preferred_broker = rd_kafka_get_preferred_broker(rk);
+                if (!preferred_broker) {
+                        rk->rk_telemetry.state =
+                            RD_KAFKA_TELEMETRY_AWAIT_BROKER;
+                        break;
+                }
+                rd_kafka_send_push_telemetry(rk, preferred_broker, rd_false);
+                break;
+
+        case RD_KAFKA_TELEMETRY_PUSH_SENT:
+        case RD_KAFKA_TELEMETRY_GET_SUBSCRIPTIONS_SENT:
+        case RD_KAFKA_TELEMETRY_TERMINATING_PUSH_SENT:
+                rd_dassert(!*"Should never be awaiting response when the telemetry fsm is called.");
+                break;
+
+        case RD_KAFKA_TELEMETRY_TERMINATING_PUSH_SCHEDULED:
+                preferred_broker = rd_kafka_get_preferred_broker(rk);
+                if (!preferred_broker) {
+                        /* If there's no preferred broker, set state to
+                         * terminated immediately to stop the app thread from
+                         * waiting indefinitely. */
+                        rd_kafka_telemetry_set_terminated(rk);
+                        break;
+                }
+                rd_kafka_send_push_telemetry(rk, preferred_broker, rd_true);
+                break;
+
+        case RD_KAFKA_TELEMETRY_TERMINATED:
+                rd_dassert(!*"Should not be terminated when the telemetry fsm is called.");
+                break;
+
+        default:
+                rd_assert(!*"Unknown state");
+        }
+}
+
+/**
+ * @brief Callback for FSM timer.
+ *
+ * @locks none
+ * @locks_acquired none
+ * @locality main thread
+ */
+void rd_kafka_telemetry_fsm_tmr_cb(rd_kafka_timers_t *rkts, void *rk) {
+        rd_kafka_telemetry_fsm(rk);
+}
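+
+/*
+ * Happy-path state flow, as driven by the FSM above and the response
+ * handlers below:
+ *   AWAIT_BROKER -> GET_SUBSCRIPTIONS_SCHEDULED -> GET_SUBSCRIPTIONS_SENT
+ *   -> PUSH_SCHEDULED -> PUSH_SENT -> PUSH_SCHEDULED -> ... (periodic)
+ *   -> TERMINATING_PUSH_SCHEDULED -> TERMINATING_PUSH_SENT -> TERMINATED
+ */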
+/**
+ * @brief Handles parsed GetTelemetrySubscriptions response.
+ *
+ * @locks none
+ * @locks_acquired rd_kafka_rdlock()
+ * @locality main thread
+ */
+void rd_kafka_handle_get_telemetry_subscriptions(rd_kafka_t *rk,
+                                                 rd_kafka_resp_err_t err) {
+        rd_ts_t next_scheduled;
+        double jitter_multiplier =
+            rd_jitter(100 - RD_KAFKA_TELEMETRY_PUSH_JITTER,
+                      100 + RD_KAFKA_TELEMETRY_PUSH_JITTER) /
+            100.0;
+        rd_ts_t now_ns = rd_uclock() * 1000;
+        rd_kafka_broker_t *rkb = NULL;
+
+        if (err != RD_KAFKA_RESP_ERR_NO_ERROR) {
+                rd_kafka_dbg(rk, TELEMETRY, "GETSUBSCRIPTIONS",
+                             "GetTelemetrySubscriptionsRequest failed: %s",
+                             rd_kafka_err2str(err));
+                if (rk->rk_telemetry.push_interval_ms == 0) {
+                        rk->rk_telemetry.push_interval_ms =
+                            30000; /* Fallback: 30s */
+                }
+        }
+
+        if (err == RD_KAFKA_RESP_ERR_NO_ERROR &&
+            rk->rk_telemetry.requested_metrics_cnt) {
+                rd_kafka_match_requested_metrics(rk);
+
+                /* Some metrics are requested. Start the timer accordingly. */
+                next_scheduled = (int)(jitter_multiplier * 1000 *
+                                       rk->rk_telemetry.push_interval_ms);
+
+                rk->rk_telemetry.state = RD_KAFKA_TELEMETRY_PUSH_SCHEDULED;
+
+                /* Set for the first push */
+                if (rk->rk_telemetry.rk_historic_c.ts_start == 0) {
+                        rk->rk_telemetry.rk_historic_c.ts_start = now_ns;
+                        rk->rk_telemetry.rk_historic_c.ts_last = now_ns;
+                        rd_kafka_rdlock(rk);
+                        TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) {
+                                rkb->rkb_telemetry.rkb_historic_c.connects =
+                                    rd_atomic32_get(&rkb->rkb_c.connects);
+                        }
+                        rd_kafka_rdunlock(rk);
+                }
+
+        } else {
+                /* No metrics requested, or we're in error. */
+                next_scheduled = rk->rk_telemetry.push_interval_ms * 1000;
+                rk->rk_telemetry.state =
+                    RD_KAFKA_TELEMETRY_GET_SUBSCRIPTIONS_SCHEDULED;
+        }
+
+        rd_kafka_dbg(rk, TELEMETRY, "GETSUBSCRIPTIONS",
+                     "Handled GetTelemetrySubscriptions, scheduling FSM after "
+                     "%" PRId64
+                     " microseconds, state = %s, err = %s, metrics = %" PRIusz,
+                     next_scheduled,
+                     rd_kafka_telemetry_state2str(rk->rk_telemetry.state),
+                     rd_kafka_err2str(err),
+                     rk->rk_telemetry.requested_metrics_cnt);
+
+        rd_kafka_timer_start_oneshot(
+            &rk->rk_timers, &rk->rk_telemetry.request_timer, rd_false,
+            next_scheduled, rd_kafka_telemetry_fsm_tmr_cb, rk);
+}
+
+void rd_kafka_handle_push_telemetry(rd_kafka_t *rk, rd_kafka_resp_err_t err) {
+
+        /* We only make a best-effort attempt to push telemetry while
+         * terminating, and don't care about any errors. */
+        if (rk->rk_telemetry.state ==
+            RD_KAFKA_TELEMETRY_TERMINATING_PUSH_SENT) {
+                rd_kafka_telemetry_set_terminated(rk);
+                return;
+        }
+
+        /* There's a possibility that we sent a PushTelemetryRequest, and
+         * scheduled a termination before getting the response. In that case,
+         * we will enter this method in the TERMINATED state when/if we get a
+         * response, and we should not take any action. */
+        if (rk->rk_telemetry.state != RD_KAFKA_TELEMETRY_PUSH_SENT)
+                return;
+
+        if (err == RD_KAFKA_RESP_ERR_NO_ERROR) {
+                rd_kafka_dbg(rk, TELEMETRY, "PUSH",
+                             "PushTelemetryRequest succeeded");
+                rk->rk_telemetry.state = RD_KAFKA_TELEMETRY_PUSH_SCHEDULED;
+                rd_kafka_timer_start_oneshot(
+                    &rk->rk_timers, &rk->rk_telemetry.request_timer, rd_false,
+                    rk->rk_telemetry.push_interval_ms * 1000,
+                    rd_kafka_telemetry_fsm_tmr_cb, (void *)rk);
+        } else { /* error */
+                rd_kafka_dbg(rk, TELEMETRY, "PUSH",
+                             "PushTelemetryRequest failed: %s",
+                             rd_kafka_err2str(err));
+                /* Non-retriable errors */
+                if (err == RD_KAFKA_RESP_ERR_INVALID_REQUEST ||
+                    err == RD_KAFKA_RESP_ERR_INVALID_RECORD) {
+                        rd_kafka_log(
+                            rk, LOG_WARNING, "TELEMETRY",
+                            "PushTelemetryRequest failed with non-retriable "
+                            "error: %s. Stopping telemetry.",
+                            rd_kafka_err2str(err));
+                        rd_kafka_telemetry_set_terminated(rk);
+                        return;
+                }
+
+                if (err == RD_KAFKA_RESP_ERR_TELEMETRY_TOO_LARGE) {
+                        rd_kafka_log(
+                            rk, LOG_WARNING, "TELEMETRY",
+                            "PushTelemetryRequest failed because the payload "
+                            "size was too large: %s. Continuing telemetry.",
+                            rd_kafka_err2str(err));
+                        rk->rk_telemetry.state =
+                            RD_KAFKA_TELEMETRY_PUSH_SCHEDULED;
+                        rd_kafka_timer_start_oneshot(
+                            &rk->rk_timers, &rk->rk_telemetry.request_timer,
+                            rd_false, rk->rk_telemetry.push_interval_ms * 1000,
+                            rd_kafka_telemetry_fsm_tmr_cb, (void *)rk);
+                        return;
+                }
+
+                rd_ts_t next_scheduled =
+                    err == RD_KAFKA_RESP_ERR_UNKNOWN_SUBSCRIPTION_ID
+                        ?
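+                        /* UNKNOWN_SUBSCRIPTION_ID: the broker no longer
+                         * recognizes this client instance, so refetch the
+                         * subscriptions immediately (delay 0) instead of
+                         * waiting a full push interval. */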
0 + : rk->rk_telemetry.push_interval_ms * 1000; + + rk->rk_telemetry.state = + RD_KAFKA_TELEMETRY_GET_SUBSCRIPTIONS_SCHEDULED; + rd_kafka_timer_start_oneshot( + &rk->rk_timers, &rk->rk_telemetry.request_timer, rd_false, + next_scheduled, rd_kafka_telemetry_fsm_tmr_cb, (void *)rk); + } +} + +/** + * @brief This method starts the termination for telemetry and awaits + * completion. + * + * @locks none + * @locks_acquired rk_telemetry.lock + * @locality app thread (normal case) or the main thread (when terminated + * during creation). + */ +void rd_kafka_telemetry_await_termination(rd_kafka_t *rk) { + rd_kafka_op_t *rko; + + /* In the case where we have a termination during creation, we can't + * send any telemetry. */ + if (thrd_is_current(rk->rk_thread) || + !rk->rk_conf.enable_metrics_push) { + rd_kafka_telemetry_set_terminated(rk); + return; + } + + mtx_lock(&rk->rk_telemetry.lock); + rko = rd_kafka_op_new(RD_KAFKA_OP_TERMINATE_TELEMETRY); + rko->rko_rk = rk; + rd_kafka_q_enq(rk->rk_ops, rko); + + /* Await termination sequence completion. */ + rd_kafka_dbg(rk, TELEMETRY, "TERM", + "Awaiting termination of telemetry."); + cnd_timedwait_ms(&rk->rk_telemetry.termination_cnd, + &rk->rk_telemetry.lock, + 1000 /* timeout for waiting */); + mtx_unlock(&rk->rk_telemetry.lock); + rd_kafka_dbg(rk, TELEMETRY, "TERM", + "Ended waiting for termination of telemetry."); +} + +/** + * @brief Send a final push request before terminating. + * + * @locks none + * @locks_acquired none + * @locality main thread + * @note This method is on a best-effort basis. + */ +void rd_kafka_telemetry_schedule_termination(rd_kafka_t *rk) { + rd_kafka_dbg( + rk, TELEMETRY, "TERM", + "Starting rd_kafka_telemetry_schedule_termination in state %s", + rd_kafka_telemetry_state2str(rk->rk_telemetry.state)); + + if (rk->rk_telemetry.state != RD_KAFKA_TELEMETRY_PUSH_SCHEDULED) { + rd_kafka_telemetry_set_terminated(rk); + return; + } + + rk->rk_telemetry.state = RD_KAFKA_TELEMETRY_TERMINATING_PUSH_SCHEDULED; + + rd_kafka_dbg(rk, TELEMETRY, "TERM", "Sending final request for Push"); + rd_kafka_timer_override_once( + &rk->rk_timers, &rk->rk_telemetry.request_timer, 0 /* immediate */); +} + + +/** + * @brief Sets telemetry broker if we are in AWAIT_BROKER state. + * + * @locks none + * @locks_acquired rk_telemetry.lock + * @locality main thread + */ +void rd_kafka_set_telemetry_broker_maybe(rd_kafka_t *rk, + rd_kafka_broker_t *rkb) { + rd_dassert(thrd_is_current(rk->rk_thread)); + + /* The op triggering this method is scheduled by brokers without knowing + * if a preferred broker is already set. If it is set, this method is a + * no-op. */ + if (rk->rk_telemetry.state != RD_KAFKA_TELEMETRY_AWAIT_BROKER) + return; + + mtx_lock(&rk->rk_telemetry.lock); + + if (rk->rk_telemetry.preferred_broker) { + mtx_unlock(&rk->rk_telemetry.lock); + return; + } + + rd_kafka_broker_keep(rkb); + rk->rk_telemetry.preferred_broker = rkb; + + mtx_unlock(&rk->rk_telemetry.lock); + + rd_kafka_dbg(rk, TELEMETRY, "SETBROKER", + "Setting telemetry broker to %s\n", rkb->rkb_name); + + rk->rk_telemetry.state = RD_KAFKA_TELEMETRY_GET_SUBSCRIPTIONS_SCHEDULED; + + rd_kafka_timer_start_oneshot( + &rk->rk_timers, &rk->rk_telemetry.request_timer, rd_false, + 0 /* immediate */, rd_kafka_telemetry_fsm_tmr_cb, (void *)rk); +} + +/** + * @brief Overlapping prefixes should not match the metrics + * multiple times. 
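+ *        E.g. the three requested prefixes used below ("org", "org.apache",
+ *        "org.apache.kafka") each match every producer metric, but every
+ *        metric must still be counted exactly once.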
+ */ +int unit_test_telemetry_match_requested_metrics_no_duplicates(void) { + rd_kafka_t *rk = rd_kafka_new(RD_KAFKA_PRODUCER, NULL, NULL, 0); + rk->rk_telemetry.requested_metrics_cnt = 3; + rk->rk_telemetry.requested_metrics = + rd_calloc(rk->rk_telemetry.requested_metrics_cnt, sizeof(char *)); + rk->rk_telemetry.requested_metrics[0] = rd_strdup("org"); + rk->rk_telemetry.requested_metrics[1] = rd_strdup("org.apache"); + rk->rk_telemetry.requested_metrics[2] = rd_strdup("org.apache.kafka"); + rd_kafka_match_requested_metrics(rk); + + RD_UT_ASSERT(rk->rk_telemetry.matched_metrics_cnt == + RD_KAFKA_TELEMETRY_PRODUCER_METRIC__CNT, + "Expected %d matched metrics, got %" PRIusz, + RD_KAFKA_TELEMETRY_PRODUCER_METRIC__CNT, + rk->rk_telemetry.matched_metrics_cnt); + rd_kafka_destroy(rk); + return 0; +} + + +int unittest_telemetry(void) { + int fails = 0; + fails += unit_test_telemetry_match_requested_metrics_no_duplicates(); + return fails; +} diff --git a/src/third_party/librdkafka/dist/src/rdkafka_telemetry.h b/src/third_party/librdkafka/dist/src/rdkafka_telemetry.h new file mode 100644 index 00000000000..e7ab0b7eb3b --- /dev/null +++ b/src/third_party/librdkafka/dist/src/rdkafka_telemetry.h @@ -0,0 +1,52 @@ +/* + * librdkafka - Apache Kafka C library + * + * Copyright (c) 2023, Confluent Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _RD_KAFKA_TELEMETRY_H_ +#define _RD_KAFKA_TELEMETRY_H_ + +#include "rdkafka_int.h" + +#define RD_KAFKA_TELEMETRY_METRICS_ALL_METRICS_SUBSCRIPTION "*" +#define RD_KAFKA_TELEMETRY_METRIC_NAME_MAX_LEN 128 + +void rd_kafka_handle_get_telemetry_subscriptions(rd_kafka_t *rk, + rd_kafka_resp_err_t err); + +void rd_kafka_handle_push_telemetry(rd_kafka_t *rk, rd_kafka_resp_err_t err); + +void rd_kafka_telemetry_clear(rd_kafka_t *rk, + rd_bool_t clear_control_flow_fields); + +void rd_kafka_telemetry_await_termination(rd_kafka_t *rk); + +void rd_kafka_telemetry_schedule_termination(rd_kafka_t *rk); + +void rd_kafka_set_telemetry_broker_maybe(rd_kafka_t *rk, + rd_kafka_broker_t *rkb); +#endif /* _RD_KAFKA_TELEMETRY_H_ */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_telemetry_decode.c b/src/third_party/librdkafka/dist/src/rdkafka_telemetry_decode.c new file mode 100644 index 00000000000..1563b2bb5fb --- /dev/null +++ b/src/third_party/librdkafka/dist/src/rdkafka_telemetry_decode.c @@ -0,0 +1,1053 @@ +/* + * librdkafka - Apache Kafka C library + * + * Copyright (c) 2023, Confluent Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "rdkafka_telemetry_decode.h" +#include "nanopb/pb_decode.h" +#include "rdunittest.h" +#include "rdkafka_lz4.h" +#include "rdgz.h" +#include "rdkafka_zstd.h" +#include "snappy.h" +#include "rdfloat.h" + + +#define _NANOPB_STRING_DECODE_MAX_BUFFER_SIZE 1024 +#define MAX_LABELS 10 +#define UNITTEST_MARKER "unittest" + +enum unit_test_string_decoding_state { + STATE_LABELS, + STATE_VERSION, + STATE_METRIC_NAME, + STATE_METRIC_DESCRIPTION, + STATE_COMPLETE +}; + +struct unit_test_metric_label { + char key[_NANOPB_STRING_DECODE_MAX_BUFFER_SIZE]; + char value[_NANOPB_STRING_DECODE_MAX_BUFFER_SIZE]; +}; + +struct unit_test_data { + rd_kafka_telemetry_metric_type_t type; + int32_t current_field; + struct unit_test_metric_label labels[MAX_LABELS]; + int label_count; + char version[_NANOPB_STRING_DECODE_MAX_BUFFER_SIZE]; + char metric_name[_NANOPB_STRING_DECODE_MAX_BUFFER_SIZE]; + char metric_description[_NANOPB_STRING_DECODE_MAX_BUFFER_SIZE]; + int64_t metric_value_int; + int64_t expected_metric_value_int; + double metric_value_double; + double expected_metric_value_double; + int64_t int64_value; + uint64_t metric_time; + enum unit_test_string_decoding_state state; + bool expecting_label_value; +}; + +static struct unit_test_data unit_test_data; + +static void clear_unit_test_data(int64_t expected_value_int, + double expected_value_double) { + memset(&unit_test_data, 0, sizeof(unit_test_data)); + + unit_test_data.type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE; + unit_test_data.state = STATE_LABELS; + unit_test_data.expecting_label_value = false; + unit_test_data.version[0] = '\0'; + unit_test_data.metric_name[0] = '\0'; + unit_test_data.metric_description[0] = '\0'; + unit_test_data.current_field = 0; + unit_test_data.label_count = 0; + unit_test_data.metric_value_int = 0; + unit_test_data.metric_value_double = 0.0; + unit_test_data.metric_time = 0; + unit_test_data.int64_value = 0; + unit_test_data.expected_metric_value_int = expected_value_int; + unit_test_data.expected_metric_value_double = expected_value_double; +} + +static bool +decode_string(pb_istream_t *stream, const pb_field_t *field, void **arg) { + rd_kafka_telemetry_decode_interface_t *decode_interface = *arg; + uint8_t buffer[_NANOPB_STRING_DECODE_MAX_BUFFER_SIZE] = {0}; + + if (stream->bytes_left > sizeof(buffer) - 1) { + RD_INTERFACE_CALL(decode_interface, decode_error, + "String too long for buffer"); + return false; + } + + if (!pb_read(stream, buffer, stream->bytes_left)) { + RD_INTERFACE_CALL(decode_interface, decode_error, + "Failed to read string"); + return false; + } + + RD_INTERFACE_CALL(decode_interface, decoded_string, buffer); + return true; +} + +static bool +decode_key_value(pb_istream_t *stream, const pb_field_t *field, void **arg) { + rd_kafka_telemetry_decode_interface_t *decode_interface = *arg; + opentelemetry_proto_common_v1_KeyValue key_value = + opentelemetry_proto_common_v1_KeyValue_init_zero; + key_value.key.funcs.decode = &decode_string; + key_value.key.arg = decode_interface; + key_value.value.value.string_value.funcs.decode = &decode_string; + key_value.value.value.string_value.arg = decode_interface; + if (!pb_decode(stream, opentelemetry_proto_common_v1_KeyValue_fields, + &key_value)) { + RD_INTERFACE_CALL(decode_interface, decode_error, + "Failed to decode KeyValue: %s", + PB_GET_ERROR(stream)); + return false; + } + + if (key_value.value.which_value == + opentelemetry_proto_common_v1_AnyValue_int_value_tag) { + RD_INTERFACE_CALL(decode_interface, decoded_int64, + 
key_value.value.value.int_value); + } + + return true; +} + +static bool decode_number_data_point(pb_istream_t *stream, + const pb_field_t *field, + void **arg) { + rd_kafka_telemetry_decode_interface_t *decode_interface = *arg; + opentelemetry_proto_metrics_v1_NumberDataPoint data_point = + opentelemetry_proto_metrics_v1_NumberDataPoint_init_zero; + data_point.attributes.funcs.decode = &decode_key_value; + data_point.attributes.arg = decode_interface; + if (!pb_decode(stream, + opentelemetry_proto_metrics_v1_NumberDataPoint_fields, + &data_point)) { + RD_INTERFACE_CALL(decode_interface, decode_error, + "Failed to decode NumberDataPoint: %s", + PB_GET_ERROR(stream)); + return false; + } + + RD_INTERFACE_CALL(decode_interface, decoded_NumberDataPoint, + &data_point); + return true; +} + +// TODO: add support for other data types +static bool +data_msg_callback(pb_istream_t *stream, const pb_field_t *field, void **arg) { + rd_kafka_telemetry_decode_interface_t *decode_interface = *arg; + if (field->tag == opentelemetry_proto_metrics_v1_Metric_sum_tag) { + opentelemetry_proto_metrics_v1_Sum *sum = field->pData; + sum->data_points.funcs.decode = &decode_number_data_point; + sum->data_points.arg = decode_interface; + if (decode_interface->decoded_type) { + RD_INTERFACE_CALL(decode_interface, decoded_type, + RD_KAFKA_TELEMETRY_METRIC_TYPE_SUM); + } + } else if (field->tag == + opentelemetry_proto_metrics_v1_Metric_gauge_tag) { + opentelemetry_proto_metrics_v1_Gauge *gauge = field->pData; + gauge->data_points.funcs.decode = &decode_number_data_point; + gauge->data_points.arg = decode_interface; + if (decode_interface->decoded_type) { + RD_INTERFACE_CALL(decode_interface, decoded_type, + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE); + } + } + return true; +} + + +static bool +decode_metric(pb_istream_t *stream, const pb_field_t *field, void **arg) { + rd_kafka_telemetry_decode_interface_t *decode_interface = *arg; + opentelemetry_proto_metrics_v1_Metric metric = + opentelemetry_proto_metrics_v1_Metric_init_zero; + metric.name.funcs.decode = &decode_string; + metric.name.arg = decode_interface; + metric.description.funcs.decode = &decode_string; + metric.description.arg = decode_interface; + metric.cb_data.funcs.decode = &data_msg_callback; + metric.cb_data.arg = decode_interface; + + if (!pb_decode(stream, opentelemetry_proto_metrics_v1_Metric_fields, + &metric)) { + RD_INTERFACE_CALL(decode_interface, decode_error, + "Failed to decode Metric: %s", + PB_GET_ERROR(stream)); + return false; + } + + return true; +} + +static bool decode_scope_metrics(pb_istream_t *stream, + const pb_field_t *field, + void **arg) { + rd_kafka_telemetry_decode_interface_t *decode_interface = *arg; + opentelemetry_proto_metrics_v1_ScopeMetrics scope_metrics = + opentelemetry_proto_metrics_v1_ScopeMetrics_init_zero; + scope_metrics.scope.name.funcs.decode = &decode_string; + scope_metrics.scope.name.arg = decode_interface; + scope_metrics.scope.version.funcs.decode = &decode_string; + scope_metrics.scope.version.arg = decode_interface; + scope_metrics.metrics.funcs.decode = &decode_metric; + scope_metrics.metrics.arg = decode_interface; + + if (!pb_decode(stream, + opentelemetry_proto_metrics_v1_ScopeMetrics_fields, + &scope_metrics)) { + RD_INTERFACE_CALL(decode_interface, decode_error, + "Failed to decode ScopeMetrics: %s", + PB_GET_ERROR(stream)); + return false; + } + return true; +} + +static bool decode_resource_metrics(pb_istream_t *stream, + const pb_field_t *field, + void **arg) { + 
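+        /* Wire up the nanopb sub-message decode callbacks: resource
+         * attributes go through decode_key_value and each ScopeMetrics
+         * through decode_scope_metrics, all sharing the same
+         * decode_interface argument. */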
rd_kafka_telemetry_decode_interface_t *decode_interface = *arg; + opentelemetry_proto_metrics_v1_ResourceMetrics resource_metrics = + opentelemetry_proto_metrics_v1_ResourceMetrics_init_zero; + resource_metrics.resource.attributes.funcs.decode = &decode_key_value; + resource_metrics.resource.attributes.arg = decode_interface; + resource_metrics.scope_metrics.funcs.decode = &decode_scope_metrics; + resource_metrics.scope_metrics.arg = decode_interface; + if (!pb_decode(stream, + opentelemetry_proto_metrics_v1_ResourceMetrics_fields, + &resource_metrics)) { + RD_INTERFACE_CALL(decode_interface, decode_error, + "Failed to decode ResourceMetrics: %s", + PB_GET_ERROR(stream)); + return false; + } + return true; +} + +#if WITH_SNAPPY + +static int rd_kafka_snappy_decompress(rd_kafka_broker_t *rkb, + const char *compressed, + size_t compressed_size, + void **outbuf, + size_t *outbuf_len) { + struct iovec iov = {.iov_base = NULL, .iov_len = 0}; + + const char *inbuf = compressed; + size_t inlen = compressed_size; + int r; + static const unsigned char snappy_java_magic[] = {0x82, 'S', 'N', 'A', + 'P', 'P', 'Y', 0}; + static const size_t snappy_java_hdrlen = 8 + 4 + 4; + + /* snappy-java adds its own header (SnappyCodec) + * which is not compatible with the official Snappy + * implementation. + * 8: magic, 4: version, 4: compatible + * followed by any number of chunks: + * 4: length + * ...: snappy-compressed data. */ + if (likely(inlen > snappy_java_hdrlen + 4 && + !memcmp(inbuf, snappy_java_magic, 8))) { + /* snappy-java framing */ + char errstr[128]; + + inbuf = inbuf + snappy_java_hdrlen; + inlen -= snappy_java_hdrlen; + iov.iov_base = rd_kafka_snappy_java_uncompress( + inbuf, inlen, &iov.iov_len, errstr, sizeof(errstr)); + + if (unlikely(!iov.iov_base)) { + rd_rkb_dbg( + rkb, MSG, "SNAPPY", + "Snappy decompression for message failed: %s: " + "ignoring message", + errstr); + return -1; // Indicates decompression error + } + + + } else { + /* No framing */ + + /* Acquire uncompressed length */ + if (unlikely(!rd_kafka_snappy_uncompressed_length( + inbuf, inlen, &iov.iov_len))) { + rd_rkb_dbg( + rkb, MSG, "SNAPPY", + "Failed to get length of Snappy compressed payload " + "for message (%" PRIusz + " bytes): " + "ignoring message", + inlen); + return -1; // Indicates decompression error + } + + /* Allocate output buffer for uncompressed data */ + iov.iov_base = rd_malloc(iov.iov_len); + if (unlikely(!iov.iov_base)) { + rd_rkb_dbg(rkb, MSG, "SNAPPY", + "Failed to allocate Snappy decompress " + "buffer of size %" PRIusz + " for message (%" PRIusz + " bytes): %s: " + "ignoring message", + *outbuf_len, inlen, rd_strerror(errno)); + return -1; // Indicates memory allocation error + } + + /* Uncompress to outbuf */ + if (unlikely((r = rd_kafka_snappy_uncompress(inbuf, inlen, + iov.iov_base)))) { + rd_rkb_dbg( + rkb, MSG, "SNAPPY", + "Failed to decompress Snappy payload for message " + "(%" PRIusz + " bytes): %s: " + "ignoring message", + inlen, rd_strerror(errno)); + rd_free(iov.iov_base); + return -1; // Indicates decompression error + } + } + *outbuf = iov.iov_base; + *outbuf_len = iov.iov_len; + return 0; +} +#endif + +/* + * Decompress a payload using the specified compression type. Allocates memory + * for uncompressed payload. + * @returns 0 on success, -1 on failure. Allocated memory in + * uncompressed_payload and its size in uncompressed_payload_size. 
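+ * Compression types compiled out of the build (e.g. gzip without
+ * WITH_ZLIB) fall through to the default case and fail with -1.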
+ */ +int rd_kafka_telemetry_uncompress_metrics_payload( + rd_kafka_broker_t *rkb, + rd_kafka_compression_t compression_type, + void *compressed_payload, + size_t compressed_payload_size, + void **uncompressed_payload, + size_t *uncompressed_payload_size) { + int r = -1; + switch (compression_type) { +#if WITH_ZLIB + case RD_KAFKA_COMPRESSION_GZIP: + *uncompressed_payload = rd_gz_decompress( + compressed_payload, (int)compressed_payload_size, + (uint64_t *)uncompressed_payload_size); + if (*uncompressed_payload == NULL) + r = -1; + else + r = 0; + break; +#endif + case RD_KAFKA_COMPRESSION_KLZ4: + r = rd_kafka_lz4_decompress( + rkb, 0, 0, compressed_payload, compressed_payload_size, + uncompressed_payload, uncompressed_payload_size); + break; +#if WITH_ZSTD + case RD_KAFKA_COMPRESSION_ZSTD: + r = rd_kafka_zstd_decompress( + rkb, compressed_payload, compressed_payload_size, + uncompressed_payload, uncompressed_payload_size); + break; +#endif +#if WITH_SNAPPY + case RD_KAFKA_COMPRESSION_SNAPPY: + r = rd_kafka_snappy_decompress( + rkb, compressed_payload, compressed_payload_size, + uncompressed_payload, uncompressed_payload_size); + break; +#endif + default: + rd_kafka_log(rkb->rkb_rk, LOG_WARNING, "TELEMETRY", + "Unknown compression type: %d", compression_type); + break; + } + return r; +} + +/** + * Decode a metric from a buffer encoded with + * opentelemetry_proto_metrics_v1_MetricsData datatype. Used for testing and + * debugging. + * + * @param decode_interface The decode_interface to pass as arg when decoding the + * buffer. + * @param buffer The buffer to decode. + * @param size The size of the buffer. + */ +int rd_kafka_telemetry_decode_metrics( + rd_kafka_telemetry_decode_interface_t *decode_interface, + void *buffer, + size_t size) { + opentelemetry_proto_metrics_v1_MetricsData metricsData = + opentelemetry_proto_metrics_v1_MetricsData_init_zero; + + pb_istream_t stream = pb_istream_from_buffer(buffer, size); + metricsData.resource_metrics.arg = decode_interface; + metricsData.resource_metrics.funcs.decode = &decode_resource_metrics; + + bool status = pb_decode( + &stream, opentelemetry_proto_metrics_v1_MetricsData_fields, + &metricsData); + if (!status) { + RD_INTERFACE_CALL(decode_interface, decode_error, + "Failed to decode MetricsData: %s", + PB_GET_ERROR(&stream)); + } + return status; +} + +static void unit_test_telemetry_decoded_string(void *opaque, + const uint8_t *decoded) { + + switch (unit_test_data.state) { + case STATE_LABELS: + if (strcmp((const char *)decoded, UNITTEST_MARKER) == 0) { + unit_test_data.state = STATE_VERSION; + unit_test_data.expecting_label_value = false; + } else if (unit_test_data.expecting_label_value) { + rd_snprintf(unit_test_data + .labels[unit_test_data.label_count - 1] + .value, + sizeof(unit_test_data.labels[0].value), + "%s", decoded); + unit_test_data.expecting_label_value = false; + } else { + if (unit_test_data.label_count < MAX_LABELS) { + rd_snprintf( + unit_test_data + .labels[unit_test_data.label_count] + .key, + sizeof(unit_test_data.labels[0].key), "%s", + decoded); + unit_test_data.label_count++; + unit_test_data.expecting_label_value = true; + } + } + unit_test_data.current_field++; + break; + + case STATE_VERSION: + rd_snprintf(unit_test_data.version, + sizeof(unit_test_data.version), "%s", decoded); + unit_test_data.state = STATE_METRIC_NAME; + unit_test_data.current_field++; + break; + + case STATE_METRIC_NAME: + rd_snprintf(unit_test_data.metric_name, + sizeof(unit_test_data.metric_name), "%s", decoded); + unit_test_data.state 
= STATE_METRIC_DESCRIPTION; + unit_test_data.current_field++; + break; + + case STATE_METRIC_DESCRIPTION: + rd_snprintf(unit_test_data.metric_description, + sizeof(unit_test_data.metric_description), "%s", + decoded); + unit_test_data.state = STATE_COMPLETE; + unit_test_data.current_field++; + break; + + case STATE_COMPLETE: + break; + } +} + +static void unit_test_telemetry_decoded_NumberDataPoint( + void *opaque, + const opentelemetry_proto_metrics_v1_NumberDataPoint *decoded) { + unit_test_data.metric_value_int = decoded->value.as_int; + unit_test_data.metric_value_double = decoded->value.as_double; + unit_test_data.metric_time = decoded->time_unix_nano; + unit_test_data.current_field++; +} + +static void unit_test_telemetry_decoded_int64(void *opaque, + int64_t int64_value) { + unit_test_data.int64_value = int64_value; +} + +static void +unit_test_telemetry_decoded_type(void *opaque, + rd_kafka_telemetry_metric_type_t type) { + unit_test_data.type = type; + unit_test_data.current_field++; +} + +static void +unit_test_telemetry_decode_error(void *opaque, const char *error, ...) { + char buffer[1024]; + va_list ap; + va_start(ap, error); + rd_vsnprintf(buffer, sizeof(buffer), error, ap); + va_end(ap); + RD_UT_SAY("%s", buffer); + rd_assert(!*"Failure while decoding telemetry data"); +} + +int unit_test_telemetry(rd_kafka_type_t rk_type, + rd_kafka_telemetry_producer_metric_name_t metric_name, + const char *expected_name, + const char *expected_description, + rd_kafka_telemetry_metric_type_t expected_type, + rd_bool_t is_double, + rd_bool_t is_per_broker, + void (*set_metric_value)(rd_kafka_t *, + rd_kafka_broker_t *), + int64_t expected_value_int, + double expected_value_double) { + rd_kafka_t *rk = rd_calloc(1, sizeof(*rk)); + rwlock_init(&rk->rk_lock); + rd_kafka_conf_t *conf = rd_kafka_conf_new(); + char *client_rack = "rack1", *transactional_id = "tx-id", + *group_id = "group-id", *group_instance_id = "group-instance-id"; + rd_kafka_conf_set(conf, "client.rack", client_rack, NULL, 0); + rd_kafka_conf_set(conf, "transactional.id", transactional_id, NULL, 0); + rd_kafka_conf_set(conf, "group.id", group_id, NULL, 0); + rd_kafka_conf_set(conf, "group.instance.id", group_instance_id, NULL, + 0); + rk->rk_conf = *conf; + rd_free(conf); + + rk->rk_type = rk_type; + rk->rk_cgrp = rd_calloc(1, sizeof(*rk->rk_cgrp)); + rk->rk_broker_cnt.val = 1; + rk->rk_telemetry.matched_metrics_cnt = 1; + rk->rk_telemetry.matched_metrics = + rd_malloc(sizeof(rd_kafka_telemetry_producer_metric_name_t) * + rk->rk_telemetry.matched_metrics_cnt); + rk->rk_telemetry.matched_metrics[0] = metric_name; + rk->rk_telemetry.rk_historic_c.ts_start = + (rd_uclock() - 1000 * 1000) * 1000; + rk->rk_telemetry.rk_historic_c.ts_last = + (rd_uclock() - 1000 * 1000) * 1000; + + rd_avg_init(&rk->rk_telemetry.rd_avg_current.rk_avg_poll_idle_ratio, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init(&rk->rk_telemetry.rd_avg_current.rk_avg_commit_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init(&rk->rk_telemetry.rd_avg_current.rk_avg_rebalance_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + + rd_avg_init(&rk->rk_telemetry.rd_avg_rollover.rk_avg_poll_idle_ratio, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init(&rk->rk_telemetry.rd_avg_rollover.rk_avg_commit_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init(&rk->rk_telemetry.rd_avg_rollover.rk_avg_rebalance_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + + rd_strlcpy(rk->rk_name, "unittest", sizeof(rk->rk_name)); + 
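+
+        /* Note: rk is assembled by hand (rd_calloc plus explicit field and
+         * rd_avg_init setup) rather than via rd_kafka_new(), so the test
+         * controls exactly which telemetry averages and counters exist
+         * before encoding. */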
clear_unit_test_data(expected_value_int, expected_value_double); + + rd_kafka_telemetry_decode_interface_t decode_interface = { + .decoded_string = unit_test_telemetry_decoded_string, + .decoded_NumberDataPoint = + unit_test_telemetry_decoded_NumberDataPoint, + .decoded_int64 = unit_test_telemetry_decoded_int64, + .decoded_type = unit_test_telemetry_decoded_type, + .decode_error = unit_test_telemetry_decode_error, + .opaque = &unit_test_data, + }; + + TAILQ_INIT(&rk->rk_brokers); + + rd_kafka_broker_t *rkb = rd_calloc(1, sizeof(*rkb)); + rkb->rkb_nodeid = 1001; + mtx_init(&rkb->rkb_lock, mtx_plain); + + rd_avg_init(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_rtt, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_outbuf_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_throttle, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_fetch_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + + rd_avg_init(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_rtt, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_outbuf_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_throttle, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_fetch_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_produce_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + rd_avg_init(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_produce_latency, + RD_AVG_GAUGE, 0, 500 * 1000, 2, rd_true); + + set_metric_value(rk, rkb); + + TAILQ_INSERT_HEAD(&rk->rk_brokers, rkb, rkb_link); + rd_buf_t *rbuf = rd_kafka_telemetry_encode_metrics(rk); + void *metrics_payload = rbuf->rbuf_wpos->seg_p; + size_t metrics_payload_size = rbuf->rbuf_wpos->seg_of; + RD_UT_SAY("metrics_payload_size: %" PRIusz, metrics_payload_size); + + RD_UT_ASSERT(metrics_payload_size != 0, "Metrics payload zero"); + + bool decode_status = rd_kafka_telemetry_decode_metrics( + &decode_interface, metrics_payload, metrics_payload_size); + + RD_UT_ASSERT(decode_status == 1, "Decoding failed"); + RD_UT_ASSERT(unit_test_data.type == expected_type, + "Metric type mismatch"); + RD_UT_ASSERT(strcmp(unit_test_data.metric_name, expected_name) == 0, + "Metric name mismatch %s != %s", + unit_test_data.metric_name, expected_name); + RD_UT_ASSERT(strcmp(unit_test_data.metric_description, + expected_description) == 0, + "Metric description mismatch"); + if (is_double) + RD_UT_ASSERT( + rd_dbl_eq0(unit_test_data.metric_value_double, + unit_test_data.expected_metric_value_double, + 0.01), + "Metric value mismatch"); + else + RD_UT_ASSERT(unit_test_data.metric_value_int == + unit_test_data.expected_metric_value_int, + "Metric value mismatch"); + if (is_per_broker) + RD_UT_ASSERT(unit_test_data.int64_value == 1001, + "Expected broker mismatch"); + RD_UT_ASSERT(unit_test_data.metric_time != 0, "Metric time mismatch"); + if (rk_type == RD_KAFKA_PRODUCER) { + RD_UT_ASSERT(unit_test_data.label_count == 2, + "Label count mismatch"); + RD_UT_ASSERT( + strcmp(unit_test_data.labels[0].key, "client_rack") == 0, + "Client rack key mismatch"); + RD_UT_ASSERT( + strcmp(unit_test_data.labels[0].value, client_rack) == 0, + "Client rack value mismatch"); + RD_UT_ASSERT(strcmp(unit_test_data.labels[1].key, + "transactional_id") == 0, + 
"Transactional id key mismatch"); + RD_UT_ASSERT(strcmp(unit_test_data.labels[1].value, + transactional_id) == 0, + "Transactional id value mismatch"); + } else { + RD_UT_ASSERT(unit_test_data.label_count == 3, + "Label count mismatch"); + RD_UT_ASSERT( + strcmp(unit_test_data.labels[0].key, "client_rack") == 0, + "Client rack key mismatch"); + RD_UT_ASSERT( + strcmp(unit_test_data.labels[0].value, client_rack) == 0, + "Client rack value mismatch"); + RD_UT_ASSERT(strcmp(unit_test_data.labels[1].key, "group_id") == + 0, + "Group id key mismatch"); + RD_UT_ASSERT(strcmp(unit_test_data.labels[1].value, group_id) == + 0, + "Group id value mismatch"); + RD_UT_ASSERT(strcmp(unit_test_data.labels[2].key, + "group_instance_id") == 0, + "Group instance id key mismatch"); + RD_UT_ASSERT(strcmp(unit_test_data.labels[2].value, + group_instance_id) == 0, + "Group instance id value mismatch"); + } + + rd_free(rk->rk_telemetry.matched_metrics); + rd_buf_destroy_free(rbuf); + rd_avg_destroy(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_rtt); + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_outbuf_latency); + rd_avg_destroy(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_throttle); + rd_avg_destroy(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_rtt); + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_outbuf_latency); + rd_avg_destroy(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_throttle); + + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_fetch_latency); + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_fetch_latency); + + rd_avg_destroy(&rk->rk_telemetry.rd_avg_current.rk_avg_poll_idle_ratio); + rd_avg_destroy( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_poll_idle_ratio); + + rd_avg_destroy( + &rk->rk_telemetry.rd_avg_current.rk_avg_rebalance_latency); + rd_avg_destroy( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_rebalance_latency); + + rd_avg_destroy(&rk->rk_telemetry.rd_avg_current.rk_avg_commit_latency); + rd_avg_destroy(&rk->rk_telemetry.rd_avg_rollover.rk_avg_commit_latency); + + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_produce_latency); + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_produce_latency); + + mtx_destroy(&rkb->rkb_lock); + rd_free(rkb); + rwlock_destroy(&rk->rk_lock); + rd_free(rk->rk_cgrp); + rd_kafka_anyconf_destroy(_RK_GLOBAL, &rk->rk_conf); + rd_free(rk); + RD_UT_PASS(); + return 0; +} + +void unit_test_telemetry_set_connects(rd_kafka_t *rk, rd_kafka_broker_t *rkb) { + rkb->rkb_c.connects.val = 1; +} + +void unit_test_telemetry_set_connects2(rd_kafka_t *rk, rd_kafka_broker_t *rkb) { + rkb->rkb_c.connects.val = 2; +} + +void unit_test_telemetry_set_rtt(rd_kafka_t *rk, rd_kafka_broker_t *rkb) { + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_rtt, 1000); + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_rtt, 1000); +} + +void unit_test_telemetry_set_throttle_time(rd_kafka_t *rk, + rd_kafka_broker_t *rkb) { + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_throttle, 1); + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_throttle, 1); + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_throttle, 1); +} + +void unit_test_telemetry_set_queue_time(rd_kafka_t *rk, + rd_kafka_broker_t *rkb) { + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_outbuf_latency, + 1000); + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_outbuf_latency, + 1000); + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_outbuf_latency, + 1000); + 
rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_outbuf_latency, + 1000); +} + +void unit_test_telemetry_set_produce_latency(rd_kafka_t *rk, + rd_kafka_broker_t *rkb) { + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_produce_latency, + 1000); + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_produce_latency, + 1000); +} + +void unit_test_telemetry_set_coordinator_assigned_partitions( + rd_kafka_t *rk, + rd_kafka_broker_t *rkb) { + rk->rk_cgrp->rkcg_c.assignment_size = 1; +} + +void unit_test_telemetry_set_rebalance_latency(rd_kafka_t *rk, + rd_kafka_broker_t *rkb) { + rd_avg_add(&rk->rk_telemetry.rd_avg_current.rk_avg_rebalance_latency, + 1000); +} + +void unit_test_telemetry_set_fetch_latency(rd_kafka_t *rk, + rd_kafka_broker_t *rkb) { + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_fetch_latency, + 1000); + rd_avg_add(&rkb->rkb_telemetry.rd_avg_current.rkb_avg_fetch_latency, + 1000); +} + +void unit_test_telemetry_set_poll_idle_ratio(rd_kafka_t *rk, + rd_kafka_broker_t *rkb) { + rd_avg_add(&rk->rk_telemetry.rd_avg_current.rk_avg_poll_idle_ratio, + 1000000); + rd_avg_add(&rk->rk_telemetry.rd_avg_current.rk_avg_poll_idle_ratio, + 1000000); + rd_avg_add(&rk->rk_telemetry.rd_avg_current.rk_avg_poll_idle_ratio, + 1000000); +} + +void unit_test_telemetry_set_commit_latency(rd_kafka_t *rk, + rd_kafka_broker_t *rkb) { + rd_avg_add(&rk->rk_telemetry.rd_avg_current.rk_avg_commit_latency, + 1000); + rd_avg_add(&rk->rk_telemetry.rd_avg_current.rk_avg_commit_latency, + 1000); +} + +int unit_test_telemetry_gauge(void) { + int fails = 0; + int64_t default_expected_value_int = 1; + double default_expected_value_double = 1.0; + /* Producer metrics */ + fails += unit_test_telemetry( + RD_KAFKA_PRODUCER, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_CONNECTION_CREATION_RATE, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "producer.connection.creation.rate", + "The rate of connections established per second.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_true, rd_false, + unit_test_telemetry_set_connects, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_PRODUCER, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_NODE_REQUEST_LATENCY_AVG, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "producer.node.request.latency.avg", + "The average request latency in ms for a node.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_true, rd_true, + unit_test_telemetry_set_rtt, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_PRODUCER, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_NODE_REQUEST_LATENCY_MAX, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "producer.node.request.latency.max", + "The maximum request latency in ms for a node.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_false, rd_true, + unit_test_telemetry_set_rtt, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_PRODUCER, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_THROTTLE_TIME_AVG, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "producer.produce.throttle.time.avg", + "The average throttle time in ms for a node.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_true, rd_false, + unit_test_telemetry_set_throttle_time, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_PRODUCER, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_THROTTLE_TIME_MAX, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "producer.produce.throttle.time.max", + "The maximum throttle time in ms for a node.", + 
RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_false, rd_false, + unit_test_telemetry_set_throttle_time, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_PRODUCER, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_RECORD_QUEUE_TIME_AVG, + RD_KAFKA_TELEMETRY_METRIC_PREFIX "producer.record.queue.time.avg", + "The average time in ms a record spends in the producer queue.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_true, rd_false, + unit_test_telemetry_set_queue_time, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_PRODUCER, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_RECORD_QUEUE_TIME_MAX, + RD_KAFKA_TELEMETRY_METRIC_PREFIX "producer.record.queue.time.max", + "The maximum time in ms a record spends in the producer queue.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_false, rd_false, + unit_test_telemetry_set_queue_time, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_PRODUCER, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_LATENCY_AVG, + RD_KAFKA_TELEMETRY_METRIC_PREFIX "producer.request.latency.avg", + "The average request latency in ms for produce requests.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_true, rd_false, + unit_test_telemetry_set_produce_latency, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_PRODUCER, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_LATENCY_MAX, + RD_KAFKA_TELEMETRY_METRIC_PREFIX "producer.request.latency.max", + "The maximum request latency in ms for produce requests.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_false, rd_false, + unit_test_telemetry_set_produce_latency, default_expected_value_int, + default_expected_value_double); + + /* Consumer metrics */ + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_CONNECTION_CREATION_RATE, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.connection.creation.rate", + "The rate of connections established per second.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_true, rd_false, + unit_test_telemetry_set_connects, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_NODE_REQUEST_LATENCY_AVG, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.node.request.latency.avg", + "The average request latency in ms for a node.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_true, rd_true, + unit_test_telemetry_set_rtt, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_NODE_REQUEST_LATENCY_MAX, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.node.request.latency.max", + "The maximum request latency in ms for a node.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_false, rd_true, + unit_test_telemetry_set_rtt, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_ASSIGNED_PARTITIONS, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.coordinator.assigned.partitions", + "The number of partitions currently assigned to this consumer.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_false, rd_false, + unit_test_telemetry_set_coordinator_assigned_partitions, + default_expected_value_int, default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + 
RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_REBALANCE_LATENCY_AVG, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.coordinator.rebalance.latency.avg", + "The average rebalance latency in ms for the " + "consumer coordinator.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_true, rd_false, + unit_test_telemetry_set_rebalance_latency, + default_expected_value_int, default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_REBALANCE_LATENCY_MAX, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.coordinator.rebalance.latency.max", + "The maximum rebalance latency in ms for the " + "consumer coordinator.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_false, rd_false, + unit_test_telemetry_set_rebalance_latency, + default_expected_value_int, default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_FETCH_MANAGER_FETCH_LATENCY_AVG, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.fetch.manager.fetch.latency.avg", + "The average fetch latency in ms for the fetch manager.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_true, rd_false, + unit_test_telemetry_set_fetch_latency, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_FETCH_MANAGER_FETCH_LATENCY_MAX, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.fetch.manager.fetch.latency.max", + "The maximum fetch latency in ms for the fetch manager.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_false, rd_false, + unit_test_telemetry_set_fetch_latency, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_POLL_IDLE_RATIO_AVG, + RD_KAFKA_TELEMETRY_METRIC_PREFIX "consumer.poll.idle.ratio.avg", + "The average ratio of idle to poll for a consumer.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_true, rd_false, + unit_test_telemetry_set_poll_idle_ratio, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_COMMIT_LATENCY_AVG, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.coordinator.commit.latency.avg", + "The average commit latency in ms for the consumer coordinator.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_true, rd_false, + unit_test_telemetry_set_commit_latency, default_expected_value_int, + default_expected_value_double); + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_COMMIT_LATENCY_MAX, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.coordinator.commit.latency.max", + "The maximum commit latency in ms for the consumer coordinator.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, rd_false, rd_false, + unit_test_telemetry_set_commit_latency, default_expected_value_int, + default_expected_value_double); + return fails; +} + +int unit_test_telemetry_sum(void) { + int fails = 0; + int64_t default_expected_value_int = 1; + double default_expected_value_double = 1.0; + + /* Producer metrics */ + fails += unit_test_telemetry( + RD_KAFKA_PRODUCER, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_CONNECTION_CREATION_TOTAL, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "producer.connection.creation.total", + "The total number of connections established.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_SUM, rd_false, rd_false, + unit_test_telemetry_set_connects, default_expected_value_int, + default_expected_value_double); + + 
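+
+        /* connection.creation.total is a sum metric: with a single broker
+         * whose connects counter is set to 1, the encoded total is exactly
+         * 1; the set_connects2 case below checks that the raw value 2 flows
+         * through unchanged. */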
/* Consumer metrics */ + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_CONNECTION_CREATION_TOTAL, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.connection.creation.total", + "The total number of connections established.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_SUM, rd_false, rd_false, + unit_test_telemetry_set_connects, default_expected_value_int, + default_expected_value_double); + /* Test with expected value 2 */ + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_CONNECTION_CREATION_TOTAL, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.connection.creation.total", + "The total number of connections established.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_SUM, rd_false, rd_false, + unit_test_telemetry_set_connects2, 2, 0.0); + + fails += unit_test_telemetry( + RD_KAFKA_CONSUMER, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_REBALANCE_LATENCY_TOTAL, + RD_KAFKA_TELEMETRY_METRIC_PREFIX + "consumer.coordinator.rebalance.latency.total", + "The total rebalance latency in ms for the " + "consumer coordinator.", + RD_KAFKA_TELEMETRY_METRIC_TYPE_SUM, rd_false, rd_false, + unit_test_telemetry_set_rebalance_latency, + default_expected_value_int, default_expected_value_double); + return fails; +} + +int unittest_telemetry_decode(void) { + int fails = 0; + fails += unit_test_telemetry_gauge(); + fails += unit_test_telemetry_sum(); + return fails; +} diff --git a/src/third_party/librdkafka/dist/src/rdkafka_telemetry_decode.h b/src/third_party/librdkafka/dist/src/rdkafka_telemetry_decode.h new file mode 100644 index 00000000000..25f25a7d4fd --- /dev/null +++ b/src/third_party/librdkafka/dist/src/rdkafka_telemetry_decode.h @@ -0,0 +1,59 @@ +/* + * librdkafka - Apache Kafka C library + * + * Copyright (c) 2023, Confluent Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _RDKAFKA_RDKAFKA_TELEMETRY_DECODE_H +#define _RDKAFKA_RDKAFKA_TELEMETRY_DECODE_H +#include "rd.h" +#include "opentelemetry/metrics.pb.h" +#include "rdkafka_telemetry_encode.h" + +typedef struct rd_kafka_telemetry_decode_interface_s { + void (*decoded_string)(void *opaque, const uint8_t *decoded); + void (*decoded_NumberDataPoint)( + void *opaque, + const opentelemetry_proto_metrics_v1_NumberDataPoint *decoded); + void (*decoded_int64)(void *opaque, int64_t decoded); + void (*decoded_type)(void *opaque, + rd_kafka_telemetry_metric_type_t type); + void (*decode_error)(void *opaque, const char *error, ...); + void *opaque; +} rd_kafka_telemetry_decode_interface_t; + +int rd_kafka_telemetry_uncompress_metrics_payload( + rd_kafka_broker_t *rkb, + rd_kafka_compression_t compression_type, + void *compressed_payload, + size_t compressed_payload_size, + void **uncompressed_payload, + size_t *uncompressed_payload_size); +int rd_kafka_telemetry_decode_metrics( + rd_kafka_telemetry_decode_interface_t *decode_interface, + void *buffer, + size_t size); + +#endif /* _RDKAFKA_RDKAFKA_TELEMETRY_DECODE_H */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_telemetry_encode.c b/src/third_party/librdkafka/dist/src/rdkafka_telemetry_encode.c new file mode 100644 index 00000000000..da1a341d8a1 --- /dev/null +++ b/src/third_party/librdkafka/dist/src/rdkafka_telemetry_encode.c @@ -0,0 +1,997 @@ +/* + * librdkafka - Apache Kafka C library + * + * Copyright (c) 2023, Confluent Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "rdkafka_telemetry_encode.h" +#include "nanopb/pb_encode.h" +#include "opentelemetry/metrics.pb.h" + +#define THREE_ORDERS_MAGNITUDE 1000 + +typedef struct { + opentelemetry_proto_metrics_v1_Metric **metrics; + size_t count; +} rd_kafka_telemetry_metrics_repeated_t; + +typedef struct { + opentelemetry_proto_common_v1_KeyValue **key_values; + size_t count; +} rd_kafka_telemetry_key_values_repeated_t; + +#define calculate_avg(_avg_, _scale_factor_) \ + ((_avg_).ra_v.avg / (double)_scale_factor_) + +#define calculate_max(_avg_, _scale_factor_) \ + RD_CEIL_INTEGER_DIVISION((_avg_).ra_v.maxv, _scale_factor_) + +#define brokers_avg(_rk_, _avg_name_, _scale_factor_, _metric_) \ + do { \ + rd_kafka_broker_t *_rkb_; \ + double avg = 0; \ + int count = 0; \ + TAILQ_FOREACH(_rkb_, &(_rk_)->rk_brokers, rkb_link) { \ + rd_avg_t *rd_avg_rollover = \ + &_rkb_->rkb_telemetry.rd_avg_rollover._avg_name_; \ + if (rd_avg_rollover->ra_v.cnt) { \ + avg = (avg * count + \ + rd_avg_rollover->ra_v.sum) / \ + (double)(count + \ + rd_avg_rollover->ra_v.cnt); \ + count += rd_avg_rollover->ra_v.cnt; \ + } \ + } \ + if (_scale_factor_ > 1) \ + (_metric_).double_value = avg / _scale_factor_; \ + else \ + (_metric_).double_value = avg; \ + } while (0) + +#define brokers_max(_rk_, _avg_name_, _scale_factor_, _metric_) \ + do { \ + rd_kafka_broker_t *_rkb_; \ + _metric_.int_value = 0; \ + TAILQ_FOREACH(_rkb_, &(_rk_)->rk_brokers, rkb_link) { \ + _metric_.int_value = \ + RD_MAX(_metric_.int_value, \ + _rkb_->rkb_telemetry.rd_avg_rollover \ + ._avg_name_.ra_v.maxv); \ + } \ + if (_scale_factor_ > 1) \ + (_metric_).int_value = RD_CEIL_INTEGER_DIVISION( \ + (_metric_).int_value, _scale_factor_); \ + } while (0) + +static rd_kafka_telemetry_metric_value_t +calculate_connection_creation_total(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t total; + rd_kafka_broker_t *rkb; + + total.int_value = 0; + TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { + const int32_t connects = rd_atomic32_get(&rkb->rkb_c.connects); + if (!rk->rk_telemetry.delta_temporality) + total.int_value += connects; + else + total.int_value += + connects - + rkb->rkb_telemetry.rkb_historic_c.connects; + } + + return total; +} + +static rd_kafka_telemetry_metric_value_t +calculate_connection_creation_rate(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t total; + rd_kafka_broker_t *rkb; + rd_ts_t ts_last = rk->rk_telemetry.rk_historic_c.ts_last; + + total.double_value = 0; + TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { + total.double_value += + rd_atomic32_get(&rkb->rkb_c.connects) - + rkb->rkb_telemetry.rkb_historic_c.connects; + } + double seconds = (now_ns - ts_last) / 1e9; + if (seconds > 1.0) + total.double_value /= seconds; + return total; +} + +static rd_kafka_telemetry_metric_value_t +calculate_broker_avg_rtt(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t avg_rtt = RD_ZERO_INIT; + avg_rtt.double_value = calculate_avg( + rkb_selected->rkb_telemetry.rd_avg_rollover.rkb_avg_rtt, + THREE_ORDERS_MAGNITUDE); + return avg_rtt; +} + +static rd_kafka_telemetry_metric_value_t +calculate_broker_max_rtt(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t max_rtt = RD_ZERO_INIT; + max_rtt.int_value = calculate_max( + rkb_selected->rkb_telemetry.rd_avg_rollover.rkb_avg_rtt, + THREE_ORDERS_MAGNITUDE); + return max_rtt; +} + 
+static rd_kafka_telemetry_metric_value_t +calculate_produce_latency_avg(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t avg_rtt = RD_ZERO_INIT; + brokers_avg(rk, rkb_avg_produce_latency, THREE_ORDERS_MAGNITUDE, + avg_rtt); + return avg_rtt; +} + +static rd_kafka_telemetry_metric_value_t +calculate_produce_latency_max(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t max_rtt = RD_ZERO_INIT; + brokers_max(rk, rkb_avg_produce_latency, THREE_ORDERS_MAGNITUDE, + max_rtt); + return max_rtt; +} + +static rd_kafka_telemetry_metric_value_t +calculate_throttle_avg(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t avg_throttle; + brokers_avg(rk, rkb_avg_throttle, 1, avg_throttle); + return avg_throttle; +} + + +static rd_kafka_telemetry_metric_value_t +calculate_throttle_max(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t max_throttle; + brokers_max(rk, rkb_avg_throttle, 1, max_throttle); + return max_throttle; +} + +static rd_kafka_telemetry_metric_value_t +calculate_queue_time_avg(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t avg_queue_time; + brokers_avg(rk, rkb_avg_outbuf_latency, THREE_ORDERS_MAGNITUDE, + avg_queue_time); + return avg_queue_time; +} + +static rd_kafka_telemetry_metric_value_t +calculate_queue_time_max(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t max_queue_time; + brokers_max(rk, rkb_avg_outbuf_latency, THREE_ORDERS_MAGNITUDE, + max_queue_time); + return max_queue_time; +} + +static rd_kafka_telemetry_metric_value_t +calculate_consumer_assigned_partitions(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t assigned_partitions; + + assigned_partitions.int_value = + rk->rk_cgrp ? 
rk->rk_cgrp->rkcg_c.assignment_size : 0; + return assigned_partitions; +} + +static rd_kafka_telemetry_metric_value_t +calculate_consumer_rebalance_latency_avg(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t avg_rebalance_time; + avg_rebalance_time.double_value = calculate_avg( + rk->rk_telemetry.rd_avg_rollover.rk_avg_rebalance_latency, + THREE_ORDERS_MAGNITUDE); + return avg_rebalance_time; +} + +static rd_kafka_telemetry_metric_value_t +calculate_consumer_rebalance_latency_max(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t max_rebalance_time; + max_rebalance_time.int_value = calculate_max( + rk->rk_telemetry.rd_avg_rollover.rk_avg_rebalance_latency, + THREE_ORDERS_MAGNITUDE); + return max_rebalance_time; +} + +static rd_kafka_telemetry_metric_value_t +calculate_consumer_rebalance_latency_total(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t total_rebalance_time; + total_rebalance_time.int_value = RD_CEIL_INTEGER_DIVISION( + rk->rk_telemetry.rd_avg_rollover.rk_avg_rebalance_latency.ra_v.sum, + THREE_ORDERS_MAGNITUDE); + if (!rk->rk_telemetry.delta_temporality) { + total_rebalance_time.int_value += + rk->rk_telemetry.rk_historic_c.rebalance_latency_total; + } + return total_rebalance_time; +} + +static rd_kafka_telemetry_metric_value_t +calculate_consumer_fetch_latency_avg(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t avg_fetch_time; + brokers_avg(rk, rkb_avg_fetch_latency, THREE_ORDERS_MAGNITUDE, + avg_fetch_time); + return avg_fetch_time; +} + +static rd_kafka_telemetry_metric_value_t +calculate_consumer_fetch_latency_max(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t max_fetch_time; + brokers_max(rk, rkb_avg_fetch_latency, THREE_ORDERS_MAGNITUDE, + max_fetch_time); + return max_fetch_time; +} + +static rd_kafka_telemetry_metric_value_t +calculate_consumer_poll_idle_ratio_avg(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t avg_poll_idle_avg; + avg_poll_idle_avg.double_value = calculate_avg( + rk->rk_telemetry.rd_avg_rollover.rk_avg_poll_idle_ratio, 1e6); + return avg_poll_idle_avg; +} + +static rd_kafka_telemetry_metric_value_t +calculate_consumer_commit_latency_avg(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t avg_commit_time; + avg_commit_time.double_value = calculate_avg( + rk->rk_telemetry.rd_avg_rollover.rk_avg_commit_latency, + THREE_ORDERS_MAGNITUDE); + return avg_commit_time; +} + +static rd_kafka_telemetry_metric_value_t +calculate_consumer_commit_latency_max(rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_ns) { + rd_kafka_telemetry_metric_value_t max_commit_time; + max_commit_time.int_value = calculate_max( + rk->rk_telemetry.rd_avg_rollover.rk_avg_commit_latency, + THREE_ORDERS_MAGNITUDE); + return max_commit_time; +} + +static void reset_historical_metrics(rd_kafka_t *rk, rd_ts_t now_ns) { + rd_kafka_broker_t *rkb; + + rk->rk_telemetry.rk_historic_c.ts_last = now_ns; + rk->rk_telemetry.rk_historic_c.rebalance_latency_total += + RD_CEIL_INTEGER_DIVISION(rk->rk_telemetry.rd_avg_rollover + .rk_avg_rebalance_latency.ra_v.sum, + THREE_ORDERS_MAGNITUDE); + + TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { + rkb->rkb_telemetry.rkb_historic_c.connects = + 
rd_atomic32_get(&rkb->rkb_c.connects); + } +} + +static const rd_kafka_telemetry_metric_value_calculator_t + PRODUCER_METRIC_VALUE_CALCULATORS[RD_KAFKA_TELEMETRY_PRODUCER_METRIC__CNT] = + { + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_CONNECTION_CREATION_RATE] = + &calculate_connection_creation_rate, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_CONNECTION_CREATION_TOTAL] = + &calculate_connection_creation_total, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_NODE_REQUEST_LATENCY_AVG] = + &calculate_broker_avg_rtt, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_NODE_REQUEST_LATENCY_MAX] = + &calculate_broker_max_rtt, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_THROTTLE_TIME_AVG] = + &calculate_throttle_avg, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_THROTTLE_TIME_MAX] = + &calculate_throttle_max, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_RECORD_QUEUE_TIME_AVG] = + &calculate_queue_time_avg, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_RECORD_QUEUE_TIME_MAX] = + &calculate_queue_time_max, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_LATENCY_AVG] = + &calculate_produce_latency_avg, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_LATENCY_MAX] = + &calculate_produce_latency_max, +}; + +static const rd_kafka_telemetry_metric_value_calculator_t + CONSUMER_METRIC_VALUE_CALCULATORS[RD_KAFKA_TELEMETRY_CONSUMER_METRIC__CNT] = { + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_CONNECTION_CREATION_RATE] = + &calculate_connection_creation_rate, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_CONNECTION_CREATION_TOTAL] = + &calculate_connection_creation_total, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_NODE_REQUEST_LATENCY_AVG] = + &calculate_broker_avg_rtt, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_NODE_REQUEST_LATENCY_MAX] = + &calculate_broker_max_rtt, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_ASSIGNED_PARTITIONS] = + &calculate_consumer_assigned_partitions, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_REBALANCE_LATENCY_AVG] = + &calculate_consumer_rebalance_latency_avg, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_REBALANCE_LATENCY_MAX] = + &calculate_consumer_rebalance_latency_max, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_REBALANCE_LATENCY_TOTAL] = + &calculate_consumer_rebalance_latency_total, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_FETCH_MANAGER_FETCH_LATENCY_AVG] = + &calculate_consumer_fetch_latency_avg, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_FETCH_MANAGER_FETCH_LATENCY_MAX] = + &calculate_consumer_fetch_latency_max, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_POLL_IDLE_RATIO_AVG] = + &calculate_consumer_poll_idle_ratio_avg, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_COMMIT_LATENCY_AVG] = + &calculate_consumer_commit_latency_avg, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_COMMIT_LATENCY_MAX] = + &calculate_consumer_commit_latency_max, +}; + +static const char *get_client_rack(const rd_kafka_t *rk) { + return rk->rk_conf.client_rack && + RD_KAFKAP_STR_LEN(rk->rk_conf.client_rack) + ? (const char *)rk->rk_conf.client_rack->str + : NULL; +} + +static const char *get_group_id(const rd_kafka_t *rk) { + return rk->rk_conf.group_id_str ? (const char *)rk->rk_conf.group_id_str + : NULL; +} + +static const char *get_group_instance_id(const rd_kafka_t *rk) { + return rk->rk_conf.group_instance_id + ? (const char *)rk->rk_conf.group_instance_id + : NULL; +} + +static const char *get_member_id(const rd_kafka_t *rk) { + return rk->rk_cgrp && rk->rk_cgrp->rkcg_member_id && + rk->rk_cgrp->rkcg_member_id->len > 0 + ? 
(const char *)rk->rk_cgrp->rkcg_member_id->str + : NULL; +} + +static const char *get_transactional_id(const rd_kafka_t *rk) { + return rk->rk_conf.eos.transactional_id + ? (const char *)rk->rk_conf.eos.transactional_id + : NULL; +} + +static const rd_kafka_telemetry_attribute_config_t producer_attributes[] = { + {"client_rack", get_client_rack}, + {"transactional_id", get_transactional_id}, +}; + +static const rd_kafka_telemetry_attribute_config_t consumer_attributes[] = { + {"client_rack", get_client_rack}, + {"group_id", get_group_id}, + {"group_instance_id", get_group_instance_id}, + {"member_id", get_member_id}, +}; + +static int +count_attributes(rd_kafka_t *rk, + const rd_kafka_telemetry_attribute_config_t *configs, + int config_count) { + int count = 0, i; + for (i = 0; i < config_count; ++i) { + if (configs[i].getValue(rk)) { + count++; + } + } + return count; +} + +static void set_attributes(rd_kafka_t *rk, + rd_kafka_telemetry_resource_attribute_t *attributes, + const rd_kafka_telemetry_attribute_config_t *configs, + int config_count) { + int attr_idx = 0, i; + for (i = 0; i < config_count; ++i) { + const char *value = configs[i].getValue(rk); + if (value) { + attributes[attr_idx].name = configs[i].name; + attributes[attr_idx].value = value; + attr_idx++; + } + } +} + +static int +resource_attributes(rd_kafka_t *rk, + rd_kafka_telemetry_resource_attribute_t **attributes) { + int count = 0; + const rd_kafka_telemetry_attribute_config_t *configs; + int config_count; + + if (rk->rk_type == RD_KAFKA_PRODUCER) { + configs = producer_attributes; + config_count = RD_ARRAY_SIZE(producer_attributes); + } else if (rk->rk_type == RD_KAFKA_CONSUMER) { + configs = consumer_attributes; + config_count = RD_ARRAY_SIZE(consumer_attributes); + } else { + *attributes = NULL; + return 0; + } + + count = count_attributes(rk, configs, config_count); + + if (count == 0) { + *attributes = NULL; + return 0; + } + + *attributes = + rd_malloc(sizeof(rd_kafka_telemetry_resource_attribute_t) * count); + + set_attributes(rk, *attributes, configs, config_count); + + return count; +} + +static bool +encode_string(pb_ostream_t *stream, const pb_field_t *field, void *const *arg) { + if (!pb_encode_tag_for_field(stream, field)) + return false; + return pb_encode_string(stream, (uint8_t *)(*arg), strlen(*arg)); +} + +// TODO: Update to handle multiple data points. 
+static bool encode_number_data_point(pb_ostream_t *stream, + const pb_field_t *field, + void *const *arg) { + opentelemetry_proto_metrics_v1_NumberDataPoint *data_point = + (opentelemetry_proto_metrics_v1_NumberDataPoint *)*arg; + if (!pb_encode_tag_for_field(stream, field)) + return false; + + return pb_encode_submessage( + stream, opentelemetry_proto_metrics_v1_NumberDataPoint_fields, + data_point); +} + +static bool +encode_metric(pb_ostream_t *stream, const pb_field_t *field, void *const *arg) { + rd_kafka_telemetry_metrics_repeated_t *metricArr = + (rd_kafka_telemetry_metrics_repeated_t *)*arg; + size_t i; + + for (i = 0; i < metricArr->count; i++) { + + opentelemetry_proto_metrics_v1_Metric *metric = + metricArr->metrics[i]; + if (!pb_encode_tag_for_field(stream, field)) + return false; + + if (!pb_encode_submessage( + stream, opentelemetry_proto_metrics_v1_Metric_fields, + metric)) + return false; + } + return true; +} + +static bool encode_scope_metrics(pb_ostream_t *stream, + const pb_field_t *field, + void *const *arg) { + opentelemetry_proto_metrics_v1_ScopeMetrics *scope_metrics = + (opentelemetry_proto_metrics_v1_ScopeMetrics *)*arg; + if (!pb_encode_tag_for_field(stream, field)) + return false; + + return pb_encode_submessage( + stream, opentelemetry_proto_metrics_v1_ScopeMetrics_fields, + scope_metrics); +} + +static bool encode_resource_metrics(pb_ostream_t *stream, + const pb_field_t *field, + void *const *arg) { + opentelemetry_proto_metrics_v1_ResourceMetrics *resource_metrics = + (opentelemetry_proto_metrics_v1_ResourceMetrics *)*arg; + if (!pb_encode_tag_for_field(stream, field)) + return false; + + return pb_encode_submessage( + stream, opentelemetry_proto_metrics_v1_ResourceMetrics_fields, + resource_metrics); +} + +static bool encode_key_value(pb_ostream_t *stream, + const pb_field_t *field, + void *const *arg) { + if (!pb_encode_tag_for_field(stream, field)) + return false; + opentelemetry_proto_common_v1_KeyValue *key_value = + (opentelemetry_proto_common_v1_KeyValue *)*arg; + return pb_encode_submessage( + stream, opentelemetry_proto_common_v1_KeyValue_fields, key_value); +} + +static bool encode_key_values(pb_ostream_t *stream, + const pb_field_t *field, + void *const *arg) { + rd_kafka_telemetry_key_values_repeated_t *kv_arr = + (rd_kafka_telemetry_key_values_repeated_t *)*arg; + size_t i; + + for (i = 0; i < kv_arr->count; i++) { + + opentelemetry_proto_common_v1_KeyValue *kv = + kv_arr->key_values[i]; + if (!pb_encode_tag_for_field(stream, field)) + return false; + + if (!pb_encode_submessage( + stream, opentelemetry_proto_common_v1_KeyValue_fields, + kv)) + return false; + } + return true; +} + +static void free_metrics( + opentelemetry_proto_metrics_v1_Metric **metrics, + char **metric_names, + opentelemetry_proto_metrics_v1_NumberDataPoint **data_points, + opentelemetry_proto_common_v1_KeyValue *datapoint_attributes_key_values, + size_t count) { + size_t i; + for (i = 0; i < count; i++) { + rd_free(data_points[i]); + rd_free(metric_names[i]); + rd_free(metrics[i]); + } + rd_free(data_points); + rd_free(metric_names); + rd_free(metrics); + rd_free(datapoint_attributes_key_values); +} + +static void free_resource_attributes( + opentelemetry_proto_common_v1_KeyValue **resource_attributes_key_values, + rd_kafka_telemetry_resource_attribute_t *resource_attributes_struct, + size_t count) { + size_t i; + if (count == 0) + return; + for (i = 0; i < count; i++) + rd_free(resource_attributes_key_values[i]); + rd_free(resource_attributes_struct); + 
rd_free(resource_attributes_key_values); +} + +static void serialize_Metric( + rd_kafka_t *rk, + rd_kafka_broker_t *rkb, + const rd_kafka_telemetry_metric_info_t *info, + opentelemetry_proto_metrics_v1_Metric **metric, + opentelemetry_proto_metrics_v1_NumberDataPoint **data_point, + opentelemetry_proto_common_v1_KeyValue *data_point_attribute, + rd_kafka_telemetry_metric_value_calculator_t metric_value_calculator, + char **metric_name, + bool is_per_broker, + rd_ts_t now_ns) { + rd_ts_t ts_last = rk->rk_telemetry.rk_historic_c.ts_last, + ts_start = rk->rk_telemetry.rk_historic_c.ts_start; + size_t metric_name_len; + if (info->is_int) { + (*data_point)->which_value = + opentelemetry_proto_metrics_v1_NumberDataPoint_as_int_tag; + (*data_point)->value.as_int = + metric_value_calculator(rk, rkb, now_ns).int_value; + } else { + (*data_point)->which_value = + opentelemetry_proto_metrics_v1_NumberDataPoint_as_double_tag; + (*data_point)->value.as_double = + metric_value_calculator(rk, rkb, now_ns).double_value; + } + + + (*data_point)->time_unix_nano = now_ns; + if (info->type == RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE || + (info->type == RD_KAFKA_TELEMETRY_METRIC_TYPE_SUM && + rk->rk_telemetry.delta_temporality)) + (*data_point)->start_time_unix_nano = ts_last; + else + (*data_point)->start_time_unix_nano = ts_start; + + if (is_per_broker) { + data_point_attribute->key.funcs.encode = &encode_string; + data_point_attribute->key.arg = + RD_KAFKA_TELEMETRY_METRIC_NODE_ID_ATTRIBUTE; + data_point_attribute->has_value = true; + data_point_attribute->value.which_value = + opentelemetry_proto_common_v1_AnyValue_int_value_tag; + + rd_kafka_broker_lock(rkb); + data_point_attribute->value.value.int_value = rkb->rkb_nodeid; + rd_kafka_broker_unlock(rkb); + + (*data_point)->attributes.funcs.encode = &encode_key_value; + (*data_point)->attributes.arg = data_point_attribute; + } + + + switch (info->type) { + + case RD_KAFKA_TELEMETRY_METRIC_TYPE_SUM: { + (*metric)->which_data = + opentelemetry_proto_metrics_v1_Metric_sum_tag; + (*metric)->data.sum.data_points.funcs.encode = + &encode_number_data_point; + (*metric)->data.sum.data_points.arg = *data_point; + (*metric)->data.sum.aggregation_temporality = + rk->rk_telemetry.delta_temporality + ? opentelemetry_proto_metrics_v1_AggregationTemporality_AGGREGATION_TEMPORALITY_DELTA + : opentelemetry_proto_metrics_v1_AggregationTemporality_AGGREGATION_TEMPORALITY_CUMULATIVE; + (*metric)->data.sum.is_monotonic = true; + break; + } + case RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE: { + (*metric)->which_data = + opentelemetry_proto_metrics_v1_Metric_gauge_tag; + (*metric)->data.gauge.data_points.funcs.encode = + &encode_number_data_point; + (*metric)->data.gauge.data_points.arg = *data_point; + break; + } + default: + rd_assert(!"Unknown metric type"); + break; + } + + (*metric)->description.funcs.encode = &encode_string; + (*metric)->description.arg = (void *)info->description; + + metric_name_len = + strlen(RD_KAFKA_TELEMETRY_METRIC_PREFIX) + strlen(info->name) + 1; + *metric_name = rd_calloc(1, metric_name_len); + rd_snprintf(*metric_name, metric_name_len, "%s%s", + RD_KAFKA_TELEMETRY_METRIC_PREFIX, info->name); + + + (*metric)->name.funcs.encode = &encode_string; + (*metric)->name.arg = *metric_name; + + /* Skipping unit as Java client does the same */ +} + +/** + * @brief Encodes the metrics to opentelemetry_proto_metrics_v1_MetricsData and + * returns the serialized data. 
Currently only supports encoding of connection + * creation total by default + * + * @locks none + * @locks_acquired rd_kafka_rdlock() + * @locality main thread + */ +rd_buf_t *rd_kafka_telemetry_encode_metrics(rd_kafka_t *rk) { + rd_buf_t *rbuf = NULL; + rd_kafka_broker_t *rkb; + size_t message_size; + void *buffer = NULL; + pb_ostream_t stream; + bool status; + char **metric_names; + const int *metrics_to_encode = rk->rk_telemetry.matched_metrics; + const size_t metrics_to_encode_count = + rk->rk_telemetry.matched_metrics_cnt; + const rd_kafka_telemetry_metric_info_t *info = + RD_KAFKA_TELEMETRY_METRIC_INFO(rk); + size_t total_metrics_count = metrics_to_encode_count; + size_t i, metric_idx = 0; + + if (!metrics_to_encode_count) + return rd_buf_new(1, 1); + + opentelemetry_proto_metrics_v1_MetricsData metrics_data = + opentelemetry_proto_metrics_v1_MetricsData_init_zero; + + opentelemetry_proto_metrics_v1_ResourceMetrics resource_metrics = + opentelemetry_proto_metrics_v1_ResourceMetrics_init_zero; + + opentelemetry_proto_metrics_v1_Metric **metrics; + opentelemetry_proto_common_v1_KeyValue * + *resource_attributes_key_values = NULL; + opentelemetry_proto_common_v1_KeyValue + *datapoint_attributes_key_values = NULL; + opentelemetry_proto_metrics_v1_NumberDataPoint **data_points; + rd_kafka_telemetry_metrics_repeated_t metrics_repeated; + rd_kafka_telemetry_key_values_repeated_t resource_attributes_repeated; + rd_kafka_telemetry_resource_attribute_t *resource_attributes_struct = + NULL; + rd_ts_t now_ns = rd_uclock() * 1000; + rd_kafka_rdlock(rk); + + for (i = 0; i < metrics_to_encode_count; i++) { + if (info[metrics_to_encode[i]].is_per_broker) { + total_metrics_count += rk->rk_broker_cnt.val - 1; + } + } + + rd_kafka_dbg(rk, TELEMETRY, "PUSH", "Serializing metrics"); + + TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { + rd_avg_destroy(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_rtt); + rd_avg_rollover(&rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_rtt, + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_rtt); + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_outbuf_latency); + rd_avg_rollover( + &rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_outbuf_latency, + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_outbuf_latency); + rd_avg_destroy( + &rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_throttle); + rd_avg_rollover( + &rkb->rkb_telemetry.rd_avg_rollover.rkb_avg_throttle, + &rkb->rkb_telemetry.rd_avg_current.rkb_avg_throttle); + if (rk->rk_type == RD_KAFKA_CONSUMER) { + rd_avg_destroy(&rkb->rkb_telemetry.rd_avg_rollover + .rkb_avg_fetch_latency); + rd_avg_rollover(&rkb->rkb_telemetry.rd_avg_rollover + .rkb_avg_fetch_latency, + &rkb->rkb_telemetry.rd_avg_current + .rkb_avg_fetch_latency); + } else if (rk->rk_type == RD_KAFKA_PRODUCER) { + rd_avg_destroy(&rkb->rkb_telemetry.rd_avg_rollover + .rkb_avg_produce_latency); + rd_avg_rollover(&rkb->rkb_telemetry.rd_avg_rollover + .rkb_avg_produce_latency, + &rkb->rkb_telemetry.rd_avg_current + .rkb_avg_produce_latency); + } + } + + if (rk->rk_type == RD_KAFKA_CONSUMER) { + rd_avg_destroy( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_poll_idle_ratio); + rd_avg_rollover( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_poll_idle_ratio, + &rk->rk_telemetry.rd_avg_current.rk_avg_poll_idle_ratio); + + rd_avg_destroy( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_rebalance_latency); + rd_avg_rollover( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_rebalance_latency, + &rk->rk_telemetry.rd_avg_current.rk_avg_rebalance_latency); + + rd_avg_destroy( + 
&rk->rk_telemetry.rd_avg_rollover.rk_avg_commit_latency); + rd_avg_rollover( + &rk->rk_telemetry.rd_avg_rollover.rk_avg_commit_latency, + &rk->rk_telemetry.rd_avg_current.rk_avg_commit_latency); + } + + int resource_attributes_count = + resource_attributes(rk, &resource_attributes_struct); + rd_kafka_dbg(rk, TELEMETRY, "PUSH", "Resource attributes count: %d", + resource_attributes_count); + if (resource_attributes_count > 0) { + resource_attributes_key_values = + rd_malloc(sizeof(opentelemetry_proto_common_v1_KeyValue *) * + resource_attributes_count); + int ind; + for (ind = 0; ind < resource_attributes_count; ++ind) { + resource_attributes_key_values[ind] = rd_calloc( + 1, sizeof(opentelemetry_proto_common_v1_KeyValue)); + resource_attributes_key_values[ind]->key.funcs.encode = + &encode_string; + resource_attributes_key_values[ind]->key.arg = + (void *)resource_attributes_struct[ind].name; + + resource_attributes_key_values[ind]->has_value = true; + resource_attributes_key_values[ind]->value.which_value = + opentelemetry_proto_common_v1_AnyValue_string_value_tag; + resource_attributes_key_values[ind] + ->value.value.string_value.funcs.encode = + &encode_string; + resource_attributes_key_values[ind] + ->value.value.string_value.arg = + (void *)resource_attributes_struct[ind].value; + } + resource_attributes_repeated.key_values = + resource_attributes_key_values; + resource_attributes_repeated.count = resource_attributes_count; + resource_metrics.has_resource = true; + resource_metrics.resource.attributes.funcs.encode = + &encode_key_values; + resource_metrics.resource.attributes.arg = + &resource_attributes_repeated; + } + + opentelemetry_proto_metrics_v1_ScopeMetrics scope_metrics = + opentelemetry_proto_metrics_v1_ScopeMetrics_init_zero; + + opentelemetry_proto_common_v1_InstrumentationScope + instrumentation_scope = + opentelemetry_proto_common_v1_InstrumentationScope_init_zero; + instrumentation_scope.name.funcs.encode = &encode_string; + instrumentation_scope.name.arg = (void *)rd_kafka_name(rk); + instrumentation_scope.version.funcs.encode = &encode_string; + instrumentation_scope.version.arg = (void *)rd_kafka_version_str(); + + scope_metrics.has_scope = true; + scope_metrics.scope = instrumentation_scope; + + metrics = rd_malloc(sizeof(opentelemetry_proto_metrics_v1_Metric *) * + total_metrics_count); + data_points = + rd_malloc(sizeof(opentelemetry_proto_metrics_v1_NumberDataPoint *) * + total_metrics_count); + datapoint_attributes_key_values = + rd_malloc(sizeof(opentelemetry_proto_common_v1_KeyValue) * + total_metrics_count); + metric_names = rd_malloc(sizeof(char *) * total_metrics_count); + rd_kafka_dbg(rk, TELEMETRY, "PUSH", + "Total metrics to be encoded count: %" PRIusz, + total_metrics_count); + + + for (i = 0; i < metrics_to_encode_count; i++) { + + rd_kafka_telemetry_metric_value_calculator_t + metric_value_calculator = + (rk->rk_type == RD_KAFKA_PRODUCER) + ? 
PRODUCER_METRIC_VALUE_CALCULATORS + [metrics_to_encode[i]] + : CONSUMER_METRIC_VALUE_CALCULATORS + [metrics_to_encode[i]]; + if (info[metrics_to_encode[i]].is_per_broker) { + rd_kafka_broker_t *rkb; + + TAILQ_FOREACH(rkb, &rk->rk_brokers, rkb_link) { + metrics[metric_idx] = rd_calloc( + 1, + sizeof( + opentelemetry_proto_metrics_v1_Metric)); + data_points[metric_idx] = rd_calloc( + 1, + sizeof( + opentelemetry_proto_metrics_v1_NumberDataPoint)); + serialize_Metric( + rk, rkb, &info[metrics_to_encode[i]], + &metrics[metric_idx], + &data_points[metric_idx], + &datapoint_attributes_key_values + [metric_idx], + metric_value_calculator, + &metric_names[metric_idx], true, now_ns); + metric_idx++; + } + continue; + } + + metrics[metric_idx] = + rd_calloc(1, sizeof(opentelemetry_proto_metrics_v1_Metric)); + data_points[metric_idx] = rd_calloc( + 1, sizeof(opentelemetry_proto_metrics_v1_NumberDataPoint)); + + serialize_Metric(rk, NULL, &info[metrics_to_encode[i]], + &metrics[metric_idx], &data_points[metric_idx], + &datapoint_attributes_key_values[metric_idx], + metric_value_calculator, + &metric_names[metric_idx], false, now_ns); + metric_idx++; + } + + /* Send empty metrics blob if no metrics are matched */ + if (total_metrics_count > 0) { + metrics_repeated.metrics = metrics; + metrics_repeated.count = total_metrics_count; + + scope_metrics.metrics.funcs.encode = &encode_metric; + scope_metrics.metrics.arg = &metrics_repeated; + + + resource_metrics.scope_metrics.funcs.encode = + &encode_scope_metrics; + resource_metrics.scope_metrics.arg = &scope_metrics; + + metrics_data.resource_metrics.funcs.encode = + &encode_resource_metrics; + metrics_data.resource_metrics.arg = &resource_metrics; + } + + status = pb_get_encoded_size( + &message_size, opentelemetry_proto_metrics_v1_MetricsData_fields, + &metrics_data); + if (!status) { + rd_kafka_dbg(rk, TELEMETRY, "PUSH", + "Failed to get encoded size"); + goto fail; + } + + rbuf = rd_buf_new(1, message_size); + rd_buf_write_ensure(rbuf, message_size, message_size); + message_size = rd_buf_get_writable(rbuf, &buffer); + + stream = pb_ostream_from_buffer(buffer, message_size); + status = pb_encode(&stream, + opentelemetry_proto_metrics_v1_MetricsData_fields, + &metrics_data); + + if (!status) { + rd_kafka_dbg(rk, TELEMETRY, "PUSH", "Encoding failed: %s", + PB_GET_ERROR(&stream)); + rd_buf_destroy_free(rbuf); + goto fail; + } + rd_kafka_dbg(rk, TELEMETRY, "PUSH", + "Push Telemetry metrics encoded, size: %" PRIusz, + stream.bytes_written); + rd_buf_write(rbuf, NULL, stream.bytes_written); + + reset_historical_metrics(rk, now_ns); + + free_metrics(metrics, metric_names, data_points, + datapoint_attributes_key_values, total_metrics_count); + free_resource_attributes(resource_attributes_key_values, + resource_attributes_struct, + resource_attributes_count); + rd_kafka_rdunlock(rk); + + return rbuf; + +fail: + free_metrics(metrics, metric_names, data_points, + datapoint_attributes_key_values, total_metrics_count); + free_resource_attributes(resource_attributes_key_values, + resource_attributes_struct, + resource_attributes_count); + rd_kafka_rdunlock(rk); + + return NULL; +} diff --git a/src/third_party/librdkafka/dist/src/rdkafka_telemetry_encode.h b/src/third_party/librdkafka/dist/src/rdkafka_telemetry_encode.h new file mode 100644 index 00000000000..75c6bc896b6 --- /dev/null +++ b/src/third_party/librdkafka/dist/src/rdkafka_telemetry_encode.h @@ -0,0 +1,301 @@ +/* + * librdkafka - Apache Kafka C library + * + * Copyright (c) 2023, Confluent Inc. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RDKAFKA_RDKAFKA_TELEMETRY_ENCODE_H +#define _RDKAFKA_RDKAFKA_TELEMETRY_ENCODE_H + +#include "rdkafka_int.h" +#include "rdtypes.h" + +#define RD_KAFKA_TELEMETRY_METRIC_PREFIX "org.apache.kafka." +#define RD_KAFKA_TELEMETRY_METRIC_NODE_ID_ATTRIBUTE "node.id" + +#define RD_KAFKA_TELEMETRY_METRIC_INFO(rk) \ + (rk->rk_type == RD_KAFKA_PRODUCER \ + ? RD_KAFKA_TELEMETRY_PRODUCER_METRICS_INFO \ + : RD_KAFKA_TELEMETRY_CONSUMER_METRICS_INFO) + +#define RD_KAFKA_TELEMETRY_METRIC_CNT(rk) \ + (rk->rk_type == RD_KAFKA_PRODUCER \ + ? 
RD_KAFKA_TELEMETRY_PRODUCER_METRIC__CNT \ + : RD_KAFKA_TELEMETRY_CONSUMER_METRIC__CNT) + + +typedef enum { + RD_KAFKA_TELEMETRY_METRIC_TYPE_SUM, + RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE, +} rd_kafka_telemetry_metric_type_t; + +typedef enum { + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_CONNECTION_CREATION_RATE, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_CONNECTION_CREATION_TOTAL, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_NODE_REQUEST_LATENCY_AVG, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_NODE_REQUEST_LATENCY_MAX, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_THROTTLE_TIME_AVG, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_THROTTLE_TIME_MAX, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_RECORD_QUEUE_TIME_AVG, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_RECORD_QUEUE_TIME_MAX, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_LATENCY_AVG, + RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_LATENCY_MAX, + RD_KAFKA_TELEMETRY_PRODUCER_METRIC__CNT +} rd_kafka_telemetry_producer_metric_name_t; + +typedef enum { + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_CONNECTION_CREATION_RATE, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_CONNECTION_CREATION_TOTAL, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_NODE_REQUEST_LATENCY_AVG, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_NODE_REQUEST_LATENCY_MAX, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_ASSIGNED_PARTITIONS, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_REBALANCE_LATENCY_AVG, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_REBALANCE_LATENCY_MAX, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_REBALANCE_LATENCY_TOTAL, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_FETCH_MANAGER_FETCH_LATENCY_AVG, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_FETCH_MANAGER_FETCH_LATENCY_MAX, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_POLL_IDLE_RATIO_AVG, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_COMMIT_LATENCY_AVG, + RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_COMMIT_LATENCY_MAX, + RD_KAFKA_TELEMETRY_CONSUMER_METRIC__CNT +} rd_kafka_telemetry_consumer_metric_name_t; + +typedef union { + int64_t int_value; + double double_value; +} rd_kafka_telemetry_metric_value_t; + +typedef rd_kafka_telemetry_metric_value_t ( + *rd_kafka_telemetry_metric_value_calculator_t)( + rd_kafka_t *rk, + rd_kafka_broker_t *rkb_selected, + rd_ts_t now_nanos); + +typedef struct { + const char *name; + const char *value; +} rd_kafka_telemetry_resource_attribute_t; + +typedef struct { + const char *name; + const char *description; + const char *unit; + const rd_bool_t is_int; + const rd_bool_t is_per_broker; + rd_kafka_telemetry_metric_type_t type; + rd_kafka_telemetry_metric_value_calculator_t calculate_value; +} rd_kafka_telemetry_metric_info_t; + +typedef struct { + const char *name; + const char *(*getValue)(const rd_kafka_t *rk); +} rd_kafka_telemetry_attribute_config_t; + +static const rd_kafka_telemetry_metric_info_t + RD_KAFKA_TELEMETRY_PRODUCER_METRICS_INFO + [RD_KAFKA_TELEMETRY_PRODUCER_METRIC__CNT] = { + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_CONNECTION_CREATION_RATE] = + {.name = "producer.connection.creation.rate", + .description = + "The rate of connections established per second.", + .unit = "1", + .is_int = rd_false, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_CONNECTION_CREATION_TOTAL] = + {.name = "producer.connection.creation.total", + .description = "The total number of connections established.", + .unit = "1", + .is_int = rd_true, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_SUM}, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_NODE_REQUEST_LATENCY_AVG] = + {.name = 
"producer.node.request.latency.avg", + .description = "The average request latency in ms for a node.", + .unit = "ms", + .is_int = rd_false, + .is_per_broker = rd_true, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_NODE_REQUEST_LATENCY_MAX] = + {.name = "producer.node.request.latency.max", + .description = "The maximum request latency in ms for a node.", + .unit = "ms", + .is_int = rd_true, + .is_per_broker = rd_true, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_THROTTLE_TIME_AVG] = + {.name = "producer.produce.throttle.time.avg", + .description = "The average throttle time in ms for a node.", + .unit = "ms", + .is_int = rd_false, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_THROTTLE_TIME_MAX] = + {.name = "producer.produce.throttle.time.max", + .description = "The maximum throttle time in ms for a node.", + .unit = "ms", + .is_int = rd_true, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_RECORD_QUEUE_TIME_AVG] = + {.name = "producer.record.queue.time.avg", + .description = "The average time in ms a record spends in the " + "producer queue.", + .unit = "ms", + .is_int = rd_false, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_RECORD_QUEUE_TIME_MAX] = + {.name = "producer.record.queue.time.max", + .description = "The maximum time in ms a record spends in the " + "producer queue.", + .unit = "ms", + .is_int = rd_true, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_LATENCY_AVG] = + {.name = "producer.request.latency.avg", + .description = + "The average request latency in ms for produce requests.", + .unit = "ms", + .is_int = rd_false, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_PRODUCER_PRODUCE_LATENCY_MAX] = + {.name = "producer.request.latency.max", + .description = + "The maximum request latency in ms for produce requests.", + .unit = "ms", + .is_int = rd_true, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, +}; + +static const rd_kafka_telemetry_metric_info_t RD_KAFKA_TELEMETRY_CONSUMER_METRICS_INFO + [RD_KAFKA_TELEMETRY_CONSUMER_METRIC__CNT] = { + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_CONNECTION_CREATION_RATE] = + {.name = "consumer.connection.creation.rate", + .description = "The rate of connections established per second.", + .unit = "1", + .is_int = rd_false, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_CONNECTION_CREATION_TOTAL] = + {.name = "consumer.connection.creation.total", + .description = "The total number of connections established.", + .unit = "1", + .is_int = rd_true, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_SUM}, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_NODE_REQUEST_LATENCY_AVG] = + {.name = "consumer.node.request.latency.avg", + .description = "The average request latency in ms for a node.", + .unit = "ms", + .is_int = rd_false, + .is_per_broker = rd_true, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_NODE_REQUEST_LATENCY_MAX] = + {.name = "consumer.node.request.latency.max", + .description = "The maximum request latency in ms for a node.", + .unit = "ms", + .is_int = 
rd_true, + .is_per_broker = rd_true, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_ASSIGNED_PARTITIONS] = + {.name = "consumer.coordinator.assigned.partitions", + .description = "The number of partitions currently assigned " + "to this consumer.", + .unit = "1", + .is_int = rd_true, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_REBALANCE_LATENCY_AVG] = + {.name = "consumer.coordinator.rebalance.latency.avg", + .description = "The average rebalance latency in ms for the " + "consumer coordinator.", + .unit = "ms", + .is_int = rd_false, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_REBALANCE_LATENCY_MAX] = + {.name = "consumer.coordinator.rebalance.latency.max", + .description = "The maximum rebalance latency in ms for the " + "consumer coordinator.", + .unit = "ms", + .is_int = rd_true, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_REBALANCE_LATENCY_TOTAL] = + {.name = "consumer.coordinator.rebalance.latency.total", + .description = "The total rebalance latency in ms for the " + "consumer coordinator.", + .unit = "ms", + .is_int = rd_true, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_SUM}, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_FETCH_MANAGER_FETCH_LATENCY_AVG] = + {.name = "consumer.fetch.manager.fetch.latency.avg", + .description = + "The average fetch latency in ms for the fetch manager.", + .unit = "ms", + .is_int = rd_false, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_FETCH_MANAGER_FETCH_LATENCY_MAX] = + {.name = "consumer.fetch.manager.fetch.latency.max", + .description = + "The maximum fetch latency in ms for the fetch manager.", + .unit = "ms", + .is_int = rd_true, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_POLL_IDLE_RATIO_AVG] = + {.name = "consumer.poll.idle.ratio.avg", + .description = "The average ratio of idle to poll for a consumer.", + .unit = "1", + .is_int = rd_false, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_COMMIT_LATENCY_AVG] = + {.name = "consumer.coordinator.commit.latency.avg", + .description = "The average commit latency in ms for the consumer " + "coordinator.", + .unit = "ms", + .is_int = rd_false, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, + [RD_KAFKA_TELEMETRY_METRIC_CONSUMER_COORDINATOR_COMMIT_LATENCY_MAX] = + {.name = "consumer.coordinator.commit.latency.max", + .description = "The maximum commit latency in ms for the consumer " + "coordinator.", + .unit = "ms", + .is_int = rd_true, + .is_per_broker = rd_false, + .type = RD_KAFKA_TELEMETRY_METRIC_TYPE_GAUGE}, +}; + +rd_buf_t *rd_kafka_telemetry_encode_metrics(rd_kafka_t *rk); + +#endif /* _RDKAFKA_RDKAFKA_TELEMETRY_ENCODE_H */ diff --git a/src/third_party/librdkafka/dist/src/rdkafka_timer.c b/src/third_party/librdkafka/dist/src/rdkafka_timer.c index 5240af78578..b62343269dd 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_timer.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_timer.c @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2013, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus 
Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -29,6 +29,7 @@ #include "rdkafka_int.h" #include "rd.h" #include "rdtime.h" +#include "rdrand.h" #include "rdsysqueue.h" #include "rdkafka_queue.h" @@ -198,15 +199,32 @@ void rd_kafka_timer_start0(rd_kafka_timers_t *rkts, /** * Delay the next timer invocation by '2 * rtmr->rtmr_interval' + * @param minimum_backoff the minimum backoff to be applied + * @param maximum_backoff the maximum backoff to be applied + * @param max_jitter the jitter percentage to be applied to the backoff */ void rd_kafka_timer_exp_backoff(rd_kafka_timers_t *rkts, - rd_kafka_timer_t *rtmr) { + rd_kafka_timer_t *rtmr, + rd_ts_t minimum_backoff, + rd_ts_t maximum_backoff, + int max_jitter) { + int64_t jitter; rd_kafka_timers_lock(rkts); if (rd_kafka_timer_scheduled(rtmr)) { - rtmr->rtmr_interval *= 2; rd_kafka_timer_unschedule(rkts, rtmr); } - rd_kafka_timer_schedule(rkts, rtmr, 0); + rtmr->rtmr_interval *= 2; + jitter = + (rd_jitter(-max_jitter, max_jitter) * rtmr->rtmr_interval) / 100; + if (rtmr->rtmr_interval + jitter < minimum_backoff) { + rtmr->rtmr_interval = minimum_backoff; + jitter = 0; + } else if ((maximum_backoff != -1) && + (rtmr->rtmr_interval + jitter) > maximum_backoff) { + rtmr->rtmr_interval = maximum_backoff; + jitter = 0; + } + rd_kafka_timer_schedule(rkts, rtmr, jitter); rd_kafka_timers_unlock(rkts); } diff --git a/src/third_party/librdkafka/dist/src/rdkafka_timer.h b/src/third_party/librdkafka/dist/src/rdkafka_timer.h index e3cadd7b9fa..9a273adcfa6 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_timer.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_timer.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2013, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -85,7 +85,10 @@ void rd_kafka_timer_start0(rd_kafka_timers_t *rkts, callback, arg) void rd_kafka_timer_exp_backoff(rd_kafka_timers_t *rkts, - rd_kafka_timer_t *rtmr); + rd_kafka_timer_t *rtmr, + rd_ts_t minimum, + rd_ts_t maximum, + int maxjitter); rd_ts_t rd_kafka_timer_next(rd_kafka_timers_t *rkts, rd_kafka_timer_t *rtmr, int do_lock); diff --git a/src/third_party/librdkafka/dist/src/rdkafka_topic.c b/src/third_party/librdkafka/dist/src/rdkafka_topic.c index e9330e3c42c..0ff862661ab 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_topic.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_topic.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012,2013 Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -34,10 +35,12 @@ #include "rdkafka_broker.h" #include "rdkafka_cgrp.h" #include "rdkafka_metadata.h" +#include "rdkafka_offset.h" #include "rdlog.h" #include "rdsysqueue.h" #include "rdtime.h" #include "rdregex.h" +#include "rdkafka_fetcher.h" #if WITH_ZSTD #include @@ -51,7 +54,8 @@ const char *rd_kafka_topic_state_names[] = {"unknown", "exists", "notexists", static int rd_kafka_topic_metadata_update(rd_kafka_topic_t *rkt, const struct rd_kafka_metadata_topic *mdt, - rd_ts_t ts_insert); + const rd_kafka_metadata_topic_internal_t *mdit, + rd_ts_t ts_age); /** @@ -185,6 +189,22 @@ rd_kafka_topic_t *rd_kafka_topic_find0_fl(const char *func, return rkt; } +/** + * Same semantics as ..find() but takes a Uuid instead. 
+ */ +rd_kafka_topic_t *rd_kafka_topic_find_by_topic_id(rd_kafka_t *rk, + rd_kafka_Uuid_t topic_id) { + rd_kafka_topic_t *rkt; + + TAILQ_FOREACH(rkt, &rk->rk_topics, rkt_link) { + if (!rd_kafka_Uuid_cmp(rkt->rkt_topic_id, topic_id)) { + rd_kafka_topic_keep(rkt); + break; + } + } + + return rkt; +} /** * @brief rd_kafka_topic_t comparator. @@ -334,6 +354,7 @@ rd_kafka_topic_t *rd_kafka_topic_new0(rd_kafka_t *rk, rkt->rkt_rk = rk; rkt->rkt_ts_create = rd_clock(); + rkt->rkt_ts_state = rkt->rkt_ts_create; rkt->rkt_conf = *conf; rd_free(conf); /* explicitly not rd_kafka_topic_destroy() @@ -476,8 +497,10 @@ rd_kafka_topic_t *rd_kafka_topic_new0(rd_kafka_t *rk, if (existing) *existing = 1; - rd_kafka_topic_metadata_update(rkt, &rkmce->rkmce_mtopic, - rkmce->rkmce_ts_insert); + rd_kafka_topic_metadata_update( + rkt, &rkmce->rkmce_mtopic, + &rkmce->rkmce_metadata_internal_topic, + rkmce->rkmce_ts_insert); } if (do_lock) @@ -516,7 +539,7 @@ rd_kafka_topic_t *rd_kafka_topic_new(rd_kafka_t *rk, /* Query for the topic leader (async) */ if (!existing) - rd_kafka_topic_leader_query(rk, rkt); + rd_kafka_topic_fast_leader_query(rk, rd_true /* force */); /* Drop our reference since there is already/now an app refcnt */ rd_kafka_topic_destroy0(rkt); @@ -543,7 +566,8 @@ static void rd_kafka_topic_set_state(rd_kafka_topic_t *rkt, int state) { if (rkt->rkt_state == RD_KAFKA_TOPIC_S_ERROR) rkt->rkt_err = RD_KAFKA_RESP_ERR_NO_ERROR; - rkt->rkt_state = state; + rkt->rkt_state = state; + rkt->rkt_ts_state = rd_clock(); } /** @@ -621,10 +645,12 @@ int rd_kafka_toppar_broker_update(rd_kafka_toppar_t *rktp, * @remark If a toppar is currently delegated to a preferred replica, * it will not be delegated to the leader broker unless there * has been a leader change. + * @remark The new leader, if present, should not be terminating. * * @param leader_id The id of the new leader broker. * @param leader A reference to the leader broker or NULL if the * toppar should be undelegated for any reason. + * @param leader_epoch Partition leader's epoch (KIP-320), or -1 if not known. * * @returns 1 if the broker delegation was changed, -1 if the broker * delegation was changed and is now undelegated, else 0. @@ -636,8 +662,10 @@ int rd_kafka_toppar_broker_update(rd_kafka_toppar_t *rktp, static int rd_kafka_toppar_leader_update(rd_kafka_topic_t *rkt, int32_t partition, int32_t leader_id, - rd_kafka_broker_t *leader) { + rd_kafka_broker_t *leader, + int32_t leader_epoch) { rd_kafka_toppar_t *rktp; + rd_bool_t need_epoch_validation = rd_false; rd_bool_t fetching_from_follower; int r = 0; @@ -657,19 +685,56 @@ static int rd_kafka_toppar_leader_update(rd_kafka_topic_t *rkt, rd_kafka_toppar_lock(rktp); + /* -1 (null) is excluded to allow to switch back to a + * leader not supporting KIP-320 still, for example + * during a cluster roll for upgrading brokers to + * a version supporting that KIP. 
*/ + if (leader_epoch != -1 && leader_epoch < rktp->rktp_leader_epoch) { + rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "BROKER", + "%s [%" PRId32 + "]: ignoring outdated metadata update with " + "leader epoch %" PRId32 + " which is older than " + "our cached epoch %" PRId32, + rktp->rktp_rkt->rkt_topic->str, + rktp->rktp_partition, leader_epoch, + rktp->rktp_leader_epoch); + rd_kafka_toppar_unlock(rktp); + rd_kafka_toppar_destroy(rktp); /* from get() */ + return 0; + } + + rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "BROKER", + "%s [%" PRId32 "]: leader %" PRId32 " epoch %" PRId32 + " -> leader %" PRId32 " epoch %" PRId32, + rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, + rktp->rktp_leader_id, rktp->rktp_leader_epoch, leader_id, + leader_epoch); + + if (leader_epoch > rktp->rktp_leader_epoch || + rktp->rktp_fetch_state == + RD_KAFKA_TOPPAR_FETCH_VALIDATE_EPOCH_WAIT) { + /* Epoch increased and needs to be validated (leader_epoch > -1) + * or we need to complete the validation. */ + need_epoch_validation = rd_true; + } + + rktp->rktp_leader_epoch = leader_epoch; + fetching_from_follower = leader != NULL && rktp->rktp_broker != NULL && rktp->rktp_broker->rkb_source != RD_KAFKA_INTERNAL && rktp->rktp_broker != leader; if (fetching_from_follower && rktp->rktp_leader_id == leader_id) { - rd_kafka_dbg( - rktp->rktp_rkt->rkt_rk, TOPIC, "BROKER", - "Topic %s [%" PRId32 "]: leader %" PRId32 - " unchanged, " - "not migrating away from preferred replica %" PRId32, - rktp->rktp_rkt->rkt_topic->str, rktp->rktp_partition, - leader_id, rktp->rktp_broker_id); + rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "BROKER", + "Topic %s [%" PRId32 "]: leader %" PRId32 + " unchanged, " + "not migrating away from preferred " + "replica %" PRId32, + rktp->rktp_rkt->rkt_topic->str, + rktp->rktp_partition, leader_id, + rktp->rktp_broker_id); r = 0; } else { @@ -690,6 +755,16 @@ static int rd_kafka_toppar_leader_update(rd_kafka_topic_t *rkt, "leader updated"); } + if (need_epoch_validation) { + /* Set offset validation position, + * depending on whether it should continue with the current + * position or with the next fetch start position. */ + rd_kafka_toppar_set_offset_validation_position( + rktp, + rd_kafka_toppar_fetch_decide_next_fetch_start_pos(rktp)); + rd_kafka_offset_validate(rktp, "epoch updated from metadata"); + } + rd_kafka_toppar_unlock(rktp); rd_kafka_toppar_destroy(rktp); /* from get() */ @@ -715,7 +790,12 @@ int rd_kafka_toppar_delegate_to_leader(rd_kafka_toppar_t *rktp) { rd_kafka_rdlock(rktp->rktp_rkt->rkt_rk); rd_kafka_toppar_lock(rktp); - rd_assert(rktp->rktp_leader_id != rktp->rktp_broker_id); + if (rktp->rktp_leader_id == rktp->rktp_broker_id) { + /* Given that the lock was released we need to check again */ + rd_kafka_toppar_unlock(rktp); + rd_kafka_rdunlock(rktp->rktp_rkt->rkt_rk); + return 0; + } rd_kafka_dbg(rktp->rktp_rkt->rkt_rk, TOPIC, "BROKER", "Topic %s [%" PRId32 @@ -743,6 +823,45 @@ int rd_kafka_toppar_delegate_to_leader(rd_kafka_toppar_t *rktp) { } +/** + * @brief Forgets the current rktp leader, to reduce its reference count + * and allow the broker to be destroyed.
+ * + * @locks none + * @locks_acquired rk rdlock, rktp + * @locality any + */ +void rd_kafka_toppar_forget_leader(rd_kafka_toppar_t *rktp) { + rd_kafka_rdlock(rktp->rktp_rkt->rkt_rk); + rd_kafka_toppar_lock(rktp); + + if (rktp->rktp_leader) { + rd_kafka_broker_destroy(rktp->rktp_leader); + rktp->rktp_leader = NULL; + rktp->rktp_leader_id = -1; + rktp->rktp_leader_epoch = -1; + } + + rd_kafka_toppar_unlock(rktp); + rd_kafka_rdunlock(rktp->rktp_rkt->rkt_rk); +} + +/** + * @brief Revert the topic+partition delegation to the internal broker. + * + * @locks none + * @locks_acquired rk rdlock, rktp + * @locality any + */ +void rd_kafka_toppar_undelegate(rd_kafka_toppar_t *rktp) { + rd_kafka_rdlock(rktp->rktp_rkt->rkt_rk); + rd_kafka_toppar_lock(rktp); + + rd_kafka_toppar_broker_delegate(rktp, NULL); + + rd_kafka_toppar_unlock(rktp); + rd_kafka_rdunlock(rktp->rktp_rkt->rkt_rk); +} /** * @brief Save idempotent producer state for a partition that is about to @@ -1109,11 +1228,14 @@ rd_bool_t rd_kafka_topic_set_notexists(rd_kafka_topic_t *rkt, rd_assert(err != RD_KAFKA_RESP_ERR_NO_ERROR); remains_us = - (rkt->rkt_ts_create + + (rkt->rkt_ts_state + (rkt->rkt_rk->rk_conf.metadata_propagation_max_ms * 1000)) - rkt->rkt_ts_metadata; - if (!permanent && rkt->rkt_state == RD_KAFKA_TOPIC_S_UNKNOWN && + if (!permanent && + (rkt->rkt_state == RD_KAFKA_TOPIC_S_UNKNOWN || + rkt->rkt_state == RD_KAFKA_TOPIC_S_ERROR || + rkt->rkt_state == RD_KAFKA_TOPIC_S_EXISTS) && remains_us > 0) { /* Still allowing topic metadata to propagate. */ rd_kafka_dbg( @@ -1141,6 +1263,47 @@ rd_bool_t rd_kafka_topic_set_notexists(rd_kafka_topic_t *rkt, return rd_true; } +/** + * @brief Mark topic as existent, unless metadata propagation configuration + * disallows it. + * + * @returns true if the topic was marked as existent, else false. + * + * @locks topic_wrlock() MUST be held. + */ +rd_bool_t rd_kafka_topic_set_exists(rd_kafka_topic_t *rkt, + rd_kafka_Uuid_t topic_id) { + rd_ts_t remains_us; + + if (unlikely(rd_kafka_terminating(rkt->rkt_rk))) { + /* Don't update metadata while terminating. */ + return rd_false; + } + + remains_us = + (rkt->rkt_ts_state + + (rkt->rkt_rk->rk_conf.metadata_propagation_max_ms * 1000)) - + rkt->rkt_ts_metadata; + + if (/* Same topic id */ + rd_kafka_Uuid_cmp(rkt->rkt_topic_id, topic_id) == 0 && + rkt->rkt_state == RD_KAFKA_TOPIC_S_NOTEXISTS && remains_us > 0) { + /* Still allowing topic metadata to propagate. */ + rd_kafka_dbg( + rkt->rkt_rk, TOPIC | RD_KAFKA_DBG_METADATA, "TOPICPROP", + "Topic %.*s exists after being deleted, " + "allowing %dms for metadata propagation before marking " + "topic " + "as existent", + RD_KAFKAP_STR_PR(rkt->rkt_topic), (int)(remains_us / 1000)); + return rd_false; + } + + rd_kafka_topic_set_state(rkt, RD_KAFKA_TOPIC_S_EXISTS); + + return rd_true; +} + /** * @brief Mark topic as errored, such as when topic authorization fails. * @@ -1187,16 +1350,19 @@ rd_bool_t rd_kafka_topic_set_error(rd_kafka_topic_t *rkt, /** * @brief Update a topic from metadata. * + * @param mdt Topic metadata. + * @param mdit Topic internal metadata. * @param ts_age absolute age (timestamp) of metadata. * @returns 1 if the number of partitions changed, 0 if not, and -1 if the * topic is unknown. * - * @locks rd_kafka_*lock() MUST be held. + * @locks_required rd_kafka_*lock() MUST be held.
*/ static int rd_kafka_topic_metadata_update(rd_kafka_topic_t *rkt, const struct rd_kafka_metadata_topic *mdt, + const rd_kafka_metadata_topic_internal_t *mdit, rd_ts_t ts_age) { rd_kafka_t *rk = rkt->rkt_rk; int upd = 0; @@ -1204,6 +1370,8 @@ rd_kafka_topic_metadata_update(rd_kafka_topic_t *rkt, rd_kafka_broker_t **partbrokers; int leader_cnt = 0; int old_state; + rd_bool_t partition_exists_with_no_leader_epoch = rd_false; + rd_bool_t partition_exists_with_stale_leader_epoch = rd_false; if (mdt->err != RD_KAFKA_RESP_ERR_NO_ERROR) rd_kafka_dbg(rk, TOPIC | RD_KAFKA_DBG_METADATA, "METADATA", @@ -1238,21 +1406,45 @@ rd_kafka_topic_metadata_update(rd_kafka_topic_t *rkt, rkt->rkt_ts_metadata = ts_age; /* Set topic state. - * UNKNOWN_TOPIC_OR_PART may indicate that auto.create.topics failed */ + * UNKNOWN_TOPIC_* may indicate that auto.create.topics failed */ if (mdt->err == RD_KAFKA_RESP_ERR_TOPIC_EXCEPTION /*invalid topic*/ || - mdt->err == RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART) + mdt->err == RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_OR_PART || + mdt->err == RD_KAFKA_RESP_ERR_UNKNOWN_TOPIC_ID) rd_kafka_topic_set_notexists(rkt, mdt->err); - else if (mdt->partition_cnt > 0) - rd_kafka_topic_set_state(rkt, RD_KAFKA_TOPIC_S_EXISTS); - else if (mdt->err) + else if (mdt->err == RD_KAFKA_RESP_ERR_NO_ERROR && + mdt->partition_cnt > 0) + rd_kafka_topic_set_exists(rkt, mdit->topic_id); + else if (mdt->err == RD_KAFKA_RESP_ERR_TOPIC_AUTHORIZATION_FAILED) + /* Only set an error when it's permanent and it needs + * to be surfaced to the application. */ rd_kafka_topic_set_error(rkt, mdt->err); /* Update number of partitions, but not if there are * (possibly intermittent) errors (e.g., "Leader not available"). */ if (mdt->err == RD_KAFKA_RESP_ERR_NO_ERROR) { - upd += rd_kafka_topic_partition_cnt_update(rkt, - mdt->partition_cnt); - + rd_bool_t different_topic_id = + rd_kafka_Uuid_cmp(mdit->topic_id, rkt->rkt_topic_id) != 0; + if (different_topic_id || + mdt->partition_cnt > rkt->rkt_partition_cnt) + upd += rd_kafka_topic_partition_cnt_update( + rkt, mdt->partition_cnt); + if (different_topic_id) { + /* FIXME: an offset reset must be triggered + * when rkt_topic_id wasn't zero. + * There are no problems + * in test 0107_topic_recreate if offsets in the new + * topic are lower than in the previous one, + * causing an out of range and an offset reset, + * but the rarer case where they're higher needs + * to be checked. */ + rd_kafka_dbg( + rk, TOPIC | RD_KAFKA_DBG_METADATA, "METADATA", + "Topic %s changed id from %s to %s", + rkt->rkt_topic->str, + rd_kafka_Uuid_base64str(&rkt->rkt_topic_id), + rd_kafka_Uuid_base64str(&mdit->topic_id)); + rkt->rkt_topic_id = mdit->topic_id; + } /* If the metadata times out for a topic (because all brokers * are down) the state will transition to S_UNKNOWN. * When updated metadata is eventually received there might @@ -1262,55 +1454,63 @@ rd_kafka_topic_metadata_update(rd_kafka_topic_t *rkt, * Issue #1985. */ if (old_state == RD_KAFKA_TOPIC_S_UNKNOWN) upd++; - } - /* Update leader for each partition */ - for (j = 0; j < mdt->partition_cnt; j++) { - int r; - rd_kafka_broker_t *leader; + /* Update leader for each partition + * only when topic response has no errors.
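The different_topic_id branch above is how re-creation of a topic under the same name is detected: the name matches but the topic id does not, so the cached partition count must be re-synced even when it shrank. A minimal stand-in for the check (the struct is illustrative; the library uses rd_kafka_Uuid_t and rd_kafka_Uuid_cmp()):

    #include <stdbool.h>
    #include <stdint.h>

    /* 128-bit topic id, as introduced by KIP-516. */
    typedef struct {
            int64_t msb;
            int64_t lsb;
    } uuid128_t;

    /* Same name + different id == the topic was deleted and re-created,
     * so cached per-topic state (partition count, offsets) is suspect. */
    static bool topic_recreated(uuid128_t cached, uuid128_t reported) {
            return cached.msb != reported.msb || cached.lsb != reported.lsb;
    }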
*/ + for (j = 0; + j < mdt->partition_cnt && j < rkt->rkt_partition_cnt; + j++) { + int r = 0; + rd_kafka_broker_t *leader; + int32_t leader_epoch = mdit->partitions[j].leader_epoch; + rd_kafka_toppar_t *rktp = + rd_kafka_toppar_get(rkt, mdt->partitions[j].id, 0); - rd_kafka_dbg(rk, TOPIC | RD_KAFKA_DBG_METADATA, "METADATA", - " Topic %s partition %i Leader %" PRId32, - rkt->rkt_topic->str, mdt->partitions[j].id, - mdt->partitions[j].leader); + rd_kafka_dbg(rk, TOPIC | RD_KAFKA_DBG_METADATA, + "METADATA", + "Topic %s [%" PRId32 "] Leader %" PRId32 + " Epoch %" PRId32, + rkt->rkt_topic->str, mdt->partitions[j].id, + mdt->partitions[j].leader, leader_epoch); - leader = partbrokers[j]; - partbrokers[j] = NULL; + leader = partbrokers[j]; + partbrokers[j] = NULL; - /* Update leader for partition */ - r = rd_kafka_toppar_leader_update(rkt, mdt->partitions[j].id, - mdt->partitions[j].leader, - leader); + /* If broker does not support leaderEpoch(KIP 320) then + * it is set to -1, we assume that metadata is not + * stale. */ + if (leader_epoch == -1) + partition_exists_with_no_leader_epoch = rd_true; + else if (rktp && leader_epoch < rktp->rktp_leader_epoch) + partition_exists_with_stale_leader_epoch = + rd_true; - upd += (r != 0 ? 1 : 0); - if (leader) { - if (r != -1) - leader_cnt++; - /* Drop reference to broker (from find()) */ - rd_kafka_broker_destroy(leader); + /* Update leader for partition */ + r = rd_kafka_toppar_leader_update( + rkt, mdt->partitions[j].id, + mdt->partitions[j].leader, leader, leader_epoch); + + upd += (r != 0 ? 1 : 0); + + if (leader) { + if (r != -1) + leader_cnt++; + /* Drop reference to broker (from find()) */ + rd_kafka_broker_destroy(leader); + } + RD_IF_FREE(rktp, rd_kafka_toppar_destroy); } } - /* If all partitions have leaders we can turn off fast leader query. */ - if (mdt->partition_cnt > 0 && leader_cnt == mdt->partition_cnt) + /* If all partitions have leaders, and this metadata update was not + * stale, we can turn off fast leader query. 
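The two rd_bool_t flags threaded through the partition loop feed the fast-leader-query decision that follows: the query may only be switched off when every partition has a leader and the update is not known to be stale. Restated as a free-standing predicate (illustrative names):

    #include <stdbool.h>

    /* Mirror of the condition after the loop: with pre-KIP-320 brokers
     * (some partition reported epoch -1) staleness is undetectable and
     * the update is trusted; otherwise no partition may have reported an
     * epoch older than the cached one. */
    static bool can_stop_fast_leader_query(int partition_cnt, int leader_cnt,
                                           bool any_epoch_missing,
                                           bool any_epoch_stale) {
            return partition_cnt > 0 && leader_cnt == partition_cnt &&
                   (any_epoch_missing || !any_epoch_stale);
    }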
*/ + if (rkt->rkt_partition_cnt > 0 && + leader_cnt == rkt->rkt_partition_cnt && + (partition_exists_with_no_leader_epoch || + !partition_exists_with_stale_leader_epoch)) rkt->rkt_flags &= ~RD_KAFKA_TOPIC_F_LEADER_UNAVAIL; - if (mdt->err != RD_KAFKA_RESP_ERR_NO_ERROR && rkt->rkt_partition_cnt) { - /* (Possibly intermittent) topic-wide error: - * remove leaders for partitions */ - - for (j = 0; j < rkt->rkt_partition_cnt; j++) { - rd_kafka_toppar_t *rktp; - if (!rkt->rkt_p[j]) - continue; - - rktp = rkt->rkt_p[j]; - rd_kafka_toppar_lock(rktp); - rd_kafka_toppar_broker_delegate(rktp, NULL); - rd_kafka_toppar_unlock(rktp); - } - } /* If there was an update to the partitions try to assign * unassigned messages to new partitions, or fail them */ @@ -1336,19 +1536,28 @@ rd_kafka_topic_metadata_update(rd_kafka_topic_t *rkt, * @sa rd_kafka_topic_metadata_update() * @locks none */ -int rd_kafka_topic_metadata_update2(rd_kafka_broker_t *rkb, - const struct rd_kafka_metadata_topic *mdt) { +int rd_kafka_topic_metadata_update2( + rd_kafka_broker_t *rkb, + const struct rd_kafka_metadata_topic *mdt, + const rd_kafka_metadata_topic_internal_t *mdit) { rd_kafka_topic_t *rkt; int r; rd_kafka_wrlock(rkb->rkb_rk); - if (!(rkt = - rd_kafka_topic_find(rkb->rkb_rk, mdt->topic, 0 /*!lock*/))) { + + if (likely(mdt->topic != NULL)) { + rkt = rd_kafka_topic_find(rkb->rkb_rk, mdt->topic, 0 /*!lock*/); + } else { + rkt = rd_kafka_topic_find_by_topic_id(rkb->rkb_rk, + mdit->topic_id); + } + + if (!rkt) { rd_kafka_wrunlock(rkb->rkb_rk); return -1; /* Ignore topics that we dont have locally. */ } - r = rd_kafka_topic_metadata_update(rkt, mdt, rd_clock()); + r = rd_kafka_topic_metadata_update(rkt, mdt, mdit, rd_clock()); rd_kafka_wrunlock(rkb->rkb_rk); @@ -1520,8 +1729,8 @@ void rd_kafka_topic_scan_all(rd_kafka_t *rk, rd_ts_t now) { /* Check if metadata information has timed out. */ if (rkt->rkt_state != RD_KAFKA_TOPIC_S_UNKNOWN && - !rd_kafka_metadata_cache_topic_get(rk, rkt->rkt_topic->str, - 1 /*only valid*/)) { + !rd_kafka_metadata_cache_topic_get( + rk, rkt->rkt_topic->str, NULL, 1 /*only valid*/)) { rd_kafka_dbg(rk, TOPIC, "NOINFO", "Topic %s metadata information timed out " "(%" PRId64 "ms old)", @@ -1639,7 +1848,8 @@ void rd_kafka_topic_scan_all(rd_kafka_t *rk, rd_ts_t now) { * info exists*/ , rk->rk_conf.allow_auto_create_topics, - rd_false /*!cgrp_update*/, "refresh unavailable topics"); + rd_false /*!cgrp_update*/, -1, + "refresh unavailable topics"); rd_list_destroy(&query_topics); } @@ -1695,12 +1905,36 @@ void *rd_kafka_topic_opaque(const rd_kafka_topic_t *app_rkt) { int rd_kafka_topic_info_cmp(const void *_a, const void *_b) { const rd_kafka_topic_info_t *a = _a, *b = _b; - int r; + int r, i; if ((r = strcmp(a->topic, b->topic))) return r; - return RD_CMP(a->partition_cnt, b->partition_cnt); + if ((r = RD_CMP(a->partition_cnt, b->partition_cnt))) + return r; + + if (a->partitions_internal == NULL && b->partitions_internal == NULL) + return 0; + + if (a->partitions_internal == NULL || b->partitions_internal == NULL) + return (a->partitions_internal == NULL) ? 1 : -1; + + /* We're certain partitions_internal exist for a/b and have the same + * count. 
*/ + for (i = 0; i < a->partition_cnt; i++) { + size_t k; + if ((r = RD_CMP(a->partitions_internal[i].racks_cnt, + b->partitions_internal[i].racks_cnt))) + return r; + + for (k = 0; k < a->partitions_internal[i].racks_cnt; k++) { + if ((r = rd_strcmp(a->partitions_internal[i].racks[k], + b->partitions_internal[i].racks[k]))) + return r; + } + } + + return 0; } @@ -1730,7 +1964,83 @@ rd_kafka_topic_info_t *rd_kafka_topic_info_new(const char *topic, ti = rd_malloc(sizeof(*ti) + tlen); ti->topic = (char *)(ti + 1); memcpy((char *)ti->topic, topic, tlen); - ti->partition_cnt = partition_cnt; + ti->partition_cnt = partition_cnt; + ti->partitions_internal = NULL; + + return ti; +} + +/** + * Allocate new topic_info, including rack information. + * \p topic is copied. + */ +rd_kafka_topic_info_t *rd_kafka_topic_info_new_with_rack( + const char *topic, + int partition_cnt, + const rd_kafka_metadata_partition_internal_t *mdpi) { + rd_kafka_topic_info_t *ti; + rd_tmpabuf_t tbuf; + int i; + rd_bool_t has_racks = rd_false; + + rd_tmpabuf_new(&tbuf, 0, rd_true /* assert on fail */); + + rd_tmpabuf_add_alloc(&tbuf, sizeof(*ti)); + rd_tmpabuf_add_alloc(&tbuf, strlen(topic) + 1); + for (i = 0; i < partition_cnt; i++) { + size_t j; + if (!mdpi[i].racks) + continue; + + if (unlikely(!has_racks)) + has_racks = rd_true; + + for (j = 0; j < mdpi[i].racks_cnt; j++) { + rd_tmpabuf_add_alloc(&tbuf, + strlen(mdpi[i].racks[j]) + 1); + } + rd_tmpabuf_add_alloc(&tbuf, sizeof(char *) * mdpi[i].racks_cnt); + } + + /* Only bother allocating this if at least one + * rack is there. */ + if (has_racks) { + rd_tmpabuf_add_alloc( + &tbuf, sizeof(rd_kafka_metadata_partition_internal_t) * + partition_cnt); + } + + rd_tmpabuf_finalize(&tbuf); + + ti = rd_tmpabuf_alloc(&tbuf, sizeof(*ti)); + ti->topic = rd_tmpabuf_write_str(&tbuf, topic); + ti->partition_cnt = partition_cnt; + ti->partitions_internal = NULL; + + if (has_racks) { + ti->partitions_internal = rd_tmpabuf_alloc( + &tbuf, sizeof(*ti->partitions_internal) * partition_cnt); + + for (i = 0; i < partition_cnt; i++) { + size_t j; + ti->partitions_internal[i].id = mdpi[i].id; + ti->partitions_internal[i].racks = NULL; + + if (!mdpi[i].racks) + continue; + + ti->partitions_internal[i].racks_cnt = + mdpi[i].racks_cnt; + ti->partitions_internal[i].racks = rd_tmpabuf_alloc( + &tbuf, sizeof(char *) * mdpi[i].racks_cnt); + + for (j = 0; j < mdpi[i].racks_cnt; j++) { + ti->partitions_internal[i].racks[j] = + rd_tmpabuf_write_str(&tbuf, + mdpi[i].racks[j]); + } + } + } return ti; } @@ -1738,7 +2048,7 @@ rd_kafka_topic_info_t *rd_kafka_topic_info_new(const char *topic, /** * Destroy/free topic_info */ -void rd_kafka_topic_info_destroy(rd_kafka_topic_info_t *ti) { +void rd_kafka_topic_info_destroy_free(void *ti) { rd_free(ti); } @@ -1777,16 +2087,16 @@ int rd_kafka_topic_match(rd_kafka_t *rk, */ void rd_kafka_topic_leader_query0(rd_kafka_t *rk, rd_kafka_topic_t *rkt, - int do_rk_lock) { + int do_rk_lock, + rd_bool_t force) { rd_list_t topics; rd_list_init(&topics, 1, rd_free); rd_list_add(&topics, rd_strdup(rkt->rkt_topic->str)); rd_kafka_metadata_refresh_topics( - rk, NULL, &topics, rd_false /*dont force*/, - rk->rk_conf.allow_auto_create_topics, rd_false /*!cgrp_update*/, - "leader query"); + rk, NULL, &topics, force, rk->rk_conf.allow_auto_create_topics, + rd_false /*!cgrp_update*/, -1, "leader query"); rd_list_destroy(&topics); } @@ -1812,7 +2122,7 @@ void rd_kafka_local_topics_to_list(rd_kafka_t *rk, rd_list_grow(topics, rk->rk_topic_cnt); TAILQ_FOREACH(rkt, &rk->rk_topics, 
rkt_link) rd_list_add(topics, rd_strdup(rkt->rkt_topic->str)); - cache_cnt = rd_kafka_metadata_cache_topics_to_list(rk, topics); + cache_cnt = rd_kafka_metadata_cache_topics_to_list(rk, topics, rd_true); if (cache_cntp) *cache_cntp = cache_cnt; rd_kafka_rdunlock(rk); @@ -1826,9 +2136,12 @@ void rd_kafka_local_topics_to_list(rd_kafka_t *rk, void rd_ut_kafka_topic_set_topic_exists(rd_kafka_topic_t *rkt, int partition_cnt, int32_t leader_id) { - struct rd_kafka_metadata_topic mdt = {.topic = - (char *)rkt->rkt_topic->str, - .partition_cnt = partition_cnt}; + rd_kafka_metadata_partition_internal_t *partitions = + rd_calloc(partition_cnt, sizeof(*partitions)); + struct rd_kafka_metadata_topic mdt = {.topic = + (char *)rkt->rkt_topic->str, + .partition_cnt = partition_cnt}; + rd_kafka_metadata_topic_internal_t mdit = {.partitions = partitions}; int i; mdt.partitions = rd_alloca(sizeof(*mdt.partitions) * partition_cnt); @@ -1840,7 +2153,9 @@ void rd_ut_kafka_topic_set_topic_exists(rd_kafka_topic_t *rkt, } rd_kafka_wrlock(rkt->rkt_rk); - rd_kafka_metadata_cache_topic_update(rkt->rkt_rk, &mdt, rd_true); - rd_kafka_topic_metadata_update(rkt, &mdt, rd_clock()); + rd_kafka_metadata_cache_topic_update(rkt->rkt_rk, &mdt, &mdit, rd_true, + rd_false, rd_true); + rd_kafka_topic_metadata_update(rkt, &mdt, &mdit, rd_clock()); rd_kafka_wrunlock(rkt->rkt_rk); + rd_free(partitions); } diff --git a/src/third_party/librdkafka/dist/src/rdkafka_topic.h b/src/third_party/librdkafka/dist/src/rdkafka_topic.h index 19e0c020062..7035b188b0f 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_topic.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_topic.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012,2013 Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -98,6 +99,27 @@ typedef struct rd_kafka_partition_msgid_s { } rd_kafka_partition_msgid_t; +/** + * @struct Aux struct that holds a partition id and a leader epoch. + * Used as temporary holding space for per-partition leader epochs + * while parsing MetadataResponse. + */ +typedef struct rd_kafka_partition_leader_epoch_s { + int32_t partition_id; + int32_t leader_epoch; +} rd_kafka_partition_leader_epoch_t; + +/** + * Finds and returns a topic based on its topic_id, or NULL if not found. + * The 'rkt' refcount is increased by one and the caller must call + * rd_kafka_topic_destroy() when it is done with the topic to decrease + * the refcount. + * + * Locality: any thread + */ +rd_kafka_topic_t *rd_kafka_topic_find_by_topic_id(rd_kafka_t *rk, + rd_kafka_Uuid_t topic_id); + /* * @struct Internal representation of a topic. * @@ -112,6 +134,7 @@ struct rd_kafka_topic_s { rwlock_t rkt_lock; rd_kafkap_str_t *rkt_topic; + rd_kafka_Uuid_t rkt_topic_id; rd_kafka_toppar_t *rkt_ua; /**< Unassigned partition (-1) */ rd_kafka_toppar_t **rkt_p; /**< Partition array */ @@ -136,12 +159,14 @@ struct rd_kafka_topic_s { rd_refcnt_t rkt_app_refcnt; /**< Number of active rkt's new()ed * by application. */ - enum { RD_KAFKA_TOPIC_S_UNKNOWN, /* No cluster information yet */ - RD_KAFKA_TOPIC_S_EXISTS, /* Topic exists in cluster */ - RD_KAFKA_TOPIC_S_NOTEXISTS, /* Topic is not known in cluster */ - RD_KAFKA_TOPIC_S_ERROR, /* Topic exists but is in an errored - * state, such as auth failure. 
*/ + enum { + RD_KAFKA_TOPIC_S_UNKNOWN, /* No cluster information yet */ + RD_KAFKA_TOPIC_S_EXISTS, /* Topic exists in cluster */ + RD_KAFKA_TOPIC_S_NOTEXISTS, /* Topic is not known in cluster */ + RD_KAFKA_TOPIC_S_ERROR, /* Topic exists but is in an errored + * state, such as auth failure. */ } rkt_state; + rd_ts_t rkt_ts_state; /**< State change time. */ int rkt_flags; #define RD_KAFKA_TOPIC_F_LEADER_UNAVAIL \ @@ -244,8 +269,10 @@ rd_kafka_topic_get_error(rd_kafka_topic_t *rkt) { return err; } -int rd_kafka_topic_metadata_update2(rd_kafka_broker_t *rkb, - const struct rd_kafka_metadata_topic *mdt); +int rd_kafka_topic_metadata_update2( + rd_kafka_broker_t *rkb, + const struct rd_kafka_metadata_topic *mdt, + const rd_kafka_metadata_topic_internal_t *mdit); void rd_kafka_topic_scan_all(rd_kafka_t *rk, rd_ts_t now); @@ -253,13 +280,18 @@ void rd_kafka_topic_scan_all(rd_kafka_t *rk, rd_ts_t now); typedef struct rd_kafka_topic_info_s { const char *topic; /**< Allocated along with struct */ int partition_cnt; + rd_kafka_metadata_partition_internal_t *partitions_internal; } rd_kafka_topic_info_t; int rd_kafka_topic_info_topic_cmp(const void *_a, const void *_b); int rd_kafka_topic_info_cmp(const void *_a, const void *_b); rd_kafka_topic_info_t *rd_kafka_topic_info_new(const char *topic, int partition_cnt); -void rd_kafka_topic_info_destroy(rd_kafka_topic_info_t *ti); +rd_kafka_topic_info_t *rd_kafka_topic_info_new_with_rack( + const char *topic, + int partition_cnt, + const rd_kafka_metadata_partition_internal_t *mdpi); +void rd_kafka_topic_info_destroy_free(void *ti); int rd_kafka_topic_match(rd_kafka_t *rk, const char *pattern, @@ -272,18 +304,24 @@ int rd_kafka_toppar_broker_update(rd_kafka_toppar_t *rktp, int rd_kafka_toppar_delegate_to_leader(rd_kafka_toppar_t *rktp); +void rd_kafka_toppar_undelegate(rd_kafka_toppar_t *rktp); + +void rd_kafka_toppar_forget_leader(rd_kafka_toppar_t *rktp); + rd_kafka_resp_err_t rd_kafka_topics_leader_query_sync(rd_kafka_t *rk, int all_topics, const rd_list_t *topics, int timeout_ms); void rd_kafka_topic_leader_query0(rd_kafka_t *rk, rd_kafka_topic_t *rkt, - int do_rk_lock); + int do_rk_lock, + rd_bool_t force); #define rd_kafka_topic_leader_query(rk, rkt) \ - rd_kafka_topic_leader_query0(rk, rkt, 1 /*lock*/) + rd_kafka_topic_leader_query0(rk, rkt, 1 /*lock*/, \ + rd_false /*dont force*/) -#define rd_kafka_topic_fast_leader_query(rk) \ - rd_kafka_metadata_fast_leader_query(rk) +#define rd_kafka_topic_fast_leader_query(rk, force) \ + rd_kafka_metadata_fast_leader_query(rk, force) void rd_kafka_local_topics_to_list(rd_kafka_t *rk, rd_list_t *topics, diff --git a/src/third_party/librdkafka/dist/src/rdkafka_transport.c b/src/third_party/librdkafka/dist/src/rdkafka_transport.c index ae5895b29ae..3407fc4bec6 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_transport.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_transport.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2015, Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -277,9 +278,18 @@ static ssize_t rd_kafka_transport_socket_recvmsg(rd_kafka_transport_t *rktrans, if (unlikely(r <= 0)) { if (r == -1 && rd_socket_errno == EAGAIN) return 0; - else if (r == 0 || (r == -1 && rd_socket_errno == ECONNRESET)) { + else if (r == 0) { /* Receive 0 after POLLIN event means * connection closed. 
*/ + rd_rkb_dbg(rktrans->rktrans_rkb, BROKER, "SOCKET", + "Disconnected: connection closed by " + "peer"); + rd_snprintf(errstr, errstr_size, "Disconnected"); + return -1; + } else if (r == -1 && rd_socket_errno == ECONNRESET) { + rd_rkb_dbg(rktrans->rktrans_rkb, BROKER, "SOCKET", + "Disconnected: connection " + "reset by peer"); rd_snprintf(errstr, errstr_size, "Disconnected"); return -1; } else if (r == -1) { @@ -334,6 +344,9 @@ static ssize_t rd_kafka_transport_socket_recv0(rd_kafka_transport_t *rktrans, } else if (unlikely(r == 0)) { /* Receive 0 after POLLIN event means * connection closed. */ + rd_rkb_dbg(rktrans->rktrans_rkb, BROKER, "SOCKET", + "Disconnected: connection closed by " + "peer"); rd_snprintf(errstr, errstr_size, "Disconnected"); return -1; } @@ -542,33 +555,6 @@ void rd_kafka_transport_post_connect_setup(rd_kafka_transport_t *rktrans) { rd_kafka_broker_t *rkb = rktrans->rktrans_rkb; unsigned int slen; - /* Set socket send & receive buffer sizes if configuerd */ - if (rkb->rkb_rk->rk_conf.socket_sndbuf_size != 0) { - if (setsockopt( - rktrans->rktrans_s, SOL_SOCKET, SO_SNDBUF, - (void *)&rkb->rkb_rk->rk_conf.socket_sndbuf_size, - sizeof(rkb->rkb_rk->rk_conf.socket_sndbuf_size)) == - RD_SOCKET_ERROR) - rd_rkb_log(rkb, LOG_WARNING, "SNDBUF", - "Failed to set socket send " - "buffer size to %i: %s", - rkb->rkb_rk->rk_conf.socket_sndbuf_size, - rd_socket_strerror(rd_socket_errno)); - } - - if (rkb->rkb_rk->rk_conf.socket_rcvbuf_size != 0) { - if (setsockopt( - rktrans->rktrans_s, SOL_SOCKET, SO_RCVBUF, - (void *)&rkb->rkb_rk->rk_conf.socket_rcvbuf_size, - sizeof(rkb->rkb_rk->rk_conf.socket_rcvbuf_size)) == - RD_SOCKET_ERROR) - rd_rkb_log(rkb, LOG_WARNING, "RCVBUF", - "Failed to set socket receive " - "buffer size to %i: %s", - rkb->rkb_rk->rk_conf.socket_rcvbuf_size, - rd_socket_strerror(rd_socket_errno)); - } - /* Get send and receive buffer sizes to allow limiting * the total number of bytes passed with iovecs to sendmsg() * and recvmsg(). 
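The transport hunks above split one error path into two debug-distinguishable cases: recv() returning 0 after POLLIN (orderly close by the peer) versus -1 with ECONNRESET (connection reset). A self-contained POSIX sketch of the same classification (not the library's code):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <sys/types.h>

    /* >0: bytes read, 0: no data yet (EAGAIN), -1: disconnected. */
    static int sock_read(int fd, void *buf, size_t len) {
            ssize_t r = recv(fd, buf, len, 0);

            if (r > 0)
                    return (int)r;
            if (r == 0) {
                    /* Orderly shutdown by the peer. */
                    fprintf(stderr, "Disconnected: closed by peer\n");
                    return -1;
            }
            if (errno == EAGAIN || errno == EWOULDBLOCK)
                    return 0;
            if (errno == ECONNRESET)
                    fprintf(stderr, "Disconnected: reset by peer\n");
            return -1;
    }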
*/ @@ -597,19 +583,6 @@ void rd_kafka_transport_post_connect_setup(rd_kafka_transport_t *rktrans) { } else if (rktrans->rktrans_sndbuf_size < 1024 * 64) rktrans->rktrans_sndbuf_size = 1024 * 64; /* Use at least 64KB */ - - -#ifdef TCP_NODELAY - if (rkb->rkb_rk->rk_conf.socket_nagle_disable) { - int one = 1; - if (setsockopt(rktrans->rktrans_s, IPPROTO_TCP, TCP_NODELAY, - (void *)&one, sizeof(one)) == RD_SOCKET_ERROR) - rd_rkb_log(rkb, LOG_WARNING, "NAGLE", - "Failed to disable Nagle (TCP_NODELAY) " - "on socket: %s", - rd_socket_strerror(rd_socket_errno)); - } -#endif } @@ -744,6 +717,9 @@ static void rd_kafka_transport_io_event(rd_kafka_transport_t *rktrans, if (r == 0 /* handshake still in progress */ && (events & POLLHUP)) { + rd_rkb_dbg(rktrans->rktrans_rkb, BROKER, "SOCKET", + "Disconnected: during " + "SSL connection handshake"); rd_kafka_broker_conn_closed( rkb, RD_KAFKA_RESP_ERR__TRANSPORT, "Disconnected"); return; @@ -767,6 +743,9 @@ static void rd_kafka_transport_io_event(rd_kafka_transport_t *rktrans, } if (events & POLLHUP) { + rd_rkb_dbg(rktrans->rktrans_rkb, BROKER, "SOCKET", + "Disconnected: hung up from peer in " + "state AUTH_LEGACY"); rd_kafka_broker_fail(rkb, LOG_ERR, RD_KAFKA_RESP_ERR__AUTHENTICATION, "Disconnected"); @@ -780,7 +759,6 @@ static void rd_kafka_transport_io_event(rd_kafka_transport_t *rktrans, case RD_KAFKA_BROKER_STATE_AUTH_HANDSHAKE: case RD_KAFKA_BROKER_STATE_AUTH_REQ: case RD_KAFKA_BROKER_STATE_UP: - case RD_KAFKA_BROKER_STATE_UPDATE: if (events & POLLIN) { while (rkb->rkb_state >= RD_KAFKA_BROKER_STATE_UP && @@ -793,6 +771,9 @@ static void rd_kafka_transport_io_event(rd_kafka_transport_t *rktrans, } if (events & POLLHUP) { + rd_rkb_dbg(rktrans->rktrans_rkb, BROKER, "SOCKET", + "Disconnected: connection closed by " + "peer"); rd_kafka_broker_conn_closed( rkb, RD_KAFKA_RESP_ERR__TRANSPORT, "Disconnected"); return; @@ -807,6 +788,7 @@ static void rd_kafka_transport_io_event(rd_kafka_transport_t *rktrans, case RD_KAFKA_BROKER_STATE_INIT: case RD_KAFKA_BROKER_STATE_DOWN: case RD_KAFKA_BROKER_STATE_TRY_CONNECT: + case RD_KAFKA_BROKER_STATE_REAUTH: rd_kafka_assert(rkb->rkb_rk, !*"bad state"); } } @@ -1079,6 +1061,45 @@ rd_kafka_transport_t *rd_kafka_transport_new(rd_kafka_broker_t *rkb, } #endif +#ifdef TCP_NODELAY + if (rkb->rkb_rk->rk_conf.socket_nagle_disable) { + int one = 1; + if (setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (void *)&one, + sizeof(one)) == RD_SOCKET_ERROR) + rd_rkb_log(rkb, LOG_WARNING, "NAGLE", + "Failed to disable Nagle (TCP_NODELAY) " + "on socket: %s", + rd_socket_strerror(rd_socket_errno)); + } +#endif + + /* Set socket send & receive buffer sizes if configuerd */ + if (rkb->rkb_rk->rk_conf.socket_sndbuf_size != 0) { + if (setsockopt( + s, SOL_SOCKET, SO_SNDBUF, + (void *)&rkb->rkb_rk->rk_conf.socket_sndbuf_size, + sizeof(rkb->rkb_rk->rk_conf.socket_sndbuf_size)) == + RD_SOCKET_ERROR) + rd_rkb_log(rkb, LOG_WARNING, "SNDBUF", + "Failed to set socket send " + "buffer size to %i: %s", + rkb->rkb_rk->rk_conf.socket_sndbuf_size, + rd_socket_strerror(rd_socket_errno)); + } + + if (rkb->rkb_rk->rk_conf.socket_rcvbuf_size != 0) { + if (setsockopt( + s, SOL_SOCKET, SO_RCVBUF, + (void *)&rkb->rkb_rk->rk_conf.socket_rcvbuf_size, + sizeof(rkb->rkb_rk->rk_conf.socket_rcvbuf_size)) == + RD_SOCKET_ERROR) + rd_rkb_log(rkb, LOG_WARNING, "RCVBUF", + "Failed to set socket receive " + "buffer size to %i: %s", + rkb->rkb_rk->rk_conf.socket_rcvbuf_size, + rd_socket_strerror(rd_socket_errno)); + } + /* Set the socket to non-blocking */ if ((r = 
rd_fd_set_nonblocking(s))) { rd_snprintf(errstr, errstr_size, diff --git a/src/third_party/librdkafka/dist/src/rdkafka_transport.h b/src/third_party/librdkafka/dist/src/rdkafka_transport.h index 83af5ae9016..c5f73163f94 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_transport.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_transport.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2015, Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_transport_int.h b/src/third_party/librdkafka/dist/src/rdkafka_transport_int.h index 4b053b98fa1..9e00f238c30 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_transport_int.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_transport_int.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2015, Magnus Edenhill + * Copyright (c) 2015-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_txnmgr.c b/src/third_party/librdkafka/dist/src/rdkafka_txnmgr.c index 2c69cd7d85f..019a3b80c61 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_txnmgr.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_txnmgr.c @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -1496,8 +1496,12 @@ static void rd_kafka_txn_handle_TxnOffsetCommit(rd_kafka_t *rk, rd_kafka_buf_read_throttle_time(rkbuf); - partitions = - rd_kafka_buf_read_topic_partitions(rkbuf, 0, rd_false, rd_true); + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_ERR, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; + partitions = rd_kafka_buf_read_topic_partitions( + rkbuf, rd_false /*don't use topic_id*/, rd_true, 0, fields); if (!partitions) goto err_parse; @@ -1704,11 +1708,17 @@ rd_kafka_txn_send_TxnOffsetCommitRequest(rd_kafka_broker_t *rkb, } /* Write per-partition offsets list */ + const rd_kafka_topic_partition_field_t fields[] = { + RD_KAFKA_TOPIC_PARTITION_FIELD_PARTITION, + RD_KAFKA_TOPIC_PARTITION_FIELD_OFFSET, + ApiVersion >= 2 ? RD_KAFKA_TOPIC_PARTITION_FIELD_EPOCH + : RD_KAFKA_TOPIC_PARTITION_FIELD_NOOP, + RD_KAFKA_TOPIC_PARTITION_FIELD_METADATA, + RD_KAFKA_TOPIC_PARTITION_FIELD_END}; cnt = rd_kafka_buf_write_topic_partitions( rkbuf, rko->rko_u.txn.offsets, rd_true /*skip invalid offsets*/, - rd_false /*any offset*/, rd_true /*write offsets*/, - ApiVersion >= 2 /*write Epoch (-1) */, rd_true /*write Metadata*/); - + rd_false /*any offset*/, rd_false /*don't use topic id*/, + rd_true /*use topic name*/, fields); if (!cnt) { /* No valid partition offsets, don't commit. 
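The fields[] arrays in the txnmgr hunks above replace the old fixed boolean parameters: each caller of the shared topic-partition reader/writer now declares exactly which per-partition fields its request version carries, terminated by an END sentinel, so a version difference collapses to swapping one entry (EPOCH vs. NOOP). A toy table-driven writer showing the shape of the technique (the enum and struct are stand-ins, not the library's types):

    #include <stdint.h>
    #include <stdio.h>

    typedef enum {
            FIELD_PARTITION,
            FIELD_OFFSET,
            FIELD_EPOCH,
            FIELD_METADATA,
            FIELD_NOOP, /* placeholder: keeps the schema, writes nothing */
            FIELD_END
    } field_t;

    struct partition {
            int32_t id;
            int64_t offset;
            int32_t epoch;
    };

    static void write_partition(const struct partition *p,
                                const field_t *fields) {
            int i;

            for (i = 0; fields[i] != FIELD_END; i++) {
                    switch (fields[i]) {
                    case FIELD_PARTITION:
                            printf("partition=%d ", p->id);
                            break;
                    case FIELD_OFFSET:
                            printf("offset=%lld ", (long long)p->offset);
                            break;
                    case FIELD_EPOCH:
                            printf("epoch=%d ", p->epoch);
                            break;
                    default: /* METADATA/NOOP elided in this sketch */
                            break;
                    }
            }
            printf("\n");
    }

A v2 TxnOffsetCommit would pass {FIELD_PARTITION, FIELD_OFFSET, FIELD_EPOCH, FIELD_METADATA, FIELD_END}, while older versions substitute FIELD_NOOP for FIELD_EPOCH, which is exactly the ApiVersion switch visible in the hunk.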
*/ rd_kafka_buf_destroy(rkbuf); @@ -2019,7 +2029,7 @@ rd_kafka_error_t *rd_kafka_send_offsets_to_transaction( rd_kafka_topic_partition_list_sort_by_topic(valid_offsets); rko = rd_kafka_op_new_cb(rk, RD_KAFKA_OP_TXN, - rd_kafka_txn_op_send_offsets_to_transaction); + rd_kafka_txn_op_send_offsets_to_transaction); rko->rko_u.txn.offsets = valid_offsets; rko->rko_u.txn.cgmetadata = rd_kafka_consumer_group_metadata_dup(cgmetadata); @@ -2948,6 +2958,11 @@ static void rd_kafka_txn_handle_FindCoordinator(rd_kafka_t *rk, NodeId); err = RD_KAFKA_RESP_ERR__UNKNOWN_BROKER; } + if (rkb && rkb->rkb_source != RD_KAFKA_LEARNED) { + rd_kafka_broker_destroy(rkb); + rkb = NULL; + err = RD_KAFKA_RESP_ERR__UNKNOWN_BROKER; + } rd_kafka_rdunlock(rk); if (err) diff --git a/src/third_party/librdkafka/dist/src/rdkafka_txnmgr.h b/src/third_party/librdkafka/dist/src/rdkafka_txnmgr.h index 3c088d09a65..d67b57bce26 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_txnmgr.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_txnmgr.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2019 Magnus Edenhill + * Copyright (c) 2019-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_zstd.c b/src/third_party/librdkafka/dist/src/rdkafka_zstd.c index 68b01a4e1ce..dac2c4dfcc7 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_zstd.c +++ b/src/third_party/librdkafka/dist/src/rdkafka_zstd.c @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdkafka_zstd.h b/src/third_party/librdkafka/dist/src/rdkafka_zstd.h index f87c4c6fbc3..7f5a7490413 100644 --- a/src/third_party/librdkafka/dist/src/rdkafka_zstd.h +++ b/src/third_party/librdkafka/dist/src/rdkafka_zstd.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdlist.c b/src/third_party/librdkafka/dist/src/rdlist.c index c71e3004ad7..65e3eb97e04 100644 --- a/src/third_party/librdkafka/dist/src/rdlist.c +++ b/src/third_party/librdkafka/dist/src/rdlist.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -148,6 +149,7 @@ void *rd_list_add(rd_list_t *rl, void *elem) { return rl->rl_elems[rl->rl_cnt++]; } + void rd_list_set(rd_list_t *rl, int idx, void *ptr) { if (idx >= rl->rl_size) rd_list_grow(rl, idx + 1); @@ -376,6 +378,34 @@ void *rd_list_find_duplicate(const rd_list_t *rl, return NULL; } +void rd_list_deduplicate(rd_list_t **rl, + int (*cmp)(const void *, const void *)) { + rd_list_t *deduped = rd_list_new(0, (*rl)->rl_free_cb); + void *elem; + void *prev_elem = NULL; + int i; + + if (!((*rl)->rl_flags & RD_LIST_F_SORTED)) + rd_list_sort(*rl, cmp); + + RD_LIST_FOREACH(elem, *rl, i) { + if (prev_elem && cmp(elem, prev_elem) == 0) { + /* Skip this element, and destroy it */ + rd_list_free_cb(*rl, elem); + continue; + } + rd_list_add(deduped, elem); + prev_elem = elem; + } + /* The elements we want destroyed are already destroyed. */ + (*rl)->rl_free_cb = NULL; + rd_list_destroy(*rl); + + /* The parent list was sorted, we can set this without re-sorting. */ + deduped->rl_flags |= RD_LIST_F_SORTED; + *rl = deduped; +} + int rd_list_cmp(const rd_list_t *a, const rd_list_t *b, int (*cmp)(const void *, const void *)) { diff --git a/src/third_party/librdkafka/dist/src/rdlist.h b/src/third_party/librdkafka/dist/src/rdlist.h index db5295f6cfd..3a1316c3899 100644 --- a/src/third_party/librdkafka/dist/src/rdlist.h +++ b/src/third_party/librdkafka/dist/src/rdlist.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill, + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -301,6 +302,18 @@ void *rd_list_find_duplicate(const rd_list_t *rl, int (*cmp)(const void *, const void *)); +/** + * @brief Deduplicates a list. + * + * @param rl is a ptrptr since a new list is created and assigned to *rl, for + * efficiency. + * @returns a deduplicated and sorted version of \p *rl. + * @warning the original \p *rl is destroyed. + */ +void rd_list_deduplicate(rd_list_t **rl, + int (*cmp)(const void *, const void *)); + + /** * @brief Compare list \p a to \p b. * diff --git a/src/third_party/librdkafka/dist/src/rdlog.c b/src/third_party/librdkafka/dist/src/rdlog.c index 19fbbb1614b..3ddc82d06ea 100644 --- a/src/third_party/librdkafka/dist/src/rdlog.c +++ b/src/third_party/librdkafka/dist/src/rdlog.c @@ -1,7 +1,7 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012-2013, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdlog.h b/src/third_party/librdkafka/dist/src/rdlog.h index f360a0b66e0..a83701f6a3e 100644 --- a/src/third_party/librdkafka/dist/src/rdlog.h +++ b/src/third_party/librdkafka/dist/src/rdlog.h @@ -1,7 +1,7 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012-2013, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. 
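rd_list_deduplicate() above sorts the list (if not already sorted) and then keeps each element that differs from its predecessor, destroying the duplicates. The same technique over a plain array of heap-allocated strings, with free() standing in for the list's free_cb:

    #include <stdlib.h>
    #include <string.h>

    static int cmpstr(const void *a, const void *b) {
            return strcmp(*(const char *const *)a, *(const char *const *)b);
    }

    /* Sort, then compact in place; returns the new count.
     * The result remains sorted, so later lookups may use bsearch(). */
    static size_t dedup_strings(char **arr, size_t cnt) {
            size_t i, out = 0;

            qsort(arr, cnt, sizeof(*arr), cmpstr);
            for (i = 0; i < cnt; i++) {
                    if (out > 0 && strcmp(arr[i], arr[out - 1]) == 0) {
                            free(arr[i]); /* duplicate: destroy */
                            continue;
                    }
                    arr[out++] = arr[i];
            }
            return out;
    }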
* * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdmap.c b/src/third_party/librdkafka/dist/src/rdmap.c index 4b85470336b..522b786c443 100644 --- a/src/third_party/librdkafka/dist/src/rdmap.c +++ b/src/third_party/librdkafka/dist/src/rdmap.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2020 Magnus Edenhill + * Copyright (c) 2020-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -237,6 +238,21 @@ unsigned int rd_map_str_hash(const void *key) { } +/** + * @returns a djb2 hash of \p bytes. + * + * @param len \p bytes will be hashed up to \p len. + */ +unsigned int rd_bytes_hash(unsigned char *bytes, size_t len) { + unsigned int hash = 5381; + size_t i; + + for (i = 0; i < len; i++) + hash = ((hash << 5) + hash) + bytes[i]; + + return hash; +} + /** * @name Unit tests @@ -388,12 +404,17 @@ static int unittest_typed_map2(void) { static int unittest_untyped_map(void) { rd_map_t rmap; int pass, i, r; - int cnt = 100000; + int cnt = rd_unittest_with_valgrind ? 1000 : 100000; int exp_cnt = 0, get_cnt = 0, iter_cnt = 0; const rd_map_elem_t *elem; rd_ts_t ts = rd_clock(); rd_ts_t ts_get = 0; + if (rd_unittest_with_valgrind) + RD_UT_WARN( + "Reducing count in " + "untyped map test when using Valgrind"); + rd_map_init(&rmap, cnt, rd_map_str_cmp, rd_map_str_hash, rd_free, rd_free); diff --git a/src/third_party/librdkafka/dist/src/rdmap.h b/src/third_party/librdkafka/dist/src/rdmap.h index a79dcda06a8..b8e3feb97bf 100644 --- a/src/third_party/librdkafka/dist/src/rdmap.h +++ b/src/third_party/librdkafka/dist/src/rdmap.h @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2020 Magnus Edenhill + * Copyright (c) 2020-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -249,6 +250,10 @@ int rd_map_str_cmp(const void *a, const void *b); */ unsigned int rd_map_str_hash(const void *a); +/** + * @brief Bytes hash function (djb2). + */ +unsigned int rd_bytes_hash(unsigned char *bytes, size_t len); /** diff --git a/src/third_party/librdkafka/dist/src/rdmurmur2.c b/src/third_party/librdkafka/dist/src/rdmurmur2.c index c3e4095d4c0..c54fa2f51c3 100644 --- a/src/third_party/librdkafka/dist/src/rdmurmur2.c +++ b/src/third_party/librdkafka/dist/src/rdmurmur2.c @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdmurmur2.h b/src/third_party/librdkafka/dist/src/rdmurmur2.h index 5991caa50ca..fc23dfec947 100644 --- a/src/third_party/librdkafka/dist/src/rdmurmur2.h +++ b/src/third_party/librdkafka/dist/src/rdmurmur2.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2018 Magnus Edenhill + * Copyright (c) 2018-2022, Magnus Edenhill * All rights reserved. 
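rd_bytes_hash() above is the classic djb2 scheme: hash = hash * 33 + byte, seeded with 5381. A runnable copy with a typical bucket-selection use (the 64-bucket map is illustrative):

    #include <stddef.h>
    #include <stdio.h>

    static unsigned int bytes_hash(const unsigned char *bytes, size_t len) {
            unsigned int hash = 5381;
            size_t i;

            for (i = 0; i < len; i++)
                    hash = ((hash << 5) + hash) + bytes[i]; /* hash*33 + b */
            return hash;
    }

    int main(void) {
            unsigned char key[] = {0x01, 0x02, 0x03, 0x04};

            printf("bucket=%u\n", bytes_hash(key, sizeof(key)) % 64);
            return 0;
    }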
* * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdports.c b/src/third_party/librdkafka/dist/src/rdports.c index 15c57e9289a..9af8ede531f 100644 --- a/src/third_party/librdkafka/dist/src/rdports.c +++ b/src/third_party/librdkafka/dist/src/rdports.c @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2016 Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdports.h b/src/third_party/librdkafka/dist/src/rdports.h index 0cdbcd85fc9..41314ebfbe3 100644 --- a/src/third_party/librdkafka/dist/src/rdports.h +++ b/src/third_party/librdkafka/dist/src/rdports.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2016 Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdposix.h b/src/third_party/librdkafka/dist/src/rdposix.h index 7b2376823f6..0af5948168e 100644 --- a/src/third_party/librdkafka/dist/src/rdposix.h +++ b/src/third_party/librdkafka/dist/src/rdposix.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015 Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdrand.c b/src/third_party/librdkafka/dist/src/rdrand.c index e36d79380bc..104b289d6f0 100644 --- a/src/third_party/librdkafka/dist/src/rdrand.c +++ b/src/third_party/librdkafka/dist/src/rdrand.c @@ -1,7 +1,7 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -30,6 +30,7 @@ #include "rdrand.h" #include "rdtime.h" #include "tinycthread.h" +#include "rdmurmur2.h" int rd_jitter(int low, int high) { int rand_num; @@ -40,8 +41,17 @@ int rd_jitter(int low, int high) { if (unlikely(seed == 0)) { struct timeval tv; rd_gettimeofday(&tv, NULL); - seed = (unsigned int)(tv.tv_usec / 1000); + seed = (unsigned int)(tv.tv_usec); seed ^= (unsigned int)(intptr_t)thrd_current(); + + /* When many threads are created at the same time and the + * thread id is different only by a few bits it's possible that + * `rand_r`, that is initially multiplying by `1103515245`, + * truncates the variable bits and uses the same seed for + * different threads. By applying `murmur2` we ensure that seed + * variability is distributed across various bits at different + * positions. */ + seed = (unsigned int)rd_murmur2(&seed, sizeof(seed)); } rand_num = rand_r(&seed); diff --git a/src/third_party/librdkafka/dist/src/rdrand.h b/src/third_party/librdkafka/dist/src/rdrand.h index 0e3a927c2cb..f86fb83e791 100644 --- a/src/third_party/librdkafka/dist/src/rdrand.h +++ b/src/third_party/librdkafka/dist/src/rdrand.h @@ -1,7 +1,7 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. 
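The rd_jitter() comment above explains the fix: thread ids that differ in only a few bits can collapse to the same rand_r() seed after the initial multiply, so the seed is first passed through a full-avalanche hash. The same idea demonstrated with MurmurHash3's public fmix32 finalizer standing in for the library-internal rd_murmur2():

    #include <stdint.h>
    #include <stdio.h>

    /* Full-avalanche 32-bit mix: each input bit affects every output bit. */
    static uint32_t fmix32(uint32_t h) {
            h ^= h >> 16;
            h *= 0x85ebca6bu;
            h ^= h >> 13;
            h *= 0xc2b2ae35u;
            h ^= h >> 16;
            return h;
    }

    int main(void) {
            /* Two thread-id-like seeds one bit apart... */
            uint32_t a = 0x7f001000u, b = 0x7f001001u;

            /* ...mix to values that differ across all bit positions: */
            printf("%08x vs %08x\n", fmix32(a), fmix32(b));
            return 0;
    }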
* * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdregex.c b/src/third_party/librdkafka/dist/src/rdregex.c index 0c70cb334be..4a09286b81d 100644 --- a/src/third_party/librdkafka/dist/src/rdregex.c +++ b/src/third_party/librdkafka/dist/src/rdregex.c @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2016 Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdregex.h b/src/third_party/librdkafka/dist/src/rdregex.h index 135229d6268..94edcf661ca 100644 --- a/src/third_party/librdkafka/dist/src/rdregex.h +++ b/src/third_party/librdkafka/dist/src/rdregex.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2016 Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdsignal.h b/src/third_party/librdkafka/dist/src/rdsignal.h index a2c0de1b0cd..6f3462130ab 100644 --- a/src/third_party/librdkafka/dist/src/rdsignal.h +++ b/src/third_party/librdkafka/dist/src/rdsignal.h @@ -1,7 +1,7 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012-2013, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdstring.c b/src/third_party/librdkafka/dist/src/rdstring.c index 6a18210c937..c981f7705a3 100644 --- a/src/third_party/librdkafka/dist/src/rdstring.c +++ b/src/third_party/librdkafka/dist/src/rdstring.c @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2016 Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -289,6 +290,21 @@ int rd_strcmp(const char *a, const char *b) { } +/** + * @brief Same as rd_strcmp() but works with rd_list comparator. + */ +int rd_strcmp2(const void *a, const void *b) { + return rd_strcmp((const char *)a, (const char *)b); +} + +/** + * @brief Same as rd_strcmp() but works with bsearch, which requires one more + * indirection. + */ +int rd_strcmp3(const void *a, const void *b) { + return rd_strcmp(*((const char **)a), *((const char **)b)); +} + /** * @brief Case-insensitive strstr() for platforms where strcasestr() diff --git a/src/third_party/librdkafka/dist/src/rdstring.h b/src/third_party/librdkafka/dist/src/rdstring.h index 67ea19401bd..dc0627a138a 100644 --- a/src/third_party/librdkafka/dist/src/rdstring.h +++ b/src/third_party/librdkafka/dist/src/rdstring.h @@ -1,7 +1,8 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -80,6 +81,10 @@ unsigned int rd_string_hash(const char *str, ssize_t len); int rd_strcmp(const char *a, const char *b); +int rd_strcmp2(const void *a, const void *b); + +int rd_strcmp3(const void *a, const void *b); + char *_rd_strcasestr(const char *haystack, const char *needle); char **rd_string_split(const char *input, diff --git a/src/third_party/librdkafka/dist/src/rdsysqueue.h b/src/third_party/librdkafka/dist/src/rdsysqueue.h index ecba4154eb5..738cdad792f 100644 --- a/src/third_party/librdkafka/dist/src/rdsysqueue.h +++ b/src/third_party/librdkafka/dist/src/rdsysqueue.h @@ -1,8 +1,8 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012-2013, Magnus Edenhill - * Copyright (c) 2012-2013, Andreas Öman + * Copyright (c) 2012-2022, Magnus Edenhill + * Copyright (c) 2012-2022, Andreas Öman * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdtime.h b/src/third_party/librdkafka/dist/src/rdtime.h index 4a3e5d85591..2d8207a5a05 100644 --- a/src/third_party/librdkafka/dist/src/rdtime.h +++ b/src/third_party/librdkafka/dist/src/rdtime.h @@ -1,7 +1,8 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill + * 2025, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -164,9 +165,44 @@ static RD_INLINE int rd_timeout_ms(rd_ts_t timeout_us) { return (int)((timeout_us + 999) / 1000); } +/** + * @brief Initialize an absolute timeout based on the provided \p timeout_ms + * and given clock \p now + * + * To be used with rd_timeout_remains() or rd_timeout_remains_us(). + * + * Honours RD_POLL_INFINITE, RD_POLL_NOWAIT. + * + * @returns the absolute timeout. + */ +static RD_INLINE rd_ts_t rd_timeout_init0(rd_ts_t now, int timeout_ms) { + if (timeout_ms == RD_POLL_INFINITE || timeout_ms == RD_POLL_NOWAIT) + return timeout_ms; + + return now + ((rd_ts_t)timeout_ms * 1000); +} + +/** + * @brief Initialize an absolute timeout based on the provided \p timeout_us + * and given clock \p now + * + * To be used with rd_timeout_remains() or rd_timeout_remains_us(). + * + * Honours RD_POLL_INFINITE, RD_POLL_NOWAIT. + * + * @returns the absolute timeout. + */ +static RD_INLINE rd_ts_t rd_timeout_init_us0(rd_ts_t now, rd_ts_t timeout_us) { + if (timeout_us == RD_POLL_INFINITE || timeout_us == RD_POLL_NOWAIT) + return timeout_us; + + return now + timeout_us; +} + /** * @brief Initialize an absolute timeout based on the provided \p timeout_ms + * and current clock. * * To be used with rd_timeout_adjust(). * @@ -176,12 +212,35 @@ static RD_INLINE int rd_timeout_ms(rd_ts_t timeout_us) { * to rd_timeout_adjust(). */ static RD_INLINE rd_ts_t rd_timeout_init(int timeout_ms) { - if (timeout_ms == RD_POLL_INFINITE || timeout_ms == RD_POLL_NOWAIT) - return timeout_ms; - - return rd_clock() + ((rd_ts_t)timeout_ms * 1000); + return rd_timeout_init0(rd_clock(), timeout_ms); } +/** + * @brief Initialize an absolute timeout based on the provided \p timeout_us + * and current clock. + * + * To be used with rd_timeout_remains() or rd_timeout_remains_us(). + * + * Honours RD_POLL_INFINITE, RD_POLL_NOWAIT. + * + * @returns the absolute timeout. + */ +static RD_INLINE rd_ts_t rd_timeout_init_us(rd_ts_t timeout_us) { + return rd_timeout_init_us0(rd_clock(), timeout_us); +} + +/** + * @brief Gets time since epoch (UTC). 
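rd_timeout_init0()/rd_timeout_init_us0() in the rdtime.h hunk factor the clock read out of the existing helpers: a relative timeout is converted once into an absolute deadline against a caller-supplied now, and the sentinel values pass through untouched. A compact sketch of the pattern, including the deadline-to-remaining conversion it pairs with (simplified; the library's rd_timeout_remains() has more nuance):

    #include <stdint.h>

    typedef int64_t ts_us_t;
    #define POLL_INFINITE (-1) /* assumed sentinel values, as in rd.h */
    #define POLL_NOWAIT (0)

    static ts_us_t timeout_init0(ts_us_t now_us, int timeout_ms) {
            if (timeout_ms == POLL_INFINITE || timeout_ms == POLL_NOWAIT)
                    return timeout_ms;
            return now_us + (ts_us_t)timeout_ms * 1000;
    }

    /* Remaining wait in ms, rounded up; 0 once the deadline has passed. */
    static int timeout_remains_ms(ts_us_t now_us, ts_us_t abs_timeout) {
            if (abs_timeout == POLL_INFINITE || abs_timeout == POLL_NOWAIT)
                    return (int)abs_timeout;
            if (abs_timeout <= now_us)
                    return 0;
            return (int)((abs_timeout - now_us + 999) / 1000);
    }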
+ */ +static RD_INLINE void rd_timespec_get(struct timespec *tspec) { +#if defined(__APPLE__) || (defined(__ANDROID__) && __ANDROID_API__ < 29) + struct timeval tv; + gettimeofday(&tv, NULL); + TIMEVAL_TO_TIMESPEC(&tv, tspec); +#else + timespec_get(tspec, TIME_UTC); +#endif +} /** * @brief Initialize an absolute timespec timeout based on the provided @@ -197,13 +256,7 @@ static RD_INLINE void rd_timeout_init_timespec_us(struct timespec *tspec, tspec->tv_sec = timeout_us; tspec->tv_nsec = 0; } else { -#if defined(__APPLE__) || (defined(__ANDROID__) && __ANDROID_API__ < 29) - struct timeval tv; - gettimeofday(&tv, NULL); - TIMEVAL_TO_TIMESPEC(&tv, tspec); -#else - timespec_get(tspec, TIME_UTC); -#endif + rd_timespec_get(tspec); tspec->tv_sec += timeout_us / 1000000; tspec->tv_nsec += (timeout_us % 1000000) * 1000; if (tspec->tv_nsec >= 1000000000) { @@ -227,13 +280,7 @@ static RD_INLINE void rd_timeout_init_timespec(struct timespec *tspec, tspec->tv_sec = timeout_ms; tspec->tv_nsec = 0; } else { -#if defined(__APPLE__) || (defined(__ANDROID__) && __ANDROID_API__ < 29) - struct timeval tv; - gettimeofday(&tv, NULL); - TIMEVAL_TO_TIMESPEC(&tv, tspec); -#else - timespec_get(tspec, TIME_UTC); -#endif + rd_timespec_get(tspec); tspec->tv_sec += timeout_ms / 1000; tspec->tv_nsec += (timeout_ms % 1000) * 1000000; if (tspec->tv_nsec >= 1000000000) { diff --git a/src/third_party/librdkafka/dist/src/rdtypes.h b/src/third_party/librdkafka/dist/src/rdtypes.h index 8f3625512dc..a22bb906496 100644 --- a/src/third_party/librdkafka/dist/src/rdtypes.h +++ b/src/third_party/librdkafka/dist/src/rdtypes.h @@ -1,7 +1,7 @@ /* * librd - Rapid Development C library * - * Copyright (c) 2012, Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdunittest.c b/src/third_party/librdkafka/dist/src/rdunittest.c index aa14b6aa841..d65749c9059 100644 --- a/src/third_party/librdkafka/dist/src/rdunittest.c +++ b/src/third_party/librdkafka/dist/src/rdunittest.c @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill + * 2023, Confluent Inc. * All rights reserved. 
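rd_timespec_get() consolidates the platform shim previously duplicated inside the two rd_timeout_init_timespec*() helpers: C11 timespec_get() where available, with a gettimeofday() fallback for platforms that lack it. A standalone equivalent (guard simplified here to __APPLE__ only):

    #include <time.h>
    #if defined(__APPLE__)
    #include <sys/time.h>
    #endif

    static void wallclock(struct timespec *ts) {
    #if defined(__APPLE__)
            /* Fallback: microsecond precision widened to nanoseconds. */
            struct timeval tv;
            gettimeofday(&tv, NULL);
            ts->tv_sec = tv.tv_sec;
            ts->tv_nsec = (long)tv.tv_usec * 1000;
    #else
            timespec_get(ts, TIME_UTC);
    #endif
    }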
* * Redistribution and use in source and binary forms, with or without @@ -54,6 +55,7 @@ #include "rdkafka_txnmgr.h" rd_bool_t rd_unittest_assert_on_failure = rd_false; +rd_bool_t rd_unittest_with_valgrind = rd_false; rd_bool_t rd_unittest_on_ci = rd_false; rd_bool_t rd_unittest_slow = rd_false; @@ -415,6 +417,7 @@ static int unittest_rdclock(void) { extern int unittest_string(void); extern int unittest_cgrp(void); +extern int unittest_mock_cluster(void); #if WITH_SASL_SCRAM extern int unittest_scram(void); #endif @@ -425,7 +428,11 @@ extern int unittest_http(void); #endif #if WITH_OAUTHBEARER_OIDC extern int unittest_sasl_oauthbearer_oidc(void); +extern int unittest_sasl_oauthbearer_oidc_jwt_bearer(void); +extern int unittest_sasl_oauthbearer_oidc_assertion(void); #endif +extern int unittest_telemetry(void); +extern int unittest_telemetry_decode(void); int rd_unittest(void) { int fails = 0; @@ -433,41 +440,47 @@ int rd_unittest(void) { const char *name; int (*call)(void); } unittests[] = { - {"sysqueue", unittest_sysqueue}, - {"string", unittest_string}, - {"map", unittest_map}, - {"rdbuf", unittest_rdbuf}, - {"rdvarint", unittest_rdvarint}, - {"crc32c", unittest_rd_crc32c}, - {"msg", unittest_msg}, - {"murmurhash", unittest_murmur2}, - {"fnv1a", unittest_fnv1a}, + {"sysqueue", unittest_sysqueue}, + {"string", unittest_string}, + {"map", unittest_map}, + {"rdbuf", unittest_rdbuf}, + {"rdvarint", unittest_rdvarint}, + {"crc32c", unittest_rd_crc32c}, + {"msg", unittest_msg}, + {"murmurhash", unittest_murmur2}, + {"fnv1a", unittest_fnv1a}, + {"mock", unittest_mock_cluster}, #if WITH_HDRHISTOGRAM - {"rdhdrhistogram", unittest_rdhdrhistogram}, + {"rdhdrhistogram", unittest_rdhdrhistogram}, #endif #ifdef _WIN32 - {"rdclock", unittest_rdclock}, + {"rdclock", unittest_rdclock}, #endif - {"conf", unittest_conf}, - {"broker", unittest_broker}, - {"request", unittest_request}, + {"conf", unittest_conf}, + {"broker", unittest_broker}, + {"request", unittest_request}, #if WITH_SASL_OAUTHBEARER - {"sasl_oauthbearer", unittest_sasl_oauthbearer}, + {"sasl_oauthbearer", unittest_sasl_oauthbearer}, #endif - {"aborted_txns", unittest_aborted_txns}, - {"cgrp", unittest_cgrp}, + {"aborted_txns", unittest_aborted_txns}, + {"cgrp", unittest_cgrp}, #if WITH_SASL_SCRAM - {"scram", unittest_scram}, + {"scram", unittest_scram}, #endif - {"assignors", unittest_assignors}, + {"assignors", unittest_assignors}, #if WITH_CURL - {"http", unittest_http}, + {"http", unittest_http}, #endif #if WITH_OAUTHBEARER_OIDC - {"sasl_oauthbearer_oidc", unittest_sasl_oauthbearer_oidc}, + {"sasl_oauthbearer_oidc", unittest_sasl_oauthbearer_oidc}, + {"sasl_oauthbearer_oidc_jwt_bearer", + unittest_sasl_oauthbearer_oidc_jwt_bearer}, + {"sasl_oauthbearer_oidc_assertion", + unittest_sasl_oauthbearer_oidc_assertion}, #endif - {NULL} - }; + {"telemetry", unittest_telemetry}, + {"telemetry_decode", unittest_telemetry_decode}, + {NULL}}; int i; const char *match = rd_getenv("RD_UT_TEST", NULL); int cnt = 0; @@ -479,7 +492,13 @@ int rd_unittest(void) { rd_unittest_on_ci = rd_true; } - if (rd_unittest_on_ci || (ENABLE_DEVEL + 0)) { + if (rd_strcmp(rd_getenv("TEST_MODE", NULL), "valgrind") == 0) { + RD_UT_SAY("Unittests running with valgrind"); + rd_unittest_with_valgrind = rd_true; + } + + if (rd_unittest_on_ci || rd_unittest_with_valgrind || + (ENABLE_DEVEL + 0)) { RD_UT_SAY("Unittests will not error out on slow CPUs"); rd_unittest_slow = rd_true; } diff --git a/src/third_party/librdkafka/dist/src/rdunittest.h 
b/src/third_party/librdkafka/dist/src/rdunittest.h index a1548856806..c063d151f6c 100644 --- a/src/third_party/librdkafka/dist/src/rdunittest.h +++ b/src/third_party/librdkafka/dist/src/rdunittest.h @@ -1,7 +1,8 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2017 Magnus Edenhill + * Copyright (c) 2017-2022, Magnus Edenhill + * 2025, Confluent Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -34,6 +35,7 @@ extern rd_bool_t rd_unittest_assert_on_failure; extern rd_bool_t rd_unittest_on_ci; +extern rd_bool_t rd_unittest_with_valgrind; extern rd_bool_t rd_unittest_slow; #define ENABLE_CODECOV ENABLE_DEVEL diff --git a/src/third_party/librdkafka/dist/src/rdvarint.c b/src/third_party/librdkafka/dist/src/rdvarint.c index fb0cbd04660..cb8b8a9837c 100644 --- a/src/third_party/librdkafka/dist/src/rdvarint.c +++ b/src/third_party/librdkafka/dist/src/rdvarint.c @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2016 Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdvarint.h b/src/third_party/librdkafka/dist/src/rdvarint.h index 6fe112ba95d..c628822fc82 100644 --- a/src/third_party/librdkafka/dist/src/rdvarint.h +++ b/src/third_party/librdkafka/dist/src/rdvarint.h @@ -1,7 +1,7 @@ /* * librdkafka - The Apache Kafka C/C++ library * - * Copyright (c) 2016 Magnus Edenhill + * Copyright (c) 2016-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/third_party/librdkafka/dist/src/rdwin32.h b/src/third_party/librdkafka/dist/src/rdwin32.h index 73edd41d6a7..40ea43a7ac0 100644 --- a/src/third_party/librdkafka/dist/src/rdwin32.h +++ b/src/third_party/librdkafka/dist/src/rdwin32.h @@ -1,7 +1,7 @@ /* * librdkafka - Apache Kafka C library * - * Copyright (c) 2012-2015 Magnus Edenhill + * Copyright (c) 2012-2022, Magnus Edenhill * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -42,7 +42,7 @@ #include /* for sockets + struct timeval */ #include #include - +#include /** * Types diff --git a/src/third_party/librdkafka/dist/src/rdxxhash.c b/src/third_party/librdkafka/dist/src/rdxxhash.c index 29cb5f84482..5ee252d619d 100644 --- a/src/third_party/librdkafka/dist/src/rdxxhash.c +++ b/src/third_party/librdkafka/dist/src/rdxxhash.c @@ -1,92 +1,86 @@ /* - * xxHash - Fast Hash algorithm - * Copyright (C) 2012-2016, Yann Collet - * - * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You can contact the author at : - * - xxHash homepage: http://www.xxhash.com - * - xxHash source repository : https://github.com/Cyan4973/xxHash - */ +* xxHash - Fast Hash algorithm +* Copyright (C) 2012-2016, Yann Collet +* +* BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: +* +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* * Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following disclaimer +* in the documentation and/or other materials provided with the +* distribution. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +* You can contact the author at : +* - xxHash homepage: http://www.xxhash.com +* - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ /* ************************************* - * Tuning parameters - ***************************************/ +* Tuning parameters +***************************************/ /*!KXXH_FORCE_MEMORY_ACCESS : - * By default, access to unaligned memory is controlled by `memcpy()`, which is - * safe and portable. Unfortunately, on some target/compiler combinations, the - * generated assembly is sub-optimal. The below switch allow to select different - * access method for improved performance. Method 0 (default) : use `memcpy()`. - * Safe and portable. Method 1 : `__packed` statement. It depends on compiler - * extension (ie, not portable). This method is safe if your compiler supports - * it, and *generally* as fast or faster than `memcpy`. Method 2 : direct - * access. This method doesn't depend on compiler but violate C standard. It can - * generate buggy code on targets which do not support unaligned memory - * accesses. But in some circumstances, it's the only known way to get the most - * performance (ie GCC + ARMv6) See http://stackoverflow.com/a/32095106/646947 - * for details. Prefer these methods in priority order (0 > 1 > 2) + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. 
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method doesn't depend on compiler but violate C standard. + * It can generate buggy code on targets which do not support unaligned memory accesses. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See http://stackoverflow.com/a/32095106/646947 for details. + * Prefer these methods in priority order (0 > 1 > 2) */ -#ifndef KXXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line \ - for example */ -#if defined(__GNUC__) && \ - (defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ - defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \ - defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__)) -#define KXXH_FORCE_MEMORY_ACCESS 2 -#elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ - (defined(__GNUC__) && \ - (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ - defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \ - defined(__ARM_ARCH_7S__))) -#define KXXH_FORCE_MEMORY_ACCESS 1 -#endif +#ifndef KXXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \ + || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define KXXH_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7S__) )) +# define KXXH_FORCE_MEMORY_ACCESS 1 +# endif #endif /*!KXXH_ACCEPT_NULL_INPUT_POINTER : - * If input pointer is NULL, xxHash default behavior is to dereference it, - * triggering a segfault. When this macro is enabled, xxHash actively checks - * input for null pointer. It it is, result for null input pointers is the same - * as a null-length input. + * If input pointer is NULL, xxHash default behavior is to dereference it, triggering a segfault. + * When this macro is enabled, xxHash actively checks input for null pointer. + * It it is, result for null input pointers is the same as a null-length input. */ -#ifndef KXXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ -#define KXXH_ACCEPT_NULL_INPUT_POINTER 0 +#ifndef KXXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ +# define KXXH_ACCEPT_NULL_INPUT_POINTER 0 #endif /*!KXXH_FORCE_NATIVE_FORMAT : - * By default, xxHash library provides endian-independent Hash values, based on - * little-endian convention. Results are therefore identical for little-endian - * and big-endian CPU. This comes at a performance cost for big-endian CPU, - * since some swapping is required to emulate little-endian format. Should - * endian-independence be of no importance for your application, you may set the - * #define below to 1, to improve speed for Big-endian CPU. This option has no - * impact on Little_Endian CPU. + * By default, xxHash library provides endian-independent Hash values, based on little-endian convention. 
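+ * (On big-endian CPUs this is achieved by byte-swapping each word as it is
+ * read, via KXXH_swap32()/KXXH_swap64() below, before it enters the hash rounds.)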
+ * Results are therefore identical for little-endian and big-endian CPU.
+ * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
+ * Should endian-independence be of no importance for your application, you may set the #define below to 1,
+ * to improve speed for Big-endian CPU.
+ * This option has no impact on Little_Endian CPU.
 */
-#ifndef KXXH_FORCE_NATIVE_FORMAT /* can be defined externally */
-#define KXXH_FORCE_NATIVE_FORMAT 0
+#ifndef KXXH_FORCE_NATIVE_FORMAT   /* can be defined externally */
+# define KXXH_FORCE_NATIVE_FORMAT 0
 #endif

 /*!KXXH_FORCE_ALIGN_CHECK :
@@ -97,353 +91,306 @@
 * or when alignment doesn't matter for performance.
 */
 #ifndef KXXH_FORCE_ALIGN_CHECK /* can be defined externally */
-#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || \
-    defined(_M_X64)
-#define KXXH_FORCE_ALIGN_CHECK 0
-#else
-#define KXXH_FORCE_ALIGN_CHECK 1
-#endif
+# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#   define KXXH_FORCE_ALIGN_CHECK 0
+# else
+#   define KXXH_FORCE_ALIGN_CHECK 1
+# endif
 #endif


 /* *************************************
- * Includes & Memory related functions
- ***************************************/
-/*! Modify the local functions below should you wish to use some other memory
- * routines for malloc(), free() */
-#include "rd.h"
-static void *KXXH_malloc(size_t s) {
-        return rd_malloc(s);
-}
-static void KXXH_free(void *p) {
-        rd_free(p);
-}
+* Includes & Memory related functions
+***************************************/
+/*! Modify the local functions below should you wish to use some other memory routines
+*   for malloc(), free() */
+#include <stdlib.h>
+static void* KXXH_malloc(size_t s) { return malloc(s); }
+static void  KXXH_free  (void* p)  { free(p); }

/*! 
and for memcpy() */ #include -static void *KXXH_memcpy(void *dest, const void *src, size_t size) { - return memcpy(dest, src, size); -} +static void* KXXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); } -#include /* assert */ +#include /* assert */ #define KXXH_STATIC_LINKING_ONLY #include "rdxxhash.h" /* ************************************* - * Compiler Specific Options - ***************************************/ -#ifdef _MSC_VER /* Visual Studio */ -#pragma warning( \ - disable : 4127) /* disable: C4127: conditional expression is constant */ -#define FORCE_INLINE static __forceinline +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# define FORCE_INLINE static __forceinline #else -#if defined(__cplusplus) || \ - defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ -#ifdef __GNUC__ -#define FORCE_INLINE static inline __attribute__((always_inline)) -#else -#define FORCE_INLINE static inline -#endif -#else -#define FORCE_INLINE static -#endif /* __STDC_VERSION__ */ +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ #endif /* ************************************* - * Basic Types - ***************************************/ +* Basic Types +***************************************/ #ifndef MEM_MODULE -#if !defined(__VMS) && \ - (defined(__cplusplus) || \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) -#include -typedef uint8_t BYTE; -typedef uint16_t U16; -typedef uint32_t U32; -#else -typedef unsigned char BYTE; -typedef unsigned short U16; -typedef unsigned int U32; -#endif +# if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; +# else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; +# endif #endif -#if (defined(KXXH_FORCE_MEMORY_ACCESS) && (KXXH_FORCE_MEMORY_ACCESS == 2)) +#if (defined(KXXH_FORCE_MEMORY_ACCESS) && (KXXH_FORCE_MEMORY_ACCESS==2)) -/* Force direct memory access. Only works on CPU which support unaligned memory - * access in hardware */ -static U32 KXXH_read32(const void *memPtr) { - return *(const U32 *)memPtr; -} +/* Force direct memory access. 
Only works on CPU which support unaligned memory access in hardware */ +static U32 KXXH_read32(const void* memPtr) { return *(const U32*) memPtr; } -#elif (defined(KXXH_FORCE_MEMORY_ACCESS) && (KXXH_FORCE_MEMORY_ACCESS == 1)) +#elif (defined(KXXH_FORCE_MEMORY_ACCESS) && (KXXH_FORCE_MEMORY_ACCESS==1)) -/* __pack instructions are safer, but compiler specific, hence potentially - * problematic for some compilers */ +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ /* currently only defined for gcc and icc */ -typedef union { - U32 u32; -} __attribute__((packed)) unalign; -static U32 KXXH_read32(const void *ptr) { - return ((const unalign *)ptr)->u32; -} +typedef union { U32 u32; } __attribute__((packed)) unalign; +static U32 KXXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } #else /* portable and safe solution. Generally efficient. * see : http://stackoverflow.com/a/32095106/646947 */ -static U32 KXXH_read32(const void *memPtr) { - U32 val; - memcpy(&val, memPtr, sizeof(val)); - return val; +static U32 KXXH_read32(const void* memPtr) +{ + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; } -#endif /* KXXH_FORCE_DIRECT_MEMORY_ACCESS */ +#endif /* KXXH_FORCE_DIRECT_MEMORY_ACCESS */ /* **************************************** - * Compiler-specific Functions and Macros - ******************************************/ +* Compiler-specific Functions and Macros +******************************************/ #define KXXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) -/* Note : although _rotl exists for minGW (GCC under windows), performance seems - * poor */ +/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */ #if defined(_MSC_VER) -#define KXXH_rotl32(x, r) _rotl(x, r) -#define KXXH_rotl64(x, r) _rotl64(x, r) +# define KXXH_rotl32(x,r) _rotl(x,r) +# define KXXH_rotl64(x,r) _rotl64(x,r) #else -#define KXXH_rotl32(x, r) ((x << r) | (x >> (32 - r))) -#define KXXH_rotl64(x, r) ((x << r) | (x >> (64 - r))) +# define KXXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +# define KXXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) #endif -#if defined(_MSC_VER) /* Visual Studio */ -#define KXXH_swap32 _byteswap_ulong +#if defined(_MSC_VER) /* Visual Studio */ +# define KXXH_swap32 _byteswap_ulong #elif KXXH_GCC_VERSION >= 403 -#define KXXH_swap32 __builtin_bswap32 +# define KXXH_swap32 __builtin_bswap32 #else -static U32 KXXH_swap32(U32 x) { - return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) | - ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff); +static U32 KXXH_swap32 (U32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); } #endif /* ************************************* - * Architecture Macros - ***************************************/ -typedef enum { KXXH_bigEndian = 0, KXXH_littleEndian = 1 } KXXH_endianess; +* Architecture Macros +***************************************/ +typedef enum { KXXH_bigEndian=0, KXXH_littleEndian=1 } KXXH_endianess; -/* KXXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler - * command line */ +/* KXXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ #ifndef KXXH_CPU_LITTLE_ENDIAN -static int KXXH_isLittleEndian(void) { - const union { - U32 u; - BYTE c[4]; - } one = {1}; /* don't use static : performance detrimental */ - return one.c[0]; +static int KXXH_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 
1 }; /* don't use static : performance detrimental */ + return one.c[0]; } -#define KXXH_CPU_LITTLE_ENDIAN KXXH_isLittleEndian() +# define KXXH_CPU_LITTLE_ENDIAN KXXH_isLittleEndian() #endif /* *************************** - * Memory reads - *****************************/ +* Memory reads +*****************************/ typedef enum { KXXH_aligned, KXXH_unaligned } KXXH_alignment; -FORCE_INLINE U32 KXXH_readLE32_align(const void *ptr, - KXXH_endianess endian, - KXXH_alignment align) { - if (align == KXXH_unaligned) - return endian == KXXH_littleEndian ? KXXH_read32(ptr) - : KXXH_swap32(KXXH_read32(ptr)); - else - return endian == KXXH_littleEndian - ? *(const U32 *)ptr - : KXXH_swap32(*(const U32 *)ptr); +FORCE_INLINE U32 KXXH_readLE32_align(const void* ptr, KXXH_endianess endian, KXXH_alignment align) +{ + if (align==KXXH_unaligned) + return endian==KXXH_littleEndian ? KXXH_read32(ptr) : KXXH_swap32(KXXH_read32(ptr)); + else + return endian==KXXH_littleEndian ? *(const U32*)ptr : KXXH_swap32(*(const U32*)ptr); } -FORCE_INLINE U32 KXXH_readLE32(const void *ptr, KXXH_endianess endian) { - return KXXH_readLE32_align(ptr, endian, KXXH_unaligned); +FORCE_INLINE U32 KXXH_readLE32(const void* ptr, KXXH_endianess endian) +{ + return KXXH_readLE32_align(ptr, endian, KXXH_unaligned); } -static U32 KXXH_readBE32(const void *ptr) { - return KXXH_CPU_LITTLE_ENDIAN ? KXXH_swap32(KXXH_read32(ptr)) - : KXXH_read32(ptr); +static U32 KXXH_readBE32(const void* ptr) +{ + return KXXH_CPU_LITTLE_ENDIAN ? KXXH_swap32(KXXH_read32(ptr)) : KXXH_read32(ptr); } /* ************************************* - * Macros - ***************************************/ -#define KXXH_STATIC_ASSERT(c) \ - { \ - enum { KXXH_sa = 1 / (int)(!!(c)) }; \ - } /* use after variable declarations */ -KXXH_PUBLIC_API unsigned KXXH_versionNumber(void) { - return KXXH_VERSION_NUMBER; -} +* Macros +***************************************/ +#define KXXH_STATIC_ASSERT(c) { enum { KXXH_sa = 1/(int)(!!(c)) }; } /* use after variable declarations */ +KXXH_PUBLIC_API unsigned KXXH_versionNumber (void) { return KXXH_VERSION_NUMBER; } /* ******************************************************************* - * 32-bit hash functions - *********************************************************************/ +* 32-bit hash functions +*********************************************************************/ static const U32 PRIME32_1 = 2654435761U; static const U32 PRIME32_2 = 2246822519U; static const U32 PRIME32_3 = 3266489917U; -static const U32 PRIME32_4 = 668265263U; -static const U32 PRIME32_5 = 374761393U; +static const U32 PRIME32_4 = 668265263U; +static const U32 PRIME32_5 = 374761393U; -static U32 KXXH32_round(U32 seed, U32 input) { - seed += input * PRIME32_2; - seed = KXXH_rotl32(seed, 13); - seed *= PRIME32_1; - return seed; +static U32 KXXH32_round(U32 seed, U32 input) +{ + seed += input * PRIME32_2; + seed = KXXH_rotl32(seed, 13); + seed *= PRIME32_1; + return seed; } /* mix all bits */ -static U32 KXXH32_avalanche(U32 h32) { - h32 ^= h32 >> 15; - h32 *= PRIME32_2; - h32 ^= h32 >> 13; - h32 *= PRIME32_3; - h32 ^= h32 >> 16; - return (h32); +static U32 KXXH32_avalanche(U32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); } #define KXXH_get32bits(p) KXXH_readLE32_align(p, endian, align) -static U32 KXXH32_finalize(U32 h32, - const void *ptr, - size_t len, - KXXH_endianess endian, - KXXH_alignment align) +static U32 +KXXH32_finalize(U32 h32, const void* ptr, size_t len, + 
KXXH_endianess endian, KXXH_alignment align) { - const BYTE *p = (const BYTE *)ptr; + const BYTE* p = (const BYTE*)ptr; -#define PROCESS1 \ - h32 += (*p++) * PRIME32_5; \ - h32 = KXXH_rotl32(h32, 11) * PRIME32_1; +#define PROCESS1 \ + h32 += (*p++) * PRIME32_5; \ + h32 = KXXH_rotl32(h32, 11) * PRIME32_1 ; -#define PROCESS4 \ - h32 += KXXH_get32bits(p) * PRIME32_3; \ - p += 4; \ - h32 = KXXH_rotl32(h32, 17) * PRIME32_4; +#define PROCESS4 \ + h32 += KXXH_get32bits(p) * PRIME32_3; \ + p+=4; \ + h32 = KXXH_rotl32(h32, 17) * PRIME32_4 ; - switch (len & 15) /* or switch(bEnd - p) */ - { - case 12: - PROCESS4; - /* fallthrough */ - case 8: - PROCESS4; - /* fallthrough */ - case 4: - PROCESS4; - return KXXH32_avalanche(h32); + switch(len&15) /* or switch(bEnd - p) */ + { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return KXXH32_avalanche(h32); - case 13: - PROCESS4; - /* fallthrough */ - case 9: - PROCESS4; - /* fallthrough */ - case 5: - PROCESS4; - PROCESS1; - return KXXH32_avalanche(h32); + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return KXXH32_avalanche(h32); - case 14: - PROCESS4; - /* fallthrough */ - case 10: - PROCESS4; - /* fallthrough */ - case 6: - PROCESS4; - PROCESS1; - PROCESS1; - return KXXH32_avalanche(h32); + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return KXXH32_avalanche(h32); - case 15: - PROCESS4; - /* fallthrough */ - case 11: - PROCESS4; - /* fallthrough */ - case 7: - PROCESS4; - /* fallthrough */ - case 3: - PROCESS1; - /* fallthrough */ - case 2: - PROCESS1; - /* fallthrough */ - case 1: - PROCESS1; - /* fallthrough */ - case 0: - return KXXH32_avalanche(h32); - } - assert(0); - return h32; /* reaching this point is deemed impossible */ + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return KXXH32_avalanche(h32); + } + assert(0); + return h32; /* reaching this point is deemed impossible */ } -FORCE_INLINE U32 KXXH32_endian_align(const void *input, - size_t len, - U32 seed, - KXXH_endianess endian, - KXXH_alignment align) { - const BYTE *p = (const BYTE *)input; - const BYTE *bEnd = p + len; - U32 h32; +FORCE_INLINE U32 +KXXH32_endian_align(const void* input, size_t len, U32 seed, + KXXH_endianess endian, KXXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U32 h32; -#if defined(KXXH_ACCEPT_NULL_INPUT_POINTER) && \ - (KXXH_ACCEPT_NULL_INPUT_POINTER >= 1) - if (p == NULL) { - len = 0; - bEnd = p = (const BYTE *)(size_t)16; - } +#if defined(KXXH_ACCEPT_NULL_INPUT_POINTER) && (KXXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (p==NULL) { + len=0; + bEnd=p=(const BYTE*)(size_t)16; + } #endif - if (len >= 16) { - const BYTE *const limit = bEnd - 15; - U32 v1 = seed + PRIME32_1 + PRIME32_2; - U32 v2 = seed + PRIME32_2; - U32 v3 = seed + 0; - U32 v4 = seed - PRIME32_1; + if (len>=16) { + const BYTE* const limit = bEnd - 15; + U32 v1 = seed + PRIME32_1 + PRIME32_2; + U32 v2 = seed + PRIME32_2; + U32 v3 = seed + 0; + U32 v4 = seed - PRIME32_1; - do { - v1 = KXXH32_round(v1, KXXH_get32bits(p)); - p += 4; - v2 = KXXH32_round(v2, KXXH_get32bits(p)); - p += 4; - v3 = KXXH32_round(v3, KXXH_get32bits(p)); - p += 4; - v4 = KXXH32_round(v4, 
KXXH_get32bits(p)); - p += 4; - } while (p < limit); + do { + v1 = KXXH32_round(v1, KXXH_get32bits(p)); p+=4; + v2 = KXXH32_round(v2, KXXH_get32bits(p)); p+=4; + v3 = KXXH32_round(v3, KXXH_get32bits(p)); p+=4; + v4 = KXXH32_round(v4, KXXH_get32bits(p)); p+=4; + } while (p < limit); - h32 = KXXH_rotl32(v1, 1) + KXXH_rotl32(v2, 7) + - KXXH_rotl32(v3, 12) + KXXH_rotl32(v4, 18); - } else { - h32 = seed + PRIME32_5; - } + h32 = KXXH_rotl32(v1, 1) + KXXH_rotl32(v2, 7) + + KXXH_rotl32(v3, 12) + KXXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } - h32 += (U32)len; + h32 += (U32)len; - return KXXH32_finalize(h32, p, len & 15, endian, align); + return KXXH32_finalize(h32, p, len&15, endian, align); } -KXXH_PUBLIC_API unsigned int -KXXH32(const void *input, size_t len, unsigned int seed) { +KXXH_PUBLIC_API unsigned int KXXH32 (const void* input, size_t len, unsigned int seed) +{ #if 0 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ KXXH32_state_t state; @@ -451,30 +398,20 @@ KXXH32(const void *input, size_t len, unsigned int seed) { KXXH32_update(&state, input, len); return KXXH32_digest(&state); #else - KXXH_endianess endian_detected = (KXXH_endianess)KXXH_CPU_LITTLE_ENDIAN; + KXXH_endianess endian_detected = (KXXH_endianess)KXXH_CPU_LITTLE_ENDIAN; - if (KXXH_FORCE_ALIGN_CHECK) { - if ((((size_t)input) & 3) == - 0) { /* Input is 4-bytes aligned, leverage the speed benefit - */ - if ((endian_detected == KXXH_littleEndian) || - KXXH_FORCE_NATIVE_FORMAT) - return KXXH32_endian_align(input, len, seed, - KXXH_littleEndian, - KXXH_aligned); - else - return KXXH32_endian_align(input, len, seed, - KXXH_bigEndian, - KXXH_aligned); - } - } + if (KXXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + if ((endian_detected==KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) + return KXXH32_endian_align(input, len, seed, KXXH_littleEndian, KXXH_aligned); + else + return KXXH32_endian_align(input, len, seed, KXXH_bigEndian, KXXH_aligned); + } } - if ((endian_detected == KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) - return KXXH32_endian_align(input, len, seed, KXXH_littleEndian, - KXXH_unaligned); - else - return KXXH32_endian_align(input, len, seed, KXXH_bigEndian, - KXXH_unaligned); + if ((endian_detected==KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) + return KXXH32_endian_align(input, len, seed, KXXH_littleEndian, KXXH_unaligned); + else + return KXXH32_endian_align(input, len, seed, KXXH_bigEndian, KXXH_unaligned); #endif } @@ -482,223 +419,195 @@ KXXH32(const void *input, size_t len, unsigned int seed) { /*====== Hash streaming ======*/ -KXXH_PUBLIC_API KXXH32_state_t *KXXH32_createState(void) { - return (KXXH32_state_t *)KXXH_malloc(sizeof(KXXH32_state_t)); +KXXH_PUBLIC_API KXXH32_state_t* KXXH32_createState(void) +{ + return (KXXH32_state_t*)KXXH_malloc(sizeof(KXXH32_state_t)); } -KXXH_PUBLIC_API KXXH_errorcode KXXH32_freeState(KXXH32_state_t *statePtr) { - KXXH_free(statePtr); +KXXH_PUBLIC_API KXXH_errorcode KXXH32_freeState(KXXH32_state_t* statePtr) +{ + KXXH_free(statePtr); + return KXXH_OK; +} + +KXXH_PUBLIC_API void KXXH32_copyState(KXXH32_state_t* dstState, const KXXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +KXXH_PUBLIC_API KXXH_errorcode KXXH32_reset(KXXH32_state_t* statePtr, unsigned int seed) +{ + KXXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + 
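/* These four lane seeds must match the one-shot path in
+     * KXXH32_endian_align() for inputs of 16 bytes or more, so that
+     * streaming reset/update/digest and one-shot KXXH32() agree. */
+ 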
state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return KXXH_OK; +} + + +FORCE_INLINE KXXH_errorcode +KXXH32_update_endian(KXXH32_state_t* state, const void* input, size_t len, KXXH_endianess endian) +{ + if (input==NULL) +#if defined(KXXH_ACCEPT_NULL_INPUT_POINTER) && (KXXH_ACCEPT_NULL_INPUT_POINTER>=1) return KXXH_OK; -} - -KXXH_PUBLIC_API void KXXH32_copyState(KXXH32_state_t *dstState, - const KXXH32_state_t *srcState) { - memcpy(dstState, srcState, sizeof(*dstState)); -} - -KXXH_PUBLIC_API KXXH_errorcode KXXH32_reset(KXXH32_state_t *statePtr, - unsigned int seed) { - KXXH32_state_t state; /* using a local state to memcpy() in order to - avoid strict-aliasing warnings */ - memset(&state, 0, sizeof(state)); - state.v1 = seed + PRIME32_1 + PRIME32_2; - state.v2 = seed + PRIME32_2; - state.v3 = seed + 0; - state.v4 = seed - PRIME32_1; - /* do not write into reserved, planned to be removed in a future version - */ - memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); - return KXXH_OK; -} - - -FORCE_INLINE KXXH_errorcode KXXH32_update_endian(KXXH32_state_t *state, - const void *input, - size_t len, - KXXH_endianess endian) { - if (input == NULL) -#if defined(KXXH_ACCEPT_NULL_INPUT_POINTER) && \ - (KXXH_ACCEPT_NULL_INPUT_POINTER >= 1) - return KXXH_OK; #else - return KXXH_ERROR; + return KXXH_ERROR; #endif - { - const BYTE *p = (const BYTE *)input; - const BYTE *const bEnd = p + len; + { const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; - state->total_len_32 += (unsigned)len; - state->large_len |= (len >= 16) | (state->total_len_32 >= 16); + state->total_len_32 += (unsigned)len; + state->large_len |= (len>=16) | (state->total_len_32>=16); - if (state->memsize + len < 16) { /* fill in tmp buffer */ - KXXH_memcpy((BYTE *)(state->mem32) + state->memsize, - input, len); - state->memsize += (unsigned)len; - return KXXH_OK; - } - - if (state->memsize) { /* some data left from previous update */ - KXXH_memcpy((BYTE *)(state->mem32) + state->memsize, - input, 16 - state->memsize); - { - const U32 *p32 = state->mem32; - state->v1 = KXXH32_round( - state->v1, KXXH_readLE32(p32, endian)); - p32++; - state->v2 = KXXH32_round( - state->v2, KXXH_readLE32(p32, endian)); - p32++; - state->v3 = KXXH32_round( - state->v3, KXXH_readLE32(p32, endian)); - p32++; - state->v4 = KXXH32_round( - state->v4, KXXH_readLE32(p32, endian)); - } - p += 16 - state->memsize; - state->memsize = 0; - } - - if (p <= bEnd - 16) { - const BYTE *const limit = bEnd - 16; - U32 v1 = state->v1; - U32 v2 = state->v2; - U32 v3 = state->v3; - U32 v4 = state->v4; - - do { - v1 = KXXH32_round(v1, KXXH_readLE32(p, endian)); - p += 4; - v2 = KXXH32_round(v2, KXXH_readLE32(p, endian)); - p += 4; - v3 = KXXH32_round(v3, KXXH_readLE32(p, endian)); - p += 4; - v4 = KXXH32_round(v4, KXXH_readLE32(p, endian)); - p += 4; - } while (p <= limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } - - if (p < bEnd) { - KXXH_memcpy(state->mem32, p, (size_t)(bEnd - p)); - state->memsize = (unsigned)(bEnd - p); - } + if (state->memsize + len < 16) { /* fill in tmp buffer */ + KXXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); + state->memsize += (unsigned)len; + return KXXH_OK; } - return KXXH_OK; -} - - -KXXH_PUBLIC_API KXXH_errorcode 
KXXH32_update(KXXH32_state_t *state_in, - const void *input, - size_t len) { - KXXH_endianess endian_detected = (KXXH_endianess)KXXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected == KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) - return KXXH32_update_endian(state_in, input, len, - KXXH_littleEndian); - else - return KXXH32_update_endian(state_in, input, len, KXXH_bigEndian); -} - - -FORCE_INLINE U32 KXXH32_digest_endian(const KXXH32_state_t *state, - KXXH_endianess endian) { - U32 h32; - - if (state->large_len) { - h32 = KXXH_rotl32(state->v1, 1) + KXXH_rotl32(state->v2, 7) + - KXXH_rotl32(state->v3, 12) + KXXH_rotl32(state->v4, 18); - } else { - h32 = state->v3 /* == seed */ + PRIME32_5; + if (state->memsize) { /* some data left from previous update */ + KXXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const U32* p32 = state->mem32; + state->v1 = KXXH32_round(state->v1, KXXH_readLE32(p32, endian)); p32++; + state->v2 = KXXH32_round(state->v2, KXXH_readLE32(p32, endian)); p32++; + state->v3 = KXXH32_round(state->v3, KXXH_readLE32(p32, endian)); p32++; + state->v4 = KXXH32_round(state->v4, KXXH_readLE32(p32, endian)); + } + p += 16-state->memsize; + state->memsize = 0; } - h32 += state->total_len_32; + if (p <= bEnd-16) { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; - return KXXH32_finalize(h32, state->mem32, state->memsize, endian, - KXXH_aligned); + do { + v1 = KXXH32_round(v1, KXXH_readLE32(p, endian)); p+=4; + v2 = KXXH32_round(v2, KXXH_readLE32(p, endian)); p+=4; + v3 = KXXH32_round(v3, KXXH_readLE32(p, endian)); p+=4; + v4 = KXXH32_round(v4, KXXH_readLE32(p, endian)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + KXXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return KXXH_OK; } -KXXH_PUBLIC_API unsigned int KXXH32_digest(const KXXH32_state_t *state_in) { - KXXH_endianess endian_detected = (KXXH_endianess)KXXH_CPU_LITTLE_ENDIAN; +KXXH_PUBLIC_API KXXH_errorcode KXXH32_update (KXXH32_state_t* state_in, const void* input, size_t len) +{ + KXXH_endianess endian_detected = (KXXH_endianess)KXXH_CPU_LITTLE_ENDIAN; - if ((endian_detected == KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) - return KXXH32_digest_endian(state_in, KXXH_littleEndian); - else - return KXXH32_digest_endian(state_in, KXXH_bigEndian); + if ((endian_detected==KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) + return KXXH32_update_endian(state_in, input, len, KXXH_littleEndian); + else + return KXXH32_update_endian(state_in, input, len, KXXH_bigEndian); +} + + +FORCE_INLINE U32 +KXXH32_digest_endian (const KXXH32_state_t* state, KXXH_endianess endian) +{ + U32 h32; + + if (state->large_len) { + h32 = KXXH_rotl32(state->v1, 1) + + KXXH_rotl32(state->v2, 7) + + KXXH_rotl32(state->v3, 12) + + KXXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return KXXH32_finalize(h32, state->mem32, state->memsize, endian, KXXH_aligned); +} + + +KXXH_PUBLIC_API unsigned int KXXH32_digest (const KXXH32_state_t* state_in) +{ + KXXH_endianess endian_detected = (KXXH_endianess)KXXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) + return KXXH32_digest_endian(state_in, KXXH_littleEndian); + else + return KXXH32_digest_endian(state_in, KXXH_bigEndian); } /*====== Canonical 
representation ======*/ /*! Default KXXH result types are basic unsigned 32 and 64 bits. - * The canonical representation follows human-readable write convention, aka - * big-endian (large digits first). These functions allow transformation of hash - * result into and from its canonical format. This way, hash values can be - * written into a file or buffer, remaining comparable across different systems. - */ +* The canonical representation follows human-readable write convention, aka big-endian (large digits first). +* These functions allow transformation of hash result into and from its canonical format. +* This way, hash values can be written into a file or buffer, remaining comparable across different systems. +*/ -KXXH_PUBLIC_API void KXXH32_canonicalFromHash(KXXH32_canonical_t *dst, - KXXH32_hash_t hash) { - KXXH_STATIC_ASSERT(sizeof(KXXH32_canonical_t) == sizeof(KXXH32_hash_t)); - if (KXXH_CPU_LITTLE_ENDIAN) - hash = KXXH_swap32(hash); - memcpy(dst, &hash, sizeof(*dst)); +KXXH_PUBLIC_API void KXXH32_canonicalFromHash(KXXH32_canonical_t* dst, KXXH32_hash_t hash) +{ + KXXH_STATIC_ASSERT(sizeof(KXXH32_canonical_t) == sizeof(KXXH32_hash_t)); + if (KXXH_CPU_LITTLE_ENDIAN) hash = KXXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); } -KXXH_PUBLIC_API KXXH32_hash_t -KXXH32_hashFromCanonical(const KXXH32_canonical_t *src) { - return KXXH_readBE32(src); +KXXH_PUBLIC_API KXXH32_hash_t KXXH32_hashFromCanonical(const KXXH32_canonical_t* src) +{ + return KXXH_readBE32(src); } #ifndef KXXH_NO_LONG_LONG /* ******************************************************************* - * 64-bit hash functions - *********************************************************************/ +* 64-bit hash functions +*********************************************************************/ /*====== Memory access ======*/ #ifndef MEM_MODULE -#define MEM_MODULE -#if !defined(__VMS) && \ - (defined(__cplusplus) || \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) -#include -typedef uint64_t U64; -#else -/* if compiler doesn't support unsigned long long, replace by another 64-bit - * type */ -typedef unsigned long long U64; -#endif +# define MEM_MODULE +# if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint64_t U64; +# else + /* if compiler doesn't support unsigned long long, replace by another 64-bit type */ + typedef unsigned long long U64; +# endif #endif -#if (defined(KXXH_FORCE_MEMORY_ACCESS) && (KXXH_FORCE_MEMORY_ACCESS == 2)) +#if (defined(KXXH_FORCE_MEMORY_ACCESS) && (KXXH_FORCE_MEMORY_ACCESS==2)) -/* Force direct memory access. Only works on CPU which support unaligned memory - * access in hardware */ -static U64 KXXH_read64(const void *memPtr) { - return *(const U64 *)memPtr; -} +/* Force direct memory access. 
Only works on CPU which support unaligned memory access in hardware */ +static U64 KXXH_read64(const void* memPtr) { return *(const U64*) memPtr; } -#elif (defined(KXXH_FORCE_MEMORY_ACCESS) && (KXXH_FORCE_MEMORY_ACCESS == 1)) +#elif (defined(KXXH_FORCE_MEMORY_ACCESS) && (KXXH_FORCE_MEMORY_ACCESS==1)) -/* __pack instructions are safer, but compiler specific, hence potentially - * problematic for some compilers */ +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ /* currently only defined for gcc and icc */ -typedef union { - U32 u32; - U64 u64; -} __attribute__((packed)) unalign64; -static U64 KXXH_read64(const void *ptr) { - return ((const unalign64 *)ptr)->u64; -} +typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign64; +static U64 KXXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } #else @@ -706,50 +615,49 @@ static U64 KXXH_read64(const void *ptr) { * see : http://stackoverflow.com/a/32095106/646947 */ -static U64 KXXH_read64(const void *memPtr) { - U64 val; - memcpy(&val, memPtr, sizeof(val)); - return val; +static U64 KXXH_read64(const void* memPtr) +{ + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; } -#endif /* KXXH_FORCE_DIRECT_MEMORY_ACCESS */ +#endif /* KXXH_FORCE_DIRECT_MEMORY_ACCESS */ -#if defined(_MSC_VER) /* Visual Studio */ -#define KXXH_swap64 _byteswap_uint64 +#if defined(_MSC_VER) /* Visual Studio */ +# define KXXH_swap64 _byteswap_uint64 #elif KXXH_GCC_VERSION >= 403 -#define KXXH_swap64 __builtin_bswap64 +# define KXXH_swap64 __builtin_bswap64 #else -static U64 KXXH_swap64(U64 x) { - return ((x << 56) & 0xff00000000000000ULL) | - ((x << 40) & 0x00ff000000000000ULL) | - ((x << 24) & 0x0000ff0000000000ULL) | - ((x << 8) & 0x000000ff00000000ULL) | - ((x >> 8) & 0x00000000ff000000ULL) | - ((x >> 24) & 0x0000000000ff0000ULL) | - ((x >> 40) & 0x000000000000ff00ULL) | - ((x >> 56) & 0x00000000000000ffULL); +static U64 KXXH_swap64 (U64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); } #endif -FORCE_INLINE U64 KXXH_readLE64_align(const void *ptr, - KXXH_endianess endian, - KXXH_alignment align) { - if (align == KXXH_unaligned) - return endian == KXXH_littleEndian ? KXXH_read64(ptr) - : KXXH_swap64(KXXH_read64(ptr)); - else - return endian == KXXH_littleEndian - ? *(const U64 *)ptr - : KXXH_swap64(*(const U64 *)ptr); +FORCE_INLINE U64 KXXH_readLE64_align(const void* ptr, KXXH_endianess endian, KXXH_alignment align) +{ + if (align==KXXH_unaligned) + return endian==KXXH_littleEndian ? KXXH_read64(ptr) : KXXH_swap64(KXXH_read64(ptr)); + else + return endian==KXXH_littleEndian ? *(const U64*)ptr : KXXH_swap64(*(const U64*)ptr); } -FORCE_INLINE U64 KXXH_readLE64(const void *ptr, KXXH_endianess endian) { - return KXXH_readLE64_align(ptr, endian, KXXH_unaligned); +FORCE_INLINE U64 KXXH_readLE64(const void* ptr, KXXH_endianess endian) +{ + return KXXH_readLE64_align(ptr, endian, KXXH_unaligned); } -static U64 KXXH_readBE64(const void *ptr) { - return KXXH_CPU_LITTLE_ENDIAN ? KXXH_swap64(KXXH_read64(ptr)) - : KXXH_read64(ptr); +static U64 KXXH_readBE64(const void* ptr) +{ + return KXXH_CPU_LITTLE_ENDIAN ? 
KXXH_swap64(KXXH_read64(ptr)) : KXXH_read64(ptr); } @@ -757,233 +665,195 @@ static U64 KXXH_readBE64(const void *ptr) { static const U64 PRIME64_1 = 11400714785074694791ULL; static const U64 PRIME64_2 = 14029467366897019727ULL; -static const U64 PRIME64_3 = 1609587929392839161ULL; -static const U64 PRIME64_4 = 9650029242287828579ULL; -static const U64 PRIME64_5 = 2870177450012600261ULL; +static const U64 PRIME64_3 = 1609587929392839161ULL; +static const U64 PRIME64_4 = 9650029242287828579ULL; +static const U64 PRIME64_5 = 2870177450012600261ULL; -static U64 KXXH64_round(U64 acc, U64 input) { - acc += input * PRIME64_2; - acc = KXXH_rotl64(acc, 31); - acc *= PRIME64_1; - return acc; +static U64 KXXH64_round(U64 acc, U64 input) +{ + acc += input * PRIME64_2; + acc = KXXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; } -static U64 KXXH64_mergeRound(U64 acc, U64 val) { - val = KXXH64_round(0, val); - acc ^= val; - acc = acc * PRIME64_1 + PRIME64_4; - return acc; +static U64 KXXH64_mergeRound(U64 acc, U64 val) +{ + val = KXXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; } -static U64 KXXH64_avalanche(U64 h64) { - h64 ^= h64 >> 33; - h64 *= PRIME64_2; - h64 ^= h64 >> 29; - h64 *= PRIME64_3; - h64 ^= h64 >> 32; - return h64; +static U64 KXXH64_avalanche(U64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; } #define KXXH_get64bits(p) KXXH_readLE64_align(p, endian, align) -static U64 KXXH64_finalize(U64 h64, - const void *ptr, - size_t len, - KXXH_endianess endian, - KXXH_alignment align) { - const BYTE *p = (const BYTE *)ptr; +static U64 +KXXH64_finalize(U64 h64, const void* ptr, size_t len, + KXXH_endianess endian, KXXH_alignment align) +{ + const BYTE* p = (const BYTE*)ptr; -#define PROCESS1_64 \ - h64 ^= (*p++) * PRIME64_5; \ - h64 = KXXH_rotl64(h64, 11) * PRIME64_1; +#define PROCESS1_64 \ + h64 ^= (*p++) * PRIME64_5; \ + h64 = KXXH_rotl64(h64, 11) * PRIME64_1; -#define PROCESS4_64 \ - h64 ^= (U64)(KXXH_get32bits(p)) * PRIME64_1; \ - p += 4; \ - h64 = KXXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; +#define PROCESS4_64 \ + h64 ^= (U64)(KXXH_get32bits(p)) * PRIME64_1; \ + p+=4; \ + h64 = KXXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; -#define PROCESS8_64 \ - { \ - U64 const k1 = KXXH64_round(0, KXXH_get64bits(p)); \ - p += 8; \ - h64 ^= k1; \ - h64 = KXXH_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; \ - } - - switch (len & 31) { - case 24: - PROCESS8_64; - /* fallthrough */ - case 16: - PROCESS8_64; - /* fallthrough */ - case 8: - PROCESS8_64; - return KXXH64_avalanche(h64); - - case 28: - PROCESS8_64; - /* fallthrough */ - case 20: - PROCESS8_64; - /* fallthrough */ - case 12: - PROCESS8_64; - /* fallthrough */ - case 4: - PROCESS4_64; - return KXXH64_avalanche(h64); - - case 25: - PROCESS8_64; - /* fallthrough */ - case 17: - PROCESS8_64; - /* fallthrough */ - case 9: - PROCESS8_64; - PROCESS1_64; - return KXXH64_avalanche(h64); - - case 29: - PROCESS8_64; - /* fallthrough */ - case 21: - PROCESS8_64; - /* fallthrough */ - case 13: - PROCESS8_64; - /* fallthrough */ - case 5: - PROCESS4_64; - PROCESS1_64; - return KXXH64_avalanche(h64); - - case 26: - PROCESS8_64; - /* fallthrough */ - case 18: - PROCESS8_64; - /* fallthrough */ - case 10: - PROCESS8_64; - PROCESS1_64; - PROCESS1_64; - return KXXH64_avalanche(h64); - - case 30: - PROCESS8_64; - /* fallthrough */ - case 22: - PROCESS8_64; - /* fallthrough */ - case 14: - PROCESS8_64; - /* fallthrough */ - case 6: - PROCESS4_64; - 
PROCESS1_64; - PROCESS1_64; - return KXXH64_avalanche(h64); - - case 27: - PROCESS8_64; - /* fallthrough */ - case 19: - PROCESS8_64; - /* fallthrough */ - case 11: - PROCESS8_64; - PROCESS1_64; - PROCESS1_64; - PROCESS1_64; - return KXXH64_avalanche(h64); - - case 31: - PROCESS8_64; - /* fallthrough */ - case 23: - PROCESS8_64; - /* fallthrough */ - case 15: - PROCESS8_64; - /* fallthrough */ - case 7: - PROCESS4_64; - /* fallthrough */ - case 3: - PROCESS1_64; - /* fallthrough */ - case 2: - PROCESS1_64; - /* fallthrough */ - case 1: - PROCESS1_64; - /* fallthrough */ - case 0: - return KXXH64_avalanche(h64); - } - - /* impossible to reach */ - assert(0); - return 0; /* unreachable, but some compilers complain without it */ +#define PROCESS8_64 { \ + U64 const k1 = KXXH64_round(0, KXXH_get64bits(p)); \ + p+=8; \ + h64 ^= k1; \ + h64 = KXXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ } -FORCE_INLINE U64 KXXH64_endian_align(const void *input, - size_t len, - U64 seed, - KXXH_endianess endian, - KXXH_alignment align) { - const BYTE *p = (const BYTE *)input; - const BYTE *bEnd = p + len; - U64 h64; + switch(len&31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return KXXH64_avalanche(h64); -#if defined(KXXH_ACCEPT_NULL_INPUT_POINTER) && \ - (KXXH_ACCEPT_NULL_INPUT_POINTER >= 1) - if (p == NULL) { - len = 0; - bEnd = p = (const BYTE *)(size_t)32; - } + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return KXXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return KXXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return KXXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return KXXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return KXXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return KXXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return KXXH64_avalanche(h64); + } + + /* impossible to reach */ + assert(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +FORCE_INLINE U64 +KXXH64_endian_align(const void* input, size_t len, U64 seed, + KXXH_endianess endian, KXXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U64 h64; + +#if defined(KXXH_ACCEPT_NULL_INPUT_POINTER) && (KXXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (p==NULL) { + len=0; + bEnd=p=(const BYTE*)(size_t)32; + } #endif - if (len >= 32) { - const BYTE *const limit = bEnd - 32; - U64 v1 = seed + PRIME64_1 + PRIME64_2; - U64 v2 = seed + PRIME64_2; - U64 v3 = seed + 
0; - U64 v4 = seed - PRIME64_1; + if (len>=32) { + const BYTE* const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; - do { - v1 = KXXH64_round(v1, KXXH_get64bits(p)); - p += 8; - v2 = KXXH64_round(v2, KXXH_get64bits(p)); - p += 8; - v3 = KXXH64_round(v3, KXXH_get64bits(p)); - p += 8; - v4 = KXXH64_round(v4, KXXH_get64bits(p)); - p += 8; - } while (p <= limit); + do { + v1 = KXXH64_round(v1, KXXH_get64bits(p)); p+=8; + v2 = KXXH64_round(v2, KXXH_get64bits(p)); p+=8; + v3 = KXXH64_round(v3, KXXH_get64bits(p)); p+=8; + v4 = KXXH64_round(v4, KXXH_get64bits(p)); p+=8; + } while (p<=limit); - h64 = KXXH_rotl64(v1, 1) + KXXH_rotl64(v2, 7) + - KXXH_rotl64(v3, 12) + KXXH_rotl64(v4, 18); - h64 = KXXH64_mergeRound(h64, v1); - h64 = KXXH64_mergeRound(h64, v2); - h64 = KXXH64_mergeRound(h64, v3); - h64 = KXXH64_mergeRound(h64, v4); + h64 = KXXH_rotl64(v1, 1) + KXXH_rotl64(v2, 7) + KXXH_rotl64(v3, 12) + KXXH_rotl64(v4, 18); + h64 = KXXH64_mergeRound(h64, v1); + h64 = KXXH64_mergeRound(h64, v2); + h64 = KXXH64_mergeRound(h64, v3); + h64 = KXXH64_mergeRound(h64, v4); - } else { - h64 = seed + PRIME64_5; - } + } else { + h64 = seed + PRIME64_5; + } - h64 += (U64)len; + h64 += (U64) len; - return KXXH64_finalize(h64, p, len, endian, align); + return KXXH64_finalize(h64, p, len, endian, align); } -KXXH_PUBLIC_API unsigned long long -KXXH64(const void *input, size_t len, unsigned long long seed) { +KXXH_PUBLIC_API unsigned long long KXXH64 (const void* input, size_t len, unsigned long long seed) +{ #if 0 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ KXXH64_state_t state; @@ -991,197 +861,170 @@ KXXH64(const void *input, size_t len, unsigned long long seed) { KXXH64_update(&state, input, len); return KXXH64_digest(&state); #else - KXXH_endianess endian_detected = (KXXH_endianess)KXXH_CPU_LITTLE_ENDIAN; + KXXH_endianess endian_detected = (KXXH_endianess)KXXH_CPU_LITTLE_ENDIAN; - if (KXXH_FORCE_ALIGN_CHECK) { - if ((((size_t)input) & 7) == - 0) { /* Input is aligned, let's leverage the speed advantage - */ - if ((endian_detected == KXXH_littleEndian) || - KXXH_FORCE_NATIVE_FORMAT) - return KXXH64_endian_align(input, len, seed, - KXXH_littleEndian, - KXXH_aligned); - else - return KXXH64_endian_align(input, len, seed, - KXXH_bigEndian, - KXXH_aligned); - } - } + if (KXXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + if ((endian_detected==KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) + return KXXH64_endian_align(input, len, seed, KXXH_littleEndian, KXXH_aligned); + else + return KXXH64_endian_align(input, len, seed, KXXH_bigEndian, KXXH_aligned); + } } - if ((endian_detected == KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) - return KXXH64_endian_align(input, len, seed, KXXH_littleEndian, - KXXH_unaligned); - else - return KXXH64_endian_align(input, len, seed, KXXH_bigEndian, - KXXH_unaligned); + if ((endian_detected==KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) + return KXXH64_endian_align(input, len, seed, KXXH_littleEndian, KXXH_unaligned); + else + return KXXH64_endian_align(input, len, seed, KXXH_bigEndian, KXXH_unaligned); #endif } /*====== Hash Streaming ======*/ -KXXH_PUBLIC_API KXXH64_state_t *KXXH64_createState(void) { - return (KXXH64_state_t *)KXXH_malloc(sizeof(KXXH64_state_t)); +KXXH_PUBLIC_API KXXH64_state_t* KXXH64_createState(void) +{ + return 
(KXXH64_state_t*)KXXH_malloc(sizeof(KXXH64_state_t)); } -KXXH_PUBLIC_API KXXH_errorcode KXXH64_freeState(KXXH64_state_t *statePtr) { - KXXH_free(statePtr); +KXXH_PUBLIC_API KXXH_errorcode KXXH64_freeState(KXXH64_state_t* statePtr) +{ + KXXH_free(statePtr); + return KXXH_OK; +} + +KXXH_PUBLIC_API void KXXH64_copyState(KXXH64_state_t* dstState, const KXXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +KXXH_PUBLIC_API KXXH_errorcode KXXH64_reset(KXXH64_state_t* statePtr, unsigned long long seed) +{ + KXXH64_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return KXXH_OK; +} + +FORCE_INLINE KXXH_errorcode +KXXH64_update_endian (KXXH64_state_t* state, const void* input, size_t len, KXXH_endianess endian) +{ + if (input==NULL) +#if defined(KXXH_ACCEPT_NULL_INPUT_POINTER) && (KXXH_ACCEPT_NULL_INPUT_POINTER>=1) return KXXH_OK; -} - -KXXH_PUBLIC_API void KXXH64_copyState(KXXH64_state_t *dstState, - const KXXH64_state_t *srcState) { - memcpy(dstState, srcState, sizeof(*dstState)); -} - -KXXH_PUBLIC_API KXXH_errorcode KXXH64_reset(KXXH64_state_t *statePtr, - unsigned long long seed) { - KXXH64_state_t state; /* using a local state to memcpy() in order to - avoid strict-aliasing warnings */ - memset(&state, 0, sizeof(state)); - state.v1 = seed + PRIME64_1 + PRIME64_2; - state.v2 = seed + PRIME64_2; - state.v3 = seed + 0; - state.v4 = seed - PRIME64_1; - /* do not write into reserved, planned to be removed in a future version - */ - memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); - return KXXH_OK; -} - -FORCE_INLINE KXXH_errorcode KXXH64_update_endian(KXXH64_state_t *state, - const void *input, - size_t len, - KXXH_endianess endian) { - if (input == NULL) -#if defined(KXXH_ACCEPT_NULL_INPUT_POINTER) && \ - (KXXH_ACCEPT_NULL_INPUT_POINTER >= 1) - return KXXH_OK; #else - return KXXH_ERROR; + return KXXH_ERROR; #endif - { - const BYTE *p = (const BYTE *)input; - const BYTE *const bEnd = p + len; + { const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; - state->total_len += len; + state->total_len += len; - if (state->memsize + len < 32) { /* fill in tmp buffer */ - KXXH_memcpy(((BYTE *)state->mem64) + state->memsize, - input, len); - state->memsize += (U32)len; - return KXXH_OK; - } - - if (state->memsize) { /* tmp buffer is full */ - KXXH_memcpy(((BYTE *)state->mem64) + state->memsize, - input, 32 - state->memsize); - state->v1 = KXXH64_round( - state->v1, KXXH_readLE64(state->mem64 + 0, endian)); - state->v2 = KXXH64_round( - state->v2, KXXH_readLE64(state->mem64 + 1, endian)); - state->v3 = KXXH64_round( - state->v3, KXXH_readLE64(state->mem64 + 2, endian)); - state->v4 = KXXH64_round( - state->v4, KXXH_readLE64(state->mem64 + 3, endian)); - p += 32 - state->memsize; - state->memsize = 0; - } - - if (p + 32 <= bEnd) { - const BYTE *const limit = bEnd - 32; - U64 v1 = state->v1; - U64 v2 = state->v2; - U64 v3 = state->v3; - U64 v4 = state->v4; - - do { - v1 = KXXH64_round(v1, KXXH_readLE64(p, endian)); - p += 8; - v2 = KXXH64_round(v2, KXXH_readLE64(p, endian)); - p += 8; - v3 = KXXH64_round(v3, KXXH_readLE64(p, endian)); - p += 8; - v4 = KXXH64_round(v4, KXXH_readLE64(p, endian)); - 
p += 8; - } while (p <= limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } - - if (p < bEnd) { - KXXH_memcpy(state->mem64, p, (size_t)(bEnd - p)); - state->memsize = (unsigned)(bEnd - p); - } + if (state->memsize + len < 32) { /* fill in tmp buffer */ + KXXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); + state->memsize += (U32)len; + return KXXH_OK; } - return KXXH_OK; -} - -KXXH_PUBLIC_API KXXH_errorcode KXXH64_update(KXXH64_state_t *state_in, - const void *input, - size_t len) { - KXXH_endianess endian_detected = (KXXH_endianess)KXXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected == KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) - return KXXH64_update_endian(state_in, input, len, - KXXH_littleEndian); - else - return KXXH64_update_endian(state_in, input, len, KXXH_bigEndian); -} - -FORCE_INLINE U64 KXXH64_digest_endian(const KXXH64_state_t *state, - KXXH_endianess endian) { - U64 h64; - - if (state->total_len >= 32) { - U64 const v1 = state->v1; - U64 const v2 = state->v2; - U64 const v3 = state->v3; - U64 const v4 = state->v4; - - h64 = KXXH_rotl64(v1, 1) + KXXH_rotl64(v2, 7) + - KXXH_rotl64(v3, 12) + KXXH_rotl64(v4, 18); - h64 = KXXH64_mergeRound(h64, v1); - h64 = KXXH64_mergeRound(h64, v2); - h64 = KXXH64_mergeRound(h64, v3); - h64 = KXXH64_mergeRound(h64, v4); - } else { - h64 = state->v3 /*seed*/ + PRIME64_5; + if (state->memsize) { /* tmp buffer is full */ + KXXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = KXXH64_round(state->v1, KXXH_readLE64(state->mem64+0, endian)); + state->v2 = KXXH64_round(state->v2, KXXH_readLE64(state->mem64+1, endian)); + state->v3 = KXXH64_round(state->v3, KXXH_readLE64(state->mem64+2, endian)); + state->v4 = KXXH64_round(state->v4, KXXH_readLE64(state->mem64+3, endian)); + p += 32-state->memsize; + state->memsize = 0; } - h64 += (U64)state->total_len; + if (p+32 <= bEnd) { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; - return KXXH64_finalize(h64, state->mem64, (size_t)state->total_len, - endian, KXXH_aligned); + do { + v1 = KXXH64_round(v1, KXXH_readLE64(p, endian)); p+=8; + v2 = KXXH64_round(v2, KXXH_readLE64(p, endian)); p+=8; + v3 = KXXH64_round(v3, KXXH_readLE64(p, endian)); p+=8; + v4 = KXXH64_round(v4, KXXH_readLE64(p, endian)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + KXXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return KXXH_OK; } -KXXH_PUBLIC_API unsigned long long KXXH64_digest(const KXXH64_state_t *state_in) { - KXXH_endianess endian_detected = (KXXH_endianess)KXXH_CPU_LITTLE_ENDIAN; +KXXH_PUBLIC_API KXXH_errorcode KXXH64_update (KXXH64_state_t* state_in, const void* input, size_t len) +{ + KXXH_endianess endian_detected = (KXXH_endianess)KXXH_CPU_LITTLE_ENDIAN; - if ((endian_detected == KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) - return KXXH64_digest_endian(state_in, KXXH_littleEndian); - else - return KXXH64_digest_endian(state_in, KXXH_bigEndian); + if ((endian_detected==KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) + return KXXH64_update_endian(state_in, input, len, KXXH_littleEndian); + else + return KXXH64_update_endian(state_in, input, len, KXXH_bigEndian); +} + +FORCE_INLINE U64 KXXH64_digest_endian (const KXXH64_state_t* state, KXXH_endianess endian) +{ + U64 h64; + + if (state->total_len >= 32) { + U64 const v1 = 
state->v1; + U64 const v2 = state->v2; + U64 const v3 = state->v3; + U64 const v4 = state->v4; + + h64 = KXXH_rotl64(v1, 1) + KXXH_rotl64(v2, 7) + KXXH_rotl64(v3, 12) + KXXH_rotl64(v4, 18); + h64 = KXXH64_mergeRound(h64, v1); + h64 = KXXH64_mergeRound(h64, v2); + h64 = KXXH64_mergeRound(h64, v3); + h64 = KXXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (U64) state->total_len; + + return KXXH64_finalize(h64, state->mem64, (size_t)state->total_len, endian, KXXH_aligned); +} + +KXXH_PUBLIC_API unsigned long long KXXH64_digest (const KXXH64_state_t* state_in) +{ + KXXH_endianess endian_detected = (KXXH_endianess)KXXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==KXXH_littleEndian) || KXXH_FORCE_NATIVE_FORMAT) + return KXXH64_digest_endian(state_in, KXXH_littleEndian); + else + return KXXH64_digest_endian(state_in, KXXH_bigEndian); } /*====== Canonical representation ======*/ -KXXH_PUBLIC_API void KXXH64_canonicalFromHash(KXXH64_canonical_t *dst, - KXXH64_hash_t hash) { - KXXH_STATIC_ASSERT(sizeof(KXXH64_canonical_t) == sizeof(KXXH64_hash_t)); - if (KXXH_CPU_LITTLE_ENDIAN) - hash = KXXH_swap64(hash); - memcpy(dst, &hash, sizeof(*dst)); +KXXH_PUBLIC_API void KXXH64_canonicalFromHash(KXXH64_canonical_t* dst, KXXH64_hash_t hash) +{ + KXXH_STATIC_ASSERT(sizeof(KXXH64_canonical_t) == sizeof(KXXH64_hash_t)); + if (KXXH_CPU_LITTLE_ENDIAN) hash = KXXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); } -KXXH_PUBLIC_API KXXH64_hash_t -KXXH64_hashFromCanonical(const KXXH64_canonical_t *src) { - return KXXH_readBE64(src); +KXXH_PUBLIC_API KXXH64_hash_t KXXH64_hashFromCanonical(const KXXH64_canonical_t* src) +{ + return KXXH_readBE64(src); } -#endif /* KXXH_NO_LONG_LONG */ +#endif /* KXXH_NO_LONG_LONG */ diff --git a/src/third_party/librdkafka/dist/src/rdxxhash.h b/src/third_party/librdkafka/dist/src/rdxxhash.h index fb6c4b0df0b..8dc0f136050 100644 --- a/src/third_party/librdkafka/dist/src/rdxxhash.h +++ b/src/third_party/librdkafka/dist/src/rdxxhash.h @@ -37,8 +37,7 @@ xxHash is an extremely fast Hash algorithm, running at RAM speed limits. It also successfully passes all tests from the SMHasher suite. -Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo -@3GHz) +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) Name Speed Q.Score Author xxHash 5.4 GB/s 10 @@ -68,16 +67,16 @@ KXXH32 6.8 GB/s 6.0 GB/s #ifndef KXXHASH_H_5627135585666179 #define KXXHASH_H_5627135585666179 1 -#if defined(__cplusplus) +#if defined (__cplusplus) extern "C" { #endif /* **************************** - * Definitions - ******************************/ -#include /* size_t */ -typedef enum { KXXH_OK = 0, KXXH_ERROR } KXXH_errorcode; +* Definitions +******************************/ +#include /* size_t */ +typedef enum { KXXH_OK=0, KXXH_ERROR } KXXH_errorcode; /* **************************** @@ -94,191 +93,153 @@ typedef enum { KXXH_OK = 0, KXXH_ERROR } KXXH_errorcode; * It's not useful to compile and link it as a separate module. 
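 *
 * A hypothetical consumer translation unit therefore just does:
 *     #define KXXH_INLINE_ALL
 *     #include "rdxxhash.h"
 * and gets every KXXH32*() / KXXH64*() function as a static, inlinable
 * definition, with no separate rdxxhash object file to link.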
*/ #if defined(KXXH_INLINE_ALL) || defined(KXXH_PRIVATE_API) -#ifndef KXXH_STATIC_LINKING_ONLY -#define KXXH_STATIC_LINKING_ONLY -#endif -#if defined(__GNUC__) -#define KXXH_PUBLIC_API static __inline __attribute__((unused)) -#elif defined(__cplusplus) || \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -#define KXXH_PUBLIC_API static inline -#elif defined(_MSC_VER) -#define KXXH_PUBLIC_API static __inline +# ifndef KXXH_STATIC_LINKING_ONLY +# define KXXH_STATIC_LINKING_ONLY +# endif +# if defined(__GNUC__) +# define KXXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define KXXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define KXXH_PUBLIC_API static __inline +# else + /* this version may generate warnings for unused static functions */ +# define KXXH_PUBLIC_API static +# endif #else -/* this version may generate warnings for unused static functions */ -#define KXXH_PUBLIC_API static -#endif -#else -#define KXXH_PUBLIC_API /* do nothing */ -#endif /* KXXH_INLINE_ALL || KXXH_PRIVATE_API */ +# define KXXH_PUBLIC_API /* do nothing */ +#endif /* KXXH_INLINE_ALL || KXXH_PRIVATE_API */ /*! KXXH_NAMESPACE, aka Namespace Emulation : * - * If you want to include _and expose_ xxHash functions from within your own - * library, but also want to avoid symbol collisions with other libraries which - * may also include xxHash, + * If you want to include _and expose_ xxHash functions from within your own library, + * but also want to avoid symbol collisions with other libraries which may also include xxHash, * - * you can use KXXH_NAMESPACE, to automatically prefix any public symbol from - * xxhash library with the value of KXXH_NAMESPACE (therefore, avoid NULL and - * numeric values). + * you can use KXXH_NAMESPACE, to automatically prefix any public symbol from xxhash library + * with the value of KXXH_NAMESPACE (therefore, avoid NULL and numeric values). * - * Note that no change is required within the calling program as long as it - * includes `xxhash.h` : regular symbol name will be automatically translated by - * this header. + * Note that no change is required within the calling program as long as it includes `xxhash.h` : + * regular symbol name will be automatically translated by this header. 
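+ *
+ * For example, compiling with a hypothetical -DKXXH_NAMESPACE=rd_ would emit
+ * the public symbols as rd_KXXH32, rd_KXXH64_digest, and so on, while calling
+ * code keeps writing KXXH32() / KXXH64_digest() unchanged.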
 */
 #ifdef KXXH_NAMESPACE
-#define KXXH_CAT(A, B) A##B
-#define KXXH_NAME2(A, B) KXXH_CAT(A, B)
-#define KXXH_versionNumber KXXH_NAME2(KXXH_NAMESPACE, KXXH_versionNumber)
-#define KXXH32 KXXH_NAME2(KXXH_NAMESPACE, KXXH32)
-#define KXXH32_createState KXXH_NAME2(KXXH_NAMESPACE, KXXH32_createState)
-#define KXXH32_freeState KXXH_NAME2(KXXH_NAMESPACE, KXXH32_freeState)
-#define KXXH32_reset KXXH_NAME2(KXXH_NAMESPACE, KXXH32_reset)
-#define KXXH32_update KXXH_NAME2(KXXH_NAMESPACE, KXXH32_update)
-#define KXXH32_digest KXXH_NAME2(KXXH_NAMESPACE, KXXH32_digest)
-#define KXXH32_copyState KXXH_NAME2(KXXH_NAMESPACE, KXXH32_copyState)
-#define KXXH32_canonicalFromHash \
-    KXXH_NAME2(KXXH_NAMESPACE, KXXH32_canonicalFromHash)
-#define KXXH32_hashFromCanonical \
-    KXXH_NAME2(KXXH_NAMESPACE, KXXH32_hashFromCanonical)
-#define KXXH64 KXXH_NAME2(KXXH_NAMESPACE, KXXH64)
-#define KXXH64_createState KXXH_NAME2(KXXH_NAMESPACE, KXXH64_createState)
-#define KXXH64_freeState KXXH_NAME2(KXXH_NAMESPACE, KXXH64_freeState)
-#define KXXH64_reset KXXH_NAME2(KXXH_NAMESPACE, KXXH64_reset)
-#define KXXH64_update KXXH_NAME2(KXXH_NAMESPACE, KXXH64_update)
-#define KXXH64_digest KXXH_NAME2(KXXH_NAMESPACE, KXXH64_digest)
-#define KXXH64_copyState KXXH_NAME2(KXXH_NAMESPACE, KXXH64_copyState)
-#define KXXH64_canonicalFromHash \
-    KXXH_NAME2(KXXH_NAMESPACE, KXXH64_canonicalFromHash)
-#define KXXH64_hashFromCanonical \
-    KXXH_NAME2(KXXH_NAMESPACE, KXXH64_hashFromCanonical)
+# define KXXH_CAT(A,B) A##B
+# define KXXH_NAME2(A,B) KXXH_CAT(A,B)
+# define KXXH_versionNumber KXXH_NAME2(KXXH_NAMESPACE, KXXH_versionNumber)
+# define KXXH32 KXXH_NAME2(KXXH_NAMESPACE, KXXH32)
+# define KXXH32_createState KXXH_NAME2(KXXH_NAMESPACE, KXXH32_createState)
+# define KXXH32_freeState KXXH_NAME2(KXXH_NAMESPACE, KXXH32_freeState)
+# define KXXH32_reset KXXH_NAME2(KXXH_NAMESPACE, KXXH32_reset)
+# define KXXH32_update KXXH_NAME2(KXXH_NAMESPACE, KXXH32_update)
+# define KXXH32_digest KXXH_NAME2(KXXH_NAMESPACE, KXXH32_digest)
+# define KXXH32_copyState KXXH_NAME2(KXXH_NAMESPACE, KXXH32_copyState)
+# define KXXH32_canonicalFromHash KXXH_NAME2(KXXH_NAMESPACE, KXXH32_canonicalFromHash)
+# define KXXH32_hashFromCanonical KXXH_NAME2(KXXH_NAMESPACE, KXXH32_hashFromCanonical)
+# define KXXH64 KXXH_NAME2(KXXH_NAMESPACE, KXXH64)
+# define KXXH64_createState KXXH_NAME2(KXXH_NAMESPACE, KXXH64_createState)
+# define KXXH64_freeState KXXH_NAME2(KXXH_NAMESPACE, KXXH64_freeState)
+# define KXXH64_reset KXXH_NAME2(KXXH_NAMESPACE, KXXH64_reset)
+# define KXXH64_update KXXH_NAME2(KXXH_NAMESPACE, KXXH64_update)
+# define KXXH64_digest KXXH_NAME2(KXXH_NAMESPACE, KXXH64_digest)
+# define KXXH64_copyState KXXH_NAME2(KXXH_NAMESPACE, KXXH64_copyState)
+# define KXXH64_canonicalFromHash KXXH_NAME2(KXXH_NAMESPACE, KXXH64_canonicalFromHash)
+# define KXXH64_hashFromCanonical KXXH_NAME2(KXXH_NAMESPACE, KXXH64_hashFromCanonical)
 #endif
 
 
 /* *************************************
- * Version
- ***************************************/
-#define KXXH_VERSION_MAJOR 0
-#define KXXH_VERSION_MINOR 6
-#define KXXH_VERSION_RELEASE 5
-#define KXXH_VERSION_NUMBER \
-    (KXXH_VERSION_MAJOR * 100 * 100 + KXXH_VERSION_MINOR * 100 + \
-     KXXH_VERSION_RELEASE)
-KXXH_PUBLIC_API unsigned KXXH_versionNumber(void);
+* Version
+***************************************/
+#define KXXH_VERSION_MAJOR 0
+#define KXXH_VERSION_MINOR 6
+#define KXXH_VERSION_RELEASE 5
+#define KXXH_VERSION_NUMBER (KXXH_VERSION_MAJOR *100*100 + KXXH_VERSION_MINOR *100 + KXXH_VERSION_RELEASE)
+KXXH_PUBLIC_API unsigned KXXH_versionNumber (void);
 
 
 /*-**********************************************************************
- * 32-bit hash
- ************************************************************************/
+*  32-bit hash
+************************************************************************/
 typedef unsigned int KXXH32_hash_t;
 
 /*! KXXH32() :
-    Calculate the 32-bit hash of sequence "length" bytes stored at memory
-    address "input". The memory between input & input+length must be valid
-    (allocated and read-accessible). "seed" can be used to alter the result
-    predictably.
-    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
- */
-KXXH_PUBLIC_API KXXH32_hash_t KXXH32(const void *input,
-                                     size_t length,
-                                     unsigned int seed);
+    Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input".
+    The memory between input & input+length must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */
+KXXH_PUBLIC_API KXXH32_hash_t KXXH32 (const void* input, size_t length, unsigned int seed);
 
 /*====== Streaming ======*/
-typedef struct KXXH32_state_s KXXH32_state_t; /* incomplete type */
-KXXH_PUBLIC_API KXXH32_state_t *KXXH32_createState(void);
-KXXH_PUBLIC_API KXXH_errorcode KXXH32_freeState(KXXH32_state_t *statePtr);
-KXXH_PUBLIC_API void KXXH32_copyState(KXXH32_state_t *dst_state,
-                                      const KXXH32_state_t *src_state);
+typedef struct KXXH32_state_s KXXH32_state_t; /* incomplete type */
+KXXH_PUBLIC_API KXXH32_state_t* KXXH32_createState(void);
+KXXH_PUBLIC_API KXXH_errorcode KXXH32_freeState(KXXH32_state_t* statePtr);
+KXXH_PUBLIC_API void KXXH32_copyState(KXXH32_state_t* dst_state, const KXXH32_state_t* src_state);
 
-KXXH_PUBLIC_API KXXH_errorcode KXXH32_reset(KXXH32_state_t *statePtr,
-                                            unsigned int seed);
-KXXH_PUBLIC_API KXXH_errorcode KXXH32_update(KXXH32_state_t *statePtr,
-                                             const void *input,
-                                             size_t length);
-KXXH_PUBLIC_API KXXH32_hash_t KXXH32_digest(const KXXH32_state_t *statePtr);
+KXXH_PUBLIC_API KXXH_errorcode KXXH32_reset (KXXH32_state_t* statePtr, unsigned int seed);
+KXXH_PUBLIC_API KXXH_errorcode KXXH32_update (KXXH32_state_t* statePtr, const void* input, size_t length);
+KXXH_PUBLIC_API KXXH32_hash_t KXXH32_digest (const KXXH32_state_t* statePtr);
 
 /*
- * Streaming functions generate the xxHash of an input provided in multiple
- * segments. Note that, for small input, they are slower than single-call
- * functions, due to state management. For small inputs, prefer `KXXH32()` and
- * `KXXH64()`, which are better optimized.
+ * Streaming functions generate the xxHash of an input provided in multiple segments.
+ * Note that, for small input, they are slower than single-call functions, due to state management.
+ * For small inputs, prefer `KXXH32()` and `KXXH64()`, which are better optimized.
 *
 * KXXH state must first be allocated, using KXXH*_createState() .
 *
 * Start a new hash by initializing state with a seed, using KXXH*_reset().
 *
- * Then, feed the hash state by calling KXXH*_update() as many times as
- * necessary. The function returns an error code, with 0 meaning OK, and any
- * other value meaning there is an error.
+ * Then, feed the hash state by calling KXXH*_update() as many times as necessary.
+ * The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
 *
 * Finally, a hash value can be produced anytime, by using KXXH*_digest().
 * This function returns the nn-bits hash as an int or long long.
 *
- * It's still possible to continue inserting input into the hash state after a
- * digest, and generate some new hashes later on, by calling again
- * KXXH*_digest().
+ * It's still possible to continue inserting input into the hash state after a digest,
+ * and generate some new hashes later on, by calling again KXXH*_digest().
 *
 * When done, free KXXH state space if it was allocated dynamically.
 */
 
 /*====== Canonical representation ======*/
 
-typedef struct {
-    unsigned char digest[4];
-} KXXH32_canonical_t;
-KXXH_PUBLIC_API void KXXH32_canonicalFromHash(KXXH32_canonical_t *dst,
-                                              KXXH32_hash_t hash);
-KXXH_PUBLIC_API KXXH32_hash_t
-KXXH32_hashFromCanonical(const KXXH32_canonical_t *src);
+typedef struct { unsigned char digest[4]; } KXXH32_canonical_t;
+KXXH_PUBLIC_API void KXXH32_canonicalFromHash(KXXH32_canonical_t* dst, KXXH32_hash_t hash);
+KXXH_PUBLIC_API KXXH32_hash_t KXXH32_hashFromCanonical(const KXXH32_canonical_t* src);
 
 /* Default result type for KXXH functions are primitive unsigned 32 and 64 bits.
- * The canonical representation uses human-readable write convention, aka
- * big-endian (large digits first). These functions allow transformation of hash
- * result into and from its canonical format. This way, hash values can be
- * written into a file / memory, and remain comparable on different systems and
- * programs.
+ * The canonical representation uses human-readable write convention, aka big-endian (large digits first).
+ * These functions allow transformation of hash result into and from its canonical format.
+ * This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
 */
 
 
 #ifndef KXXH_NO_LONG_LONG
 /*-**********************************************************************
- * 64-bit hash
- ************************************************************************/
+*  64-bit hash
+************************************************************************/
 typedef unsigned long long KXXH64_hash_t;
 
 /*! KXXH64() :
-    Calculate the 64-bit hash of sequence of length "len" stored at memory
-    address "input". "seed" can be used to alter the result predictably. This
-    function runs faster on 64-bit systems, but slower on 32-bit systems (see
-    benchmark).
+    Calculate the 64-bit hash of sequence of length "len" stored at memory address "input".
+    "seed" can be used to alter the result predictably.
+    This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark).
 */
-KXXH_PUBLIC_API KXXH64_hash_t KXXH64(const void *input,
-                                     size_t length,
-                                     unsigned long long seed);
+KXXH_PUBLIC_API KXXH64_hash_t KXXH64 (const void* input, size_t length, unsigned long long seed);
 
 /*====== Streaming ======*/
-typedef struct KXXH64_state_s KXXH64_state_t; /* incomplete type */
-KXXH_PUBLIC_API KXXH64_state_t *KXXH64_createState(void);
-KXXH_PUBLIC_API KXXH_errorcode KXXH64_freeState(KXXH64_state_t *statePtr);
-KXXH_PUBLIC_API void KXXH64_copyState(KXXH64_state_t *dst_state,
-                                      const KXXH64_state_t *src_state);
+typedef struct KXXH64_state_s KXXH64_state_t; /* incomplete type */
+KXXH_PUBLIC_API KXXH64_state_t* KXXH64_createState(void);
+KXXH_PUBLIC_API KXXH_errorcode KXXH64_freeState(KXXH64_state_t* statePtr);
+KXXH_PUBLIC_API void KXXH64_copyState(KXXH64_state_t* dst_state, const KXXH64_state_t* src_state);
 
-KXXH_PUBLIC_API KXXH_errorcode KXXH64_reset(KXXH64_state_t *statePtr,
-                                            unsigned long long seed);
-KXXH_PUBLIC_API KXXH_errorcode KXXH64_update(KXXH64_state_t *statePtr,
-                                             const void *input,
-                                             size_t length);
-KXXH_PUBLIC_API KXXH64_hash_t KXXH64_digest(const KXXH64_state_t *statePtr);
+KXXH_PUBLIC_API KXXH_errorcode KXXH64_reset (KXXH64_state_t* statePtr, unsigned long long seed);
+KXXH_PUBLIC_API KXXH_errorcode KXXH64_update (KXXH64_state_t* statePtr, const void* input, size_t length);
+KXXH_PUBLIC_API KXXH64_hash_t KXXH64_digest (const KXXH64_state_t* statePtr);
 
 /*====== Canonical representation ======*/
-typedef struct {
-    unsigned char digest[8];
-} KXXH64_canonical_t;
-KXXH_PUBLIC_API void KXXH64_canonicalFromHash(KXXH64_canonical_t *dst,
-                                              KXXH64_hash_t hash);
-KXXH_PUBLIC_API KXXH64_hash_t
-KXXH64_hashFromCanonical(const KXXH64_canonical_t *src);
-#endif /* KXXH_NO_LONG_LONG */
+typedef struct { unsigned char digest[8]; } KXXH64_canonical_t;
+KXXH_PUBLIC_API void KXXH64_canonicalFromHash(KXXH64_canonical_t* dst, KXXH64_hash_t hash);
+KXXH_PUBLIC_API KXXH64_hash_t KXXH64_hashFromCanonical(const KXXH64_canonical_t* src);
+#endif /* KXXH_NO_LONG_LONG */
@@ -286,86 +247,81 @@ KXXH64_hashFromCanonical(const KXXH64_canonical_t *src);
 
 /* ================================================================================================
    This section contains declarations which are not guaranteed to remain stable.
-   They may change in future versions, becoming incompatible with a different
-version of the library. These declarations should only be used with static
-linking. Never use them in association with dynamic linking !
-===================================================================================================
-*/
+   They may change in future versions, becoming incompatible with a different version of the library.
+   These declarations should only be used with static linking.
+   Never use them in association with dynamic linking !
+=================================================================================================== */
 
 /* These definitions are only present to allow
 * static allocation of KXXH state, on stack or in a struct for example.
 * Never **ever** use members directly.
 */
-#if !defined(__VMS) && \
-    (defined(__cplusplus) || \
-     (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
-#include <stdint.h>
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
 
 struct KXXH32_state_s {
-    uint32_t total_len_32;
-    uint32_t large_len;
-    uint32_t v1;
-    uint32_t v2;
-    uint32_t v3;
-    uint32_t v4;
-    uint32_t mem32[4];
-    uint32_t memsize;
-    uint32_t reserved; /* never read nor write, might be removed in a future
-                          version */
-}; /* typedef'd to KXXH32_state_t */
+    uint32_t total_len_32;
+    uint32_t large_len;
+    uint32_t v1;
+    uint32_t v2;
+    uint32_t v3;
+    uint32_t v4;
+    uint32_t mem32[4];
+    uint32_t memsize;
+    uint32_t reserved; /* never read nor write, might be removed in a future version */
+}; /* typedef'd to KXXH32_state_t */
 
 struct KXXH64_state_s {
-    uint64_t total_len;
-    uint64_t v1;
-    uint64_t v2;
-    uint64_t v3;
-    uint64_t v4;
-    uint64_t mem64[4];
-    uint32_t memsize;
-    uint32_t reserved[2]; /* never read nor write, might be removed in a
-                             future version */
-}; /* typedef'd to KXXH64_state_t */
+    uint64_t total_len;
+    uint64_t v1;
+    uint64_t v2;
+    uint64_t v3;
+    uint64_t v4;
+    uint64_t mem64[4];
+    uint32_t memsize;
+    uint32_t reserved[2]; /* never read nor write, might be removed in a future version */
+}; /* typedef'd to KXXH64_state_t */
 
-#else
+# else
 
 struct KXXH32_state_s {
-    unsigned total_len_32;
-    unsigned large_len;
-    unsigned v1;
-    unsigned v2;
-    unsigned v3;
-    unsigned v4;
-    unsigned mem32[4];
-    unsigned memsize;
-    unsigned reserved; /* never read nor write, might be removed in a future
-                          version */
-}; /* typedef'd to KXXH32_state_t */
+    unsigned total_len_32;
+    unsigned large_len;
+    unsigned v1;
+    unsigned v2;
+    unsigned v3;
+    unsigned v4;
+    unsigned mem32[4];
+    unsigned memsize;
+    unsigned reserved; /* never read nor write, might be removed in a future version */
+}; /* typedef'd to KXXH32_state_t */
 
-#ifndef KXXH_NO_LONG_LONG /* remove 64-bit support */
+# ifndef KXXH_NO_LONG_LONG /* remove 64-bit support */
 struct KXXH64_state_s {
-    unsigned long long total_len;
-    unsigned long long v1;
-    unsigned long long v2;
-    unsigned long long v3;
-    unsigned long long v4;
-    unsigned long long mem64[4];
-    unsigned memsize;
-    unsigned reserved[2]; /* never read nor write, might be removed in a
-                             future version */
-}; /* typedef'd to KXXH64_state_t */
-#endif
+    unsigned long long total_len;
+    unsigned long long v1;
+    unsigned long long v2;
+    unsigned long long v3;
+    unsigned long long v4;
+    unsigned long long mem64[4];
+    unsigned memsize;
+    unsigned reserved[2]; /* never read nor write, might be removed in a future version */
+}; /* typedef'd to KXXH64_state_t */
+# endif
 
-#endif
+# endif
 
 #if defined(KXXH_INLINE_ALL) || defined(KXXH_PRIVATE_API)
-#include "rdxxhash.c" /* include xxhash function bodies as `static`, for inlining */
+# include "xxhash.c" /* include xxhash function bodies as `static`, for inlining */
 #endif
 
 #endif /* KXXH_STATIC_LINKING_ONLY */
 
 
-#if defined(__cplusplus)
+#if defined (__cplusplus)
 }
 #endif
diff --git a/src/third_party/librdkafka/dist/src/regexp.c b/src/third_party/librdkafka/dist/src/regexp.c
index 603546c4782..e9dba924698 100644
--- a/src/third_party/librdkafka/dist/src/regexp.c
+++ b/src/third_party/librdkafka/dist/src/regexp.c
@@ -95,16 +95,17 @@ static Rune canon(Rune c) {
 
 /* Scan */
 
-enum { L_CHAR = 256,
-       L_CCLASS,  /* character class */
-       L_NCCLASS, /* negative character class */
-       L_NC,      /* "(?:" no capture */
-       L_PLA,     /* "(?=" positive lookahead */
-       L_NLA,     /* "(?!" negative lookahead */
-       L_WORD,    /* "\b" word boundary */
-       L_NWORD,   /* "\B" non-word boundary */
-       L_REF,     /* "\1" back-reference */
-       L_COUNT    /* {M,N} */
+enum {
+        L_CHAR = 256,
+        L_CCLASS,  /* character class */
+        L_NCCLASS, /* negative character class */
+        L_NC,      /* "(?:" no capture */
+        L_PLA,     /* "(?=" positive lookahead */
+        L_NLA,     /* "(?!" negative lookahead */
+        L_WORD,    /* "\b" word boundary */
+        L_NWORD,   /* "\B" non-word boundary */
+        L_REF,     /* "\1" back-reference */
+        L_COUNT    /* {M,N} */
 };
 
 static int hex(Restate *g, int c) {
@@ -451,21 +452,23 @@ static int lex(Restate *g) {
 
 /* Parse */
 
-enum { P_CAT,
-       P_ALT,
-       P_REP,
-       P_BOL,
-       P_EOL,
-       P_WORD,
-       P_NWORD,
-       P_PAR,
-       P_PLA,
-       P_NLA,
-       P_ANY,
-       P_CHAR,
-       P_CCLASS,
-       P_NCCLASS,
-       P_REF };
+enum {
+        P_CAT,
+        P_ALT,
+        P_REP,
+        P_BOL,
+        P_EOL,
+        P_WORD,
+        P_NWORD,
+        P_PAR,
+        P_PLA,
+        P_NLA,
+        P_ANY,
+        P_CHAR,
+        P_CCLASS,
+        P_NCCLASS,
+        P_REF
+};
 
 struct Renode {
         unsigned char type;
@@ -662,23 +665,25 @@ static Renode *parsealt(Restate *g) {
 
 /* Compile */
 
-enum { I_END,
-       I_JUMP,
-       I_SPLIT,
-       I_PLA,
-       I_NLA,
-       I_ANYNL,
-       I_ANY,
-       I_CHAR,
-       I_CCLASS,
-       I_NCCLASS,
-       I_REF,
-       I_BOL,
-       I_EOL,
-       I_WORD,
-       I_NWORD,
-       I_LPAR,
-       I_RPAR };
+enum {
+        I_END,
+        I_JUMP,
+        I_SPLIT,
+        I_PLA,
+        I_NLA,
+        I_ANYNL,
+        I_ANY,
+        I_CHAR,
+        I_CCLASS,
+        I_NCCLASS,
+        I_REF,
+        I_BOL,
+        I_EOL,
+        I_WORD,
+        I_NWORD,
+        I_LPAR,
+        I_RPAR
+};
 
 struct Reinst {
         unsigned char opcode;
diff --git a/src/third_party/librdkafka/dist/src/snappy.h b/src/third_party/librdkafka/dist/src/snappy.h
index b3742f1ac5c..c366fb5aa6f 100644
--- a/src/third_party/librdkafka/dist/src/snappy.h
+++ b/src/third_party/librdkafka/dist/src/snappy.h
@@ -1,7 +1,7 @@
 /*
  * librdkafka - Apache Kafka C library
  *
- * Copyright (c) 2018 Magnus Edenhill
+ * Copyright (c) 2018-2022, Magnus Edenhill
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/src/third_party/librdkafka/dist/src/tinycthread_extra.c b/src/third_party/librdkafka/dist/src/tinycthread_extra.c
index 58049448cef..6f6d0a59576 100644
--- a/src/third_party/librdkafka/dist/src/tinycthread_extra.c
+++ b/src/third_party/librdkafka/dist/src/tinycthread_extra.c
@@ -1,7 +1,8 @@
 /*
  * librdkafka - Apache Kafka C library
  *
- * Copyright (c) 2018 Magnus Edenhill
+ * Copyright (c) 2018-2022, Magnus Edenhill
+ *               2025, Confluent Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -78,27 +79,41 @@ void cnd_wait_exit(cnd_t *cond) {
 
 
 int cnd_timedwait_ms(cnd_t *cnd, mtx_t *mtx, int timeout_ms) {
-        if (timeout_ms == -1 /* INFINITE*/)
+        int ret;
+        rd_ts_t abs_timeout;
+        rd_bool_t continue_timedwait = rd_true;
+
+        if (timeout_ms == RD_POLL_INFINITE)
                 return cnd_wait(cnd, mtx);
 #if defined(_TTHREAD_WIN32_)
         return _cnd_timedwait_win32(cnd, mtx, (DWORD)timeout_ms);
 #else
-        struct timeval tv;
-        struct timespec ts;
+        abs_timeout = rd_timeout_init(timeout_ms);
+        do {
+                struct timeval tv;
+                struct timespec ts;
 
-        gettimeofday(&tv, NULL);
-        ts.tv_sec  = tv.tv_sec;
-        ts.tv_nsec = tv.tv_usec * 1000;
+                gettimeofday(&tv, NULL);
+                ts.tv_sec  = tv.tv_sec;
+                ts.tv_nsec = tv.tv_usec * 1000;
 
-        ts.tv_sec += timeout_ms / 1000;
-        ts.tv_nsec += (timeout_ms % 1000) * 1000000;
+                ts.tv_sec += timeout_ms / 1000;
+                ts.tv_nsec += (timeout_ms % 1000) * 1000000;
 
-        if (ts.tv_nsec >= 1000000000) {
-                ts.tv_sec++;
-                ts.tv_nsec -= 1000000000;
-        }
+                if (ts.tv_nsec >= 1000000000) {
+                        ts.tv_sec++;
+                        ts.tv_nsec -= 1000000000;
+                }
 
-        return cnd_timedwait(cnd, mtx, &ts);
+                ret = cnd_timedwait(cnd, mtx, &ts);
+                continue_timedwait = ret == thrd_timedout;
+                if (continue_timedwait) {
+                        timeout_ms = rd_timeout_remains(abs_timeout);
+                        if (rd_timeout_expired(timeout_ms))
+                                continue_timedwait = rd_false;
+                }
+        } while (continue_timedwait);
+        return ret;
 #endif
 }
 
@@ -113,13 +128,22 @@ int cnd_timedwait_msp(cnd_t *cnd, mtx_t *mtx, int *timeout_msp) {
         return r;
 }
 
-int cnd_timedwait_abs(cnd_t *cnd, mtx_t *mtx, const struct timespec *tspec) {
-        if (tspec->tv_sec == RD_POLL_INFINITE)
+int cnd_timedwait_abs(cnd_t *cnd, mtx_t *mtx, rd_ts_t abs_timeout) {
+        int r = thrd_timedout;
+        int timeout_ms;
+        if (abs_timeout == RD_POLL_INFINITE)
                 return cnd_wait(cnd, mtx);
-        else if (tspec->tv_sec == RD_POLL_NOWAIT)
+        else if (abs_timeout == RD_POLL_NOWAIT)
                 return thrd_timedout;
 
-        return cnd_timedwait(cnd, mtx, tspec);
+        do {
+                timeout_ms = rd_timeout_remains(abs_timeout);
+                if (timeout_ms == RD_POLL_NOWAIT)
+                        break;
+                r = cnd_timedwait_ms(cnd, mtx, timeout_ms);
+        } while (r == thrd_timedout);
+
+        return r;
 }
diff --git a/src/third_party/librdkafka/dist/src/tinycthread_extra.h b/src/third_party/librdkafka/dist/src/tinycthread_extra.h
index e5f6731739d..cb6b611ea7e 100644
--- a/src/third_party/librdkafka/dist/src/tinycthread_extra.h
+++ b/src/third_party/librdkafka/dist/src/tinycthread_extra.h
@@ -1,7 +1,8 @@
 /*
  * librdkafka - Apache Kafka C library
  *
- * Copyright (c) 2018 Magnus Edenhill
+ * Copyright (c) 2018-2022, Magnus Edenhill
+ *               2025, Confluent Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -40,6 +41,7 @@
 #include <pthread.h> /* needed for rwlock_t */
 #endif
 
+#include "rdtypes.h"
 
 /**
 * @brief Set thread system name if platform supports it (pthreads)
@@ -83,13 +85,15 @@ int cnd_timedwait_ms(cnd_t *cnd, mtx_t *mtx, int timeout_ms);
 int cnd_timedwait_msp(cnd_t *cnd, mtx_t *mtx, int *timeout_msp);
 
 /**
- * @brief Same as cnd_timedwait() but honours
+ * @brief Same as cnd_timedwait() but takes an absolute timeout in microseconds.
+ *        Honours
 *        RD_POLL_INFINITE (uses cnd_wait()),
 *        and RD_POLL_NOWAIT (return thrd_timedout immediately).
 *
- * @remark Set up \p tspec with rd_timeout_init_timespec().
+ * @remark Set up \p abs_timeout with rd_timeout_init() or
+ *         rd_timeout_init_us().
 */
-int cnd_timedwait_abs(cnd_t *cnd, mtx_t *mtx, const struct timespec *tspec);
+int cnd_timedwait_abs(cnd_t *cnd, mtx_t *mtx, rd_ts_t abs_timeout);
diff --git a/src/third_party/librdkafka/dist/src/win32_config.h b/src/third_party/librdkafka/dist/src/win32_config.h
index dd61b2c92f1..e1b416ba3cf 100644
--- a/src/third_party/librdkafka/dist/src/win32_config.h
+++ b/src/third_party/librdkafka/dist/src/win32_config.h
@@ -1,7 +1,7 @@
 /*
  * librdkafka - Apache Kafka C library
  *
- * Copyright (c) 2012-2015 Magnus Edenhill
+ * Copyright (c) 2012-2022, Magnus Edenhill
  * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
diff --git a/src/third_party/librdkafka/scripts/librdkafka_get_sources.sh b/src/third_party/librdkafka/scripts/librdkafka_get_sources.sh
index f6b46ceaf88..1c6b1c854a7 100755
--- a/src/third_party/librdkafka/scripts/librdkafka_get_sources.sh
+++ b/src/third_party/librdkafka/scripts/librdkafka_get_sources.sh
@@ -18,7 +18,7 @@ TEMP_DIR=$(mktemp -d /tmp/librdkafka.XXXXXX)
 DEST_DIR=$(git rev-parse --show-toplevel)/src/third_party/librdkafka
 DIST_DIR=$DEST_DIR/dist
 PLATFORM_DIR=$DIST_DIR/platform
-VERSION="2.0.2"
+VERSION="2.11.0"
 
 # Clean the output directories
 rm -rf $DIST_DIR
@@ -28,7 +28,7 @@ rm -rf $TEMP_DIR/*
 pushd $TEMP_DIR
 
-# Clone the v2.0.2 branch of librdkafka.
-git clone --depth 1 --branch v2.0.2 https://github.com/confluentinc/librdkafka.git
+# Clone the v2.11.0 branch of librdkafka.
+git clone --depth 1 --branch v2.11.0 https://github.com/confluentinc/librdkafka.git
 
 pushd librdkafka
 
@@ -53,8 +53,9 @@ pushd src
 # Replace all instances of the string "LZ4" and "XXH" with "KLZ4" and "KXXH" in the C source code.
 # This is to avoid symbol conflicts with the LZ4 and XXH source that is used by
 # third_party/mozjs.
-sed -i 's/LZ4/KLZ4/g' *
-sed -i 's/XXH/KXXH/g' *
+find . -type f -exec sed -i 's/LZ4/KLZ4/g' {} +
+find . -type f -exec sed -i 's/XXH/KXXH/g' {} +
+find . -type f -exec sed -i 's/read_long_length_no_check/kread_long_length_no_check/g' {} +
 
 popd
 
 mkdir -p $DIST_DIR
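
The rdxxhash.h declarations in the patch above preserve upstream xxHash's two calling styles: a one-shot entry point and a streaming state machine. As a minimal sketch of how the renamed KXXH API is driven (not part of the vendored sources; the include path and the zero seed are illustrative assumptions), hashing a buffer whole and in segments should produce the same value:

#include <stdio.h>
#include <string.h>
#include "rdxxhash.h"

int main(void) {
        const char buf[] = "librdkafka message payload";
        const size_t len = strlen(buf);

        /* One-shot hashing: preferred for small inputs. */
        KXXH64_hash_t whole = KXXH64(buf, len, 0);

        /* Streaming: same result, input fed in two segments. */
        KXXH64_state_t *st = KXXH64_createState();
        if (st == NULL)
                return 1;
        if (KXXH64_reset(st, 0) != KXXH_OK ||
            KXXH64_update(st, buf, len / 2) != KXXH_OK ||
            KXXH64_update(st, buf + len / 2, len - len / 2) != KXXH_OK) {
                KXXH64_freeState(st);
                return 1;
        }
        KXXH64_hash_t pieces = KXXH64_digest(st);
        KXXH64_freeState(st);

        printf("%016llx %016llx\n", whole, pieces);
        return whole == pieces ? 0 : 1;
}

The header's own guidance applies: the streaming path pays for its state management, so for small inputs the single-call functions are the better choice.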
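
The canonical-representation helpers declared in the header exist so a hash can leave the process as bytes: KXXH64_canonicalFromHash() always writes big-endian, and KXXH64_hashFromCanonical() reads the value back identically on any platform. A sketch of that round trip, under the same illustrative assumptions as above:

#include <assert.h>
#include <string.h>
#include "rdxxhash.h"

void canonical_round_trip(KXXH64_hash_t h) {
        KXXH64_canonical_t canon;
        unsigned char wire[8];

        /* Big-endian byte form: safe to write to a file or socket. */
        KXXH64_canonicalFromHash(&canon, h);
        memcpy(wire, canon.digest, sizeof(wire));

        /* Reading it back yields the original value regardless of
         * the endianness of the writer or the reader. */
        KXXH64_canonical_t in;
        memcpy(in.digest, wire, sizeof(in.digest));
        assert(KXXH64_hashFromCanonical(&in) == h);
}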
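
The exposed KXXH32_state_s/KXXH64_state_s layouts serve exactly one purpose, per the comment in the header: static allocation of the state, with members never touched directly. A sketch of stack allocation, assuming KXXH_STATIC_LINKING_ONLY is defined before the include and the code is statically linked against the vendored sources:

#define KXXH_STATIC_LINKING_ONLY /* exposes the state struct layout */
#include <stddef.h>
#include "rdxxhash.h"

unsigned long long hash_on_stack(const void *p, size_t n) {
        KXXH64_state_t st;    /* on the stack; no createState()/freeState() */
        KXXH64_reset(&st, 0); /* reset is still required to initialize it */
        KXXH64_update(&st, p, n);
        return KXXH64_digest(&st);
}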
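
The reworked cnd_timedwait_ms()/cnd_timedwait_abs() in the tinycthread_extra.c hunk share one idea: convert the caller's timeout into an absolute deadline, and whenever the underlying wait reports a timeout while the deadline has not actually passed, re-arm and wait again, so the caller only ever observes thrd_timedout once the full deadline has elapsed. librdkafka's rd_timeout_init()/rd_timeout_remains()/rd_timeout_expired() helpers are internal, so the sketch below restates the pattern with standard C11 <threads.h>; the wait_until_deadline() helper and its predicate are hypothetical, not librdkafka API:

#include <stdbool.h>
#include <threads.h>
#include <time.h>

/* Wait on cnd (mtx held by the caller) until done(arg) is true or an
 * absolute TIME_UTC deadline passes. Spurious wakeups and early
 * thrd_timedout returns both re-arm the wait, mirroring the retry
 * loop in the patch above. */
static int wait_until_deadline(cnd_t *cnd, mtx_t *mtx,
                               const struct timespec *deadline,
                               bool (*done)(void *), void *arg) {
        while (!done(arg)) {
                int r = cnd_timedwait(cnd, mtx, deadline);
                if (r == thrd_timedout) {
                        struct timespec now;
                        timespec_get(&now, TIME_UTC);
                        /* Give up only once the deadline truly passed. */
                        if (now.tv_sec > deadline->tv_sec ||
                            (now.tv_sec == deadline->tv_sec &&
                             now.tv_nsec >= deadline->tv_nsec))
                                return thrd_timedout;
                } else if (r != thrd_success) {
                        return r; /* thrd_error */
                }
        }
        return thrd_success;
}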