<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:00:54 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13393] t10crc4K/512 algorithm in rhel8.1 kernel is slower than rhel7.7</title>
                <link>https://jira.whamcloud.com/browse/LU-13393</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;t10crc4K/512 algorithm in rhel8.1 kernel is slower than rhel7.7&lt;/p&gt;

&lt;p&gt;The performance with T10PI checksum algorithm of t10crc4K/512 in rhel8.1 kernel is broken.&lt;br/&gt;
 If client is running with rhel8.1 kernel and enabled t10crc4K/512 checksum, that client performance is much slower than rhel7.7 kernel with enabling same t10crc4K/512 checksum.&lt;br/&gt;
 Here is test configuration and results.&lt;/p&gt;

&lt;p&gt;Configuration&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;1 x client
1 x Platinum 8160, 96GB memory, 1 x IB-EDR
(lctl set_param osc.*.max_pages_per_rpc=16M osc.*.max_rpcs_in_flight=16 osc.*.max_dirty_mb=512 llite.*.max_read_ahead_mb=2048 osc.*.checksum_type=t10crc4K)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Test result on RHEL7.7 (3.10.0-1062.el7.x86_64)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PPN=1
mpirun  --allow-run-as-root -np 1 ior -w -r -t 1m -b 256g -e -F -o /testfs/s/file
Max Write: 1981.81 MiB/sec (2078.07 MB/sec)
Max Read:  2685.01 MiB/sec (2815.44 MB/sec)

PPN=16
mpirun  --allow-run-as-root -np 16 ior -w -r -t 1m -b 16g -e -F -o /testfs/file
Max Write: 9887.55 MiB/sec (10367.84 MB/sec)
Max Read:  11212.37 MiB/sec (11757.03 MB/sec)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Test result on RHEL8.1 (4.18.0-147.el8.x86_64)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PPN=1
mpirun  --allow-run-as-root -np 1 ior -w -r -t 1m -b 256g -e -F -o /testfs/s/file
Max Write: 1703.20 MiB/sec (1785.94 MB/sec)
Max Read:  758.24 MiB/sec (795.07 MB/sec)

PPN=16
mpirun  --allow-run-as-root -np 16 ior -w -r -t 1m -b 16g -e -F -o /testfs/file
Max Write: 6741.36 MiB/sec (7068.83 MB/sec)
Max Read:  5821.17 MiB/sec (6103.94 MB/sec)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Even the algorithm performance test indicates that the t10crc4K/512 algorithm in rhel8.1 is slow compared with the rhel7.7 kernel. (30x slower.)&lt;/p&gt;

&lt;p&gt;RHEL7.7 (3.10.0-1062.el7.x86_64)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;obd_t10_performance_test() T10 checksum algorithm t10ip512 speed = 13015 MB/s
obd_t10_performance_test() T10 checksum algorithm t10ip4K speed = 16855 MB/s
obd_t10_performance_test() T10 checksum algorithm t10crc512 speed = 2551 MB/s
obd_t10_performance_test() T10 checksum algorithm t10crc4K speed = 9231 MB/s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;RHEL8.1 (4.18.0-147.el8.x86_64)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;obd_t10_performance_test() T10 checksum algorithm t10ip512 speed = 13395 MB/s
obd_t10_performance_test() T10 checksum algorithm t10ip4K speed = 19267 MB/s
obd_t10_performance_test() T10 checksum algorithm t10crc512 speed = 339 MB/s
obd_t10_performance_test() T10 checksum algorithm t10crc4K speed = 342 MB/s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>master, rhel8.1 (4.18.0-147.el8.x86_64)</environment>
        <key id="58527">LU-13393</key>
            <summary>t10crc4K/512 algorithm in rhel8.1 kernel is slower than rhel7.7</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="2">Won&apos;t Fix</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="sihara">Shuichi Ihara</reporter>
                        <labels>
                    </labels>
                <created>Thu, 26 Mar 2020 16:37:12 +0000</created>
                <updated>Thu, 30 Apr 2020 13:53:07 +0000</updated>
                            <resolved>Thu, 30 Apr 2020 13:53:07 +0000</resolved>
                                    <version>Lustre 2.14.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="266239" author="lixi_wc" created="Fri, 27 Mar 2020 07:45:40 +0000"  >&lt;p&gt;When benchmarking the performance, Lustre uses crc_t10dif() function to calculate the checksum of t10crc512/t10crc4K. Comparing the performance of t10crc* and t10ip*, looks like crc_t10dif() is much slower than expected. There must be something wrong with it.&lt;/p&gt;</comment>
                            <comment id="266241" author="lixi_wc" created="Fri, 27 Mar 2020 08:17:46 +0000"  >&lt;p&gt;Ihara found that on RHEL7 there is a kernel module crc_t10dif, but on RHEL8 it is gone.&lt;/p&gt;

&lt;p&gt;RHEL7:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@es400nv-vm1 ~]# lsmod | grep crc
crc32_pclmul 13133 0
crc_t10dif 12912 2 obdclass,sd_mod
crct10dif_generic 12647 0
crct10dif_pclmul 14307 1
crct10dif_common 12595 3 crct10dif_pclmul,crct10dif_generic,crc_t10dif
crc32c_intel 22094 0

[root@es400nv-vm1 ~]# grep -i crct10 /boot/config-*
CONFIG_CRYPTO_CRCT10DIF=m
CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;RHEL8:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@ec01 ~]# lsmod | grep crc
crct10dif_pclmul 16384 0
crc32_pclmul 16384 0
crc32c_intel 24576 4

[root@ec01 ~]# grep -i crct10 /boot/config-4.18.0-147.5.1.el8_1.x86_64
CONFIG_CRYPTO_CRCT10DIF=y
CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I think changing crc_t10dif from a loadable module to being built into the kernel is the root cause. When the crc_t10dif module is inserted, crc_t10dif_mod_init() will choose the quickest crct10dif algorithm. On RHEL7.7, since it can be inserted after crct10dif_pclmul, it chooses the quicker one, crct10dif_pclmul, not crct10dif_common.&lt;/p&gt;

&lt;p&gt;However, on RHEL8, crc_t10dif_mod_init() is called too early, so it has no choice but use the slow one.&lt;/p&gt;
</comment>
                            <comment id="266242" author="lixi_wc" created="Fri, 27 Mar 2020 08:20:49 +0000"  >&lt;p&gt;I think two things need to be done:&lt;/p&gt;

&lt;p&gt;1) Send a patch or create a ticket on Redhat so they can change CONFIG_CRYPTO_CRCT10DIF back to module. The crc_t10dif needs to have the capability to be removed and inserted later, because other modules might want to register better quicker algorithms of crc_t10dif.&lt;/p&gt;

&lt;p&gt;2) Change Lustre codes to select the current quickest algorithm of crc_t10dif by itself rather than calling crc_t10dif() function. That will solve our problem of Lustre clients on all kernels.&lt;/p&gt;</comment>
                            <comment id="266316" author="dongyang" created="Mon, 30 Mar 2020 04:10:43 +0000"  >&lt;p&gt;I think a easy fix would be changing our kernel config to either&lt;/p&gt;

&lt;p&gt;CONFIG_CRYPTO_CRCT10DIF_PCLMUL=y or&#160;CONFIG_CRYPTO_CRCT10DIF back to m.&lt;/p&gt;

&lt;p&gt;since the lustre patched kernel needs to be built anyway&lt;/p&gt;</comment>
                            <comment id="266317" author="lixi_wc" created="Mon, 30 Mar 2020 04:44:55 +0000"  >&lt;blockquote&gt;&lt;p&gt;I think a easy fix would be changing our kernel config to either&lt;/p&gt;

&lt;p&gt;CONFIG_CRYPTO_CRCT10DIF_PCLMUL=y or CONFIG_CRYPTO_CRCT10DIF back to m.&lt;/p&gt;

&lt;p&gt;since the lustre patched kernel needs to be built anyway&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I&apos;m not sure that is enough. This is a client-side problem. And we can&apos;t control the kernel of Lustre clients.&lt;/p&gt;</comment>
                            <comment id="266368" author="dongyang" created="Mon, 30 Mar 2020 22:08:16 +0000"  >&lt;p&gt;yes this is a client side problem as well, that part we have no control of.&lt;/p&gt;

&lt;p&gt;however the server side kernel config is maintained by us, at least we should fix that.&lt;/p&gt;</comment>
                            <comment id="268937" author="adilger" created="Thu, 30 Apr 2020 03:24:22 +0000"  >&lt;p&gt;In theory, even for the client, we could build and insert a replacement t10crc module as part of the Lustre client that installs an &quot;accelerated&quot; checksum code to be used, even if it is just the same kernel code rebuilt with some small name changes?  That would avoid patching the kernel, and it could be dropped once RHEL8 is fixed (unless it is already fixed in RHEL8.2, and we should just use that).&lt;/p&gt;</comment>
                            <comment id="268959" author="sihara" created="Thu, 30 Apr 2020 08:24:34 +0000"  >&lt;p&gt;Andreas, I was trying your suggestion, but there is good news that Redhat already fixed the t10crc problem in the latest RHEL8.1 updated kernel 4.18.0-147.8.1.el8_1.x86_64.&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;kernel: T10 CRC not using hardware-accelerated version from crct10dif_pclmul (BZ#1797961)&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;&lt;a href=&quot;https://access.redhat.com/errata/RHSA-2020:1372&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://access.redhat.com/errata/RHSA-2020:1372&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I just checked checksum speed with t10crc algorithms and confirmed t10crc works well as expected.&lt;/p&gt;

&lt;p&gt;4.18.0-147.5.1.el8_1.x86_64&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;T10 checksum algorithm t10ip512 speed = 20811 MB/s
T10 checksum algorithm t10ip4K speed = 21971 MB/s
T10 checksum algorithm t10crc512 speed = 329 MB/s
T10 checksum algorithm t10crc4K speed = 333 MB/s
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;4.18.0-147.8.1.el8_1.x86_64&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;T10 checksum algorithm t10ip512 speed = 20727 MB/s
T10 checksum algorithm t10ip4K speed = 21967 MB/s
T10 checksum algorithm t10crc512 speed = 9215 MB/s
T10 checksum algorithm t10crc4K speed = 15647 MB/s
&#160;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="268977" author="pjones" created="Thu, 30 Apr 2020 13:53:07 +0000"  >&lt;p&gt;I think that we can just close this as Will Not Fix then because it will certainly be fixed in RHEL 8.2 too&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="58512">LU-13391</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00wg7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>