<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:30:43 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-9949] lolnd broken</title>
                <link>https://jira.whamcloud.com/browse/LU-9949</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;It looks like commit 0f1aaad4c1b4447ee5097b8bb79a49d09eaa23c2 (&quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9480&quot; title=&quot;LNet Dynamic Discovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9480&quot;&gt;&lt;del&gt;LU-9480&lt;/del&gt;&lt;/a&gt; lnet: implement Peer Discovery&quot;) broke lolnd, as suggested by git bisect.&lt;/p&gt;

&lt;p&gt;This manifests in, e.g., sanity test 101b hanging with this in the logs:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[  215.914245] Lustre: DEBUG MARKER: == sanity test 101b: check stride-io mode read-ahead ================================================= 01:32:15 (1504675935)
[  215.985320] Lustre: lfs: using old ioctl(LL_IOC_LOV_GETSTRIPE) on [0x200000401:0x5:0x0], use llapi_layout_get_by_path()
[  256.717500] LNet: Service thread pid 4032 was inactive for 40.01s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[  256.720328] Pid: 4032, comm: ll_ost_io00_002
[  256.721561] 
Call Trace:
[  256.723391]  [&amp;lt;ffffffff81704339&amp;gt;] schedule+0x29/0x70
[  256.724533]  [&amp;lt;ffffffff81700972&amp;gt;] schedule_timeout+0x162/0x2a0
[  256.725651]  [&amp;lt;ffffffff810879f0&amp;gt;] ? process_timeout+0x0/0x10
[  256.726859]  [&amp;lt;ffffffffa0534e3e&amp;gt;] target_bulk_io+0x4ee/0xb20 [ptlrpc]
[  256.729276]  [&amp;lt;ffffffff810b7ce0&amp;gt;] ? default_wake_function+0x0/0x20
[  256.730431]  [&amp;lt;ffffffffa05ddf08&amp;gt;] tgt_brw_read+0xf38/0x1870 [ptlrpc]
[  256.731359]  [&amp;lt;ffffffffa01ba4a4&amp;gt;] ? libcfs_log_return+0x24/0x30 [libcfs]
[  256.732387]  [&amp;lt;ffffffffa0579f90&amp;gt;] ? lustre_pack_reply_v2+0x1a0/0x2a0 [ptlrpc]
[  256.733578]  [&amp;lt;ffffffffa0532800&amp;gt;] ? target_bulk_timeout+0x0/0xb0 [ptlrpc]
[  256.734845]  [&amp;lt;ffffffffa057a102&amp;gt;] ? lustre_pack_reply_flags+0x72/0x1f0 [ptlrpc]
[  256.736719]  [&amp;lt;ffffffffa057a291&amp;gt;] ? lustre_pack_reply+0x11/0x20 [ptlrpc]
[  256.737931]  [&amp;lt;ffffffffa05dad2b&amp;gt;] tgt_request_handle+0x93b/0x1390 [ptlrpc]
[  256.738981]  [&amp;lt;ffffffffa05853b1&amp;gt;] ptlrpc_server_handle_request+0x251/0xae0 [ptlrpc]
[  256.740764]  [&amp;lt;ffffffffa0589168&amp;gt;] ptlrpc_main+0xa58/0x1df0 [ptlrpc]
[  256.741800]  [&amp;lt;ffffffff81706487&amp;gt;] ? _raw_spin_unlock_irq+0x27/0x50
[  256.742938]  [&amp;lt;ffffffffa0588710&amp;gt;] ? ptlrpc_main+0x0/0x1df0 [ptlrpc]
[  256.743943]  [&amp;lt;ffffffff810a2eda&amp;gt;] kthread+0xea/0xf0
[  256.744963]  [&amp;lt;ffffffff810a2df0&amp;gt;] ? kthread+0x0/0xf0
[  256.745913]  [&amp;lt;ffffffff8170fbd8&amp;gt;] ret_from_fork+0x58/0x90
[  256.746933]  [&amp;lt;ffffffff810a2df0&amp;gt;] ? kthread+0x0/0xf0

[  256.748798] LustreError: dumping log to /tmp/lustre-log.1504675975.4032
[  269.494952] LustreError: 2624:0:(events.c:449:server_bulk_callback()) event type 5, status -5, desc ffff8800720b3e00
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Easy to reproduce; just run this on a single node: ONLY=101 REFORMAT=yes sh sanity.sh&lt;/p&gt;</description>
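                <!--
                The description credits git bisect with identifying commit 0f1aaad4c1b4447ee5097b8bb79a49d09eaa23c2.
                A minimal sketch of such a bisect run, assuming a lustre-release checkout; the known-good tag,
                build line, and 30-minute timeout are assumptions, not taken from this ticket:

                    git bisect start
                    git bisect bad HEAD
                    git bisect good v2_10_0   # hypothetical last-known-good tag
                    git bisect run sh -c '
                        # exit 125 skips commits that fail to build
                        sh autogen.sh && ./configure && make -j8 || exit 125
                        # the timeout converts the test hang into a failure
                        timeout 1800 env ONLY=101 REFORMAT=yes sh sanity.sh
                    '
                -->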
                <environment></environment>
        <key id="48146">LU-9949</key>
            <summary>lolnd broken</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="green">Oleg Drokin</reporter>
                        <labels>
                    </labels>
                <created>Wed, 6 Sep 2017 06:30:01 +0000</created>
                <updated>Sun, 10 Feb 2019 00:12:07 +0000</updated>
                            <resolved>Wed, 22 Aug 2018 17:48:54 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="207515" author="green" created="Wed, 6 Sep 2017 06:31:22 +0000"  >&lt;p&gt;This totally breaks significant chunk of my testing with 100% failure rate. was not initially noticed because I am not well equipped to catch such hangs yet.&lt;/p&gt;</comment>
                            <comment id="207559" author="pjones" created="Wed, 6 Sep 2017 11:53:12 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;Can you please investigate?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="207838" author="ashehata" created="Fri, 8 Sep 2017 00:10:21 +0000"  >&lt;p&gt;I ran 101b by itself, and I&apos;m seeing the following logs:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep  7 19:55:58 centos6-16 kernel: Lustre: lustre-OST0001: new disk, initializing
Sep  7 19:55:58 centos6-16 kernel: Lustre: srv-lustre-OST0001: No data found on store. Initialize space
Sep  7 19:55:58 centos6-16 kernel: Lustre: lustre-MDT0000: Connection restored to 192.168.10.226@tcp (at 192.168.10.226@tcp)
Sep  7 19:55:58 centos6-16 kernel: Lustre: lustre-OST0001: Imperative Recovery not enabled, recovery window 60-180
Sep  7 19:55:59 centos6-16 kernel: LustreError: 4252:0:(obd_config.c:1431:class_process_proc_param()) lustre-clilov-ffff88007647f800: error writing proc &apos;stripecount=-1&apos;=&apos;-1&apos;: rc = -34
Sep  7 19:56:01 centos6-16 kernel: Lustre: lustre-OST0000: Connection restored to 192.168.10.226@tcp (at 192.168.10.226@tcp)
Sep  7 19:56:01 centos6-16 rsyslogd-2013: fopen() failed: &apos;Read-only file system&apos;, path: &apos;/var/lib/rsyslog/imjournal.state.tmp&apos;#012 [try http://www.rsyslog.com/e/2013 ]
Sep  7 19:56:06 centos6-16 kernel: Lustre: lustre-MDT0000: Connection restored to 192.168.10.226@tcp (at 192.168.10.226@tcp)
Sep  7 19:56:06 centos6-16 kernel: Lustre: Skipped 2 previous similar messages
Sep  7 19:56:06 centos6-16 kernel: Lustre: Mounted lustre-client
Sep  7 19:56:08 centos6-16 kernel: Lustre: DEBUG MARKER: Using TIMEOUT=20
Sep  7 19:56:15 centos6-16 kernel: LNetError: 2602:0:(lib-move.c:2395:lnet_parse_ack()) 192.168.10.226@tcp: Dropping ACK from 12345-192.168.10.226@tcp to invalid MD 0x14e239980d506816.0x18e5
Sep  7 19:56:15 centos6-16 kernel: LNetError: 2602:0:(lib-move.c:2800:lnet_parse()) 192.168.10.226@tcp, src 192.168.10.226@tcp: Dropping ACK. parse_local failed: -2
Sep  7 19:56:15 centos6-16 kernel: Lustre: DEBUG MARKER: excepting tests: 76 407 253 312 42a 42b 42c 45 68b
Sep  7 19:56:15 centos6-16 kernel: Lustre: DEBUG MARKER: skipping tests SLOW=no: 27m 64b 68 71 115 300o
Sep  7 19:56:16 centos6-16 kernel: Lustre: DEBUG MARKER: == sanity test 101b: check stride-io mode read-ahead ================================================= 19:56:16 (1504828576)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;There appears to be a dropped ACK because there is no valid MD. Prior to that, there is a failure:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 4252:0:(obd_config.c:1431:class_process_proc_param()) lustre-clilov-ffff88007647f800: error writing proc &apos;stripecount=-1&apos;=&apos;-1&apos;: rc = -34
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The error is ERANGE. Is this expected in that test?&lt;/p&gt;

&lt;p&gt;LNet stats:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;    - net type: tcp
      local NI(s):
        - nid: 192.168.10.226@tcp
          status: up
          statistics:
              send_count: 809
              recv_count: 808
              drop_count: 1
          sent_stats:
              put: 777
              get: 32
              reply: 0
              ack: 0
              hello: 0
          received_stats:
              put: 758
              get: 16
              reply: 16
              ack: 18
              hello: 0
          dropped_stats:
              put: 0
              get: 0
              reply: 0
              ack: 1
              hello: 0
          tunables:
              peer_timeout: 180
              peer_credits: 8
              peer_buffer_credits: 0
              credits: 256
          lnd tunables:
          tcp bonding: 0
          dev cpt: -1
          CPT: &quot;[0]&quot;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The only issue the stats seem to point to is the dropped ACK.&lt;/p&gt;

&lt;p&gt;Please note that this is on the socklnd and not the lolnd. There appear to be no problems on the lolnd.&lt;/p&gt;</comment>
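                            <!--
                            The statistics block in the previous comment matches the YAML that lnetctl emits.
                            A short sketch of how those counters can be gathered while chasing the dropped ACK;
                            lnetctl and its net show subcommand exist in Multi-Rail LNet, while the watch/grep
                            filter is only an illustration:

                                # dump per-NI send/receive/drop counters, tunables and CPT placement
                                lnetctl net show -v

                                # re-run the reproducer while watching only the drop counters
                                watch -n1 'lnetctl net show -v | grep -A5 dropped_stats'
                            -->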
                            <comment id="207875" author="jhammond" created="Fri, 8 Sep 2017 13:21:49 +0000"  >&lt;p&gt;&amp;gt; Prior to that there is a failure&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 4252:0:(obd_config.c:1431:class_process_proc_param()) lustre-clilov-ffff88007647f800: error writing proc &apos;stripecount=-1&apos;=&apos;-1&apos;: rc = -34
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&amp;gt; error is ERANGE. Is this expected in that test?&lt;/p&gt;

&lt;p&gt;This is fine.&lt;/p&gt;</comment>
                            <comment id="207974" author="green" created="Sun, 10 Sep 2017 04:50:16 +0000"  >&lt;p&gt;I believe that lnet is smart enough to use lolnd when it notices that the target nid is the same as its own.&lt;/p&gt;</comment>
                            <comment id="208319" author="jhammond" created="Wed, 13 Sep 2017 21:46:54 +0000"  >&lt;p&gt;Simplified reproducer (on a single Oleg test node):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;dd if=/dev/zero of=/mnt/lustre/f1 bs=1M count=1
lctl set_param ldlm.namespaces.*-osc-*.lru_size=clear
lctl set_param osd-ldiskfs.*.read_cache_enable=0
dd if=/mnt/lustre/f1 of=/dev/null bs=1M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
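                            <!--
                            A sketch that wraps the simplified reproducer above into a pass/fail check, so the
                            hang shows up as a timeout rather than a stuck shell; the 60-second limit and the
                            mount point are assumptions:

                                #!/bin/sh
                                dd if=/dev/zero of=/mnt/lustre/f1 bs=1M count=1
                                lctl set_param ldlm.namespaces.*-osc-*.lru_size=clear
                                lctl set_param osd-ldiskfs.*.read_cache_enable=0
                                # on affected builds the read hangs in target_bulk_io()
                                if timeout 60 dd if=/mnt/lustre/f1 of=/dev/null bs=1M; then
                                    echo "PASS: bulk read completed"
                                else
                                    echo "FAIL: bulk read hung or failed"
                                fi
                            -->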
                            <comment id="208332" author="simmonsja" created="Thu, 14 Sep 2017 01:26:03 +0000"  >&lt;p&gt;I&apos;m seeing this from time to time which is preventing me from mounting a test file system.&lt;/p&gt;

&lt;p&gt;00000020:00000080:4.0:1505255706.626213:0:6697:0:(obd_config.c:431:class_attach()) OBD: dev 1 attached type mgc with refcount 1&lt;br/&gt;
00000020:00000080:4.0:1505255706.626217:0:6697:0:(obd_config.c:1138:class_process_config()) processing cmd: cf003&lt;br/&gt;
00000100:00000100:5.0:1505255706.627789:0:6697:0:(client.c:96:ptlrpc_uuid_to_connection()) cannot find peer MGC10.37.248.196@o2ib1_0!&lt;br/&gt;
00010000:00080000:5.0:1505255706.627791:0:6697:0:(ldlm_lib.c:74:import_set_conn()) can&apos;t find connection MGC10.37.248.196@o2ib1_0&lt;br/&gt;
00010000:00020000:5.0:1505255706.627793:0:6697:0:(ldlm_lib.c:483:client_obd_setup()) can&apos;t add initial connection&lt;br/&gt;
00000020:00000080:1.0:1505255706.638190:0:6711:0:(genops.c:1204:class_import_destroy()) destroying import ffff88101ff0a000 for MGC10.37.248.196@o2ib1&lt;br/&gt;
00000020:00020000:6.0:1505255706.639014:0:6697:0:(obd_config.c:563:class_setup()) setup MGC10.37.248.196@o2ib1 failed (-2)&lt;br/&gt;
00000020:00020000:6.0:1505255706.650104:0:6697:0:(obd_mount.c:202:lustre_start_simple()) MGC10.37.248.196@o2ib1 setup error -2&lt;br/&gt;
00000020:00000080:6.0:1505255706.661495:0:6697:0:(obd_config.c:1138:class_process_config()) processing cmd: cf002&lt;/p&gt;

&lt;p&gt;Is this the same bug, or should I open another ticket?&lt;/p&gt;</comment>
                            <comment id="208334" author="ashehata" created="Thu, 14 Sep 2017 02:18:47 +0000"  >&lt;p&gt;The problem happens when  read_cache_enable=0. If read_cache_enable=1 then the test passes (both 101b and John&apos;s reproducer)&lt;/p&gt;

&lt;p&gt;In the case when read_cache_enable=0, generic_error_remove_page() is called for each page in osd_read_prep(). Looking at this function, it unmaps the pages. This seems to cause a problem on Oleg&apos;s setup: after the pages are unmapped, the socklnd does not seem to complete the bulk transfer. The server appears to call the kernel API to send the page, but the client socklnd (which is common with the server, since it&apos;s the same node) never receives the data.&lt;/p&gt;

&lt;p&gt;I haven&apos;t been able to reproduce this issue on my local VM. Oleg&apos;s VM is running a debug Kernel, and I&apos;m wondering if its behavior is different. I&apos;ll attempt to step through the code and see if the data is being dropped somewhere in the kernel call stack.&lt;/p&gt;</comment>
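                            <!--
                            A sketch for confirming the read_cache_enable correlation described above; the
                            parameter name comes from the reproducer in this ticket, everything else is
                            illustrative:

                                # failing configuration: osd_read_prep() drops pages via
                                # generic_error_remove_page()
                                lctl set_param osd-ldiskfs.*.read_cache_enable=0

                                # passing configuration: pages stay mapped and the bulk send completes
                                lctl set_param osd-ldiskfs.*.read_cache_enable=1

                                # confirm the current setting
                                lctl get_param osd-ldiskfs.*.read_cache_enable
                            -->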
                            <comment id="208363" author="jhammond" created="Thu, 14 Sep 2017 14:41:53 +0000"  >&lt;p&gt;Oleg have you tested this on a multiple node setup using the same debug kernel?&lt;/p&gt;</comment>
                            <comment id="208366" author="jhammond" created="Thu, 14 Sep 2017 15:03:43 +0000"  >&lt;p&gt;&amp;gt; I believe that lnet is smart enough to use lolnd when it notices that the target nid is the same as its own.&lt;/p&gt;

&lt;p&gt;This is no longer the case after 0f1aaad4c1b4447ee5097b8bb79a49d09eaa23c2 (&quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9480&quot; title=&quot;LNet Dynamic Discovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9480&quot;&gt;&lt;del&gt;LU-9480&lt;/del&gt;&lt;/a&gt; lnet: implement Peer Discovery&quot;), which seems like a bug.&lt;/p&gt;</comment>
                            <comment id="208425" author="green" created="Thu, 14 Sep 2017 20:55:15 +0000"  >&lt;p&gt;Multinode setup appears to be working ok&lt;/p&gt;</comment>
                            <comment id="208519" author="ashehata" created="Fri, 15 Sep 2017 17:30:04 +0000"  >&lt;p&gt;One outstanding question that should be answered: Is it safe for  generic_error_remove_page() to be called on the pages before they are passed to the LND?&lt;/p&gt;</comment>
                            <comment id="208757" author="ashehata" created="Tue, 19 Sep 2017 16:25:55 +0000"  >&lt;p&gt;According to Alex: yes, it&apos;s safe as page is removed from the mapping (so another threads can&apos;t find it), but it can&apos;t get reused for anything else until last page_put() is called&lt;/p&gt;

&lt;p&gt;To summarize, the issue seems to be observed only on Oleg&apos;s VM in a single-node setup running a debug kernel. It appears that calling generic_error_remove_page(), which unmaps the pages before they are sent by the socklnd, causes the socklnd send to succeed without actually sending the pages. It is still not known exactly why that happens.&lt;/p&gt;

&lt;p&gt;This issue occurred after 0f1aaad4c1b4447ee5097b8bb79a49d09eaa23c2, which discovers the loopback interface when it is initially used; then, because the lolnd has no credits, it always prefers the other interfaces. This issue has been resolved in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9992&quot; title=&quot;Multi-Rail: use lolnd when sending locally&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9992&quot;&gt;&lt;del&gt;LU-9992&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Should this still be a blocker?&lt;/p&gt;</comment>
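                            <!--
                            The root cause described above is that discovery finds the loopback interface and
                            the credit-less lolnd then loses out to the other local NIs. A sketch for inspecting
                            that state on a node; lnetctl discover and peer show exist in Multi-Rail LNet, and
                            the NID is this ticket's example address:

                                # trigger discovery against our own NID, then list the peer NIs that
                                # were created; on affected builds 0@lo is not the preferred path
                                lnetctl discover 192.168.10.226@tcp
                                lnetctl peer show
                            -->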
                            <comment id="232445" author="pjones" created="Wed, 22 Aug 2018 17:48:54 +0000"  >&lt;p&gt;Sounds like a duplicate&#160;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="48315">LU-9992</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="48315">LU-9992</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzjmf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>