<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:30:25 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3036] Interop 1.8.9&lt;-&gt;2.4 Failure on test suite sanityn test_23: atime doesn&apos;t update among nodes</title>
                <link>https://jira.whamcloud.com/browse/LU-3036</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for sarah &amp;lt;sarah@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/25418d26-948b-11e2-93c6-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/25418d26-948b-11e2-93c6-52540035b04c&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_23 failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;test_23 failed with 1&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Info required for matching: sanityn 23&lt;/p&gt;</description>
                <environment>client: 1.8.9&lt;br/&gt;
server: lustre-master build #1338</environment>
        <key id="18102">LU-3036</key>
            <summary>Interop 1.8.9&lt;-&gt;2.4 Failure on test suite sanityn test_23: atime doesn&apos;t update among nodes</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="emoly.liu">Emoly Liu</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Tue, 26 Mar 2013 21:06:04 +0000</created>
                <updated>Mon, 17 Jun 2013 02:14:43 +0000</updated>
                            <resolved>Mon, 17 Jun 2013 02:14:43 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                    <fixVersion>Lustre 1.8.9</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="54933" author="green" created="Wed, 27 Mar 2013 17:26:01 +0000"  >&lt;p&gt;I noticed that in the b2_1 the test is different, it sends the signal to multiop before reading the time with a comment that says:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        multiop_bg_pause $DIR1/f23 or20_c || &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 1
        # with SOM and opencache enabled, we need to close a file and cancel
        # open lock to get atime propogated to MDS
        kill -USR1 $!
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I imagine it could be that b1_8 interop needs the same thing?&lt;/p&gt;

&lt;p&gt;Also somebody needs to check correctness of the updated test in 2.x code, it looks pretty strange now to me, what&apos;s the point of a pause if we stop it the very next thing?&lt;/p&gt;</comment>
                            <comment id="55032" author="pjones" created="Thu, 28 Mar 2013 17:56:36 +0000"  >&lt;p&gt;Emoly&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="55080" author="emoly.liu" created="Fri, 29 Mar 2013 07:52:29 +0000"  >&lt;p&gt;I will try per Oleg&apos;s advice.&lt;/p&gt;</comment>
                            <comment id="57721" author="emoly.liu" created="Mon, 6 May 2013 13:47:33 +0000"  >&lt;p&gt;This failure is easy to reproduce on my local VMs. I tried the method that Oleg mentioned but failed.&lt;/p&gt;

&lt;p&gt;I will investigate more.&lt;/p&gt;</comment>
                            <comment id="57901" author="emoly.liu" created="Wed, 8 May 2013 13:50:29 +0000"  >&lt;p&gt;sanity test_203 on b1_8 is another atime test, whose script is similar to sanityN test_23. But I found there was some lustre version judgement there, introduced by bz=23766 for 1.8.6&amp;lt;-&amp;gt;2.1.0 interop (&lt;a href=&quot;https://projectlava.xyratex.com/show_bug.cgi?id=23766#c56&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://projectlava.xyratex.com/show_bug.cgi?id=23766#c56&lt;/a&gt;).&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt; test_203() {
+        local lustre_version=$(get_lustre_version mds)
+        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; [[ $lustre_version != 1.8* ]]; then
+               skip bug23766 mds running $lustre_version
+               &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;
+        fi
+
         local ATIME=`do_facet mds lctl get_param -n mds.*.atime_diff`
         echo &lt;span class=&quot;code-quote&quot;&gt;&quot;atime should be updated on the MDS when closing file&quot;&lt;/span&gt; &amp;gt; $DIR/$tfile
         sync
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Why did we add this? If I remove this version check, test_203 apparently fails in the b1_8&amp;lt;-&amp;gt;master interop test, the same way test_23 does. Has something been blocking interop atime updates since then?&lt;/p&gt;

&lt;p&gt;BTW, I know the path of parameter &quot;atime_diff&quot; is a little different between b1_8 and others, but I don&apos;t think that will be a reason to skip this test, we can use md*.*.atime_diff instead.&lt;/p&gt;</comment>
                            <comment id="58022" author="emoly.liu" created="Thu, 9 May 2013 14:30:44 +0000"  >&lt;p&gt;I forgot to say in my last two comments that the problem of &quot;no such process&quot; really can be fixed by Oleg&apos;s suggestion, but atime update problem still exists in b18&amp;lt;-&amp;gt;master interop. The maloo test report is at &lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/a1e209cc-b872-11e2-891d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/a1e209cc-b872-11e2-891d-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;On master, we update atime on close if (la_valid == LA_ATIME),&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; mdt_mfd_close(struct mdt_thread_info *info, struct mdt_file_data *mfd)
{
...
        &lt;span class=&quot;code-comment&quot;&gt;/* Update atime on close only. */&lt;/span&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; ((mode &amp;amp; MDS_FMODE_EXEC || mode &amp;amp; FMODE_READ || mode &amp;amp; FMODE_WRITE)
            &amp;amp;&amp;amp; (ma-&amp;gt;ma_valid &amp;amp; MA_INODE) &amp;amp;&amp;amp; (ma-&amp;gt;ma_attr.la_valid &amp;amp; LA_ATIME)) {
                &lt;span class=&quot;code-comment&quot;&gt;/* Set the atime only. */&lt;/span&gt;
                ma-&amp;gt;ma_valid = MA_INODE;
                ma-&amp;gt;ma_attr.la_valid = LA_ATIME;
                rc = mo_attr_set(info-&amp;gt;mti_env, next, ma);
        }
...
}

&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; mdd_fix_attr(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env, struct mdd_object *obj,
                        struct lu_attr *la, &lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; unsigned &lt;span class=&quot;code-object&quot;&gt;long&lt;/span&gt; flags)
{
...
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (la-&amp;gt;la_valid == LA_ATIME) {
                &lt;span class=&quot;code-comment&quot;&gt;/* This is atime only set &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; read atime update on close. */&lt;/span&gt;
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (la-&amp;gt;la_atime &amp;gt;= tmp_la-&amp;gt;la_atime &amp;amp;&amp;amp;
                    la-&amp;gt;la_atime &amp;lt; (tmp_la-&amp;gt;la_atime +
                                    mdd_obj2mdd_dev(obj)-&amp;gt;mdd_atime_diff))
                        la-&amp;gt;la_valid &amp;amp;= ~LA_ATIME;
                RETURN(0);
        }
...
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;but my debug message showed that &quot;valid&quot; from b18 client wasn&apos;t mapped to LA_ATIME finally. I will investigate more.&lt;/p&gt;</comment>
                            <comment id="58114" author="emoly.liu" created="Fri, 10 May 2013 06:03:18 +0000"  >&lt;p&gt;After looking at the code between master and b1_8, I found they used different attribute flags to trigger atime update.&lt;/p&gt;

&lt;p&gt;MASTER: ll_close_inode_openhandle()-&amp;gt;ll_prepare_close(): ATTR_MODE | &lt;font color=&quot;red&quot;&gt;ATTR_ATIME_SET&lt;/font&gt; | ATTR_MTIME_SET | ATTR_CTIME_SET;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;----------------------------------------------------------------
     client          |           server
----------------------------------------------------------------
 ATTR_* -----&amp;gt;   MDS_ATTR*   ------&amp;gt;   ATTR_*   ------&amp;gt;   LA_* 
      attr_pack()
                          attr_unpack()
                                        mdt_attr_valid_xlate()  ---&amp;gt;  if (in &amp;amp; ATTR_ATIME_SET)
----------------------------------------------------------------          out |= LA_ATIME;     
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;B1_8: ll_close_inode_openhandle(): OBD_MD_FLTYPE | OBD_MD_FLMODE | &lt;font color=&quot;red&quot;&gt;OBD_MD_FLATIME&lt;/font&gt; | OBD_MD_FLMTIME | OBD_MD_FLCTIME;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;--------------------------------------------------
     client          |           server
--------------------------------------------------
 OBD_MD_FL* -----&amp;gt;   MDS_ATTR*   ------&amp;gt;   ATTR_*  
    mdc_close_pack_20()
                     OBD_MD_FL*
    mdc_close_pack_18()
                            mds_mfd_close()    -----&amp;gt; if ((request_body-&amp;gt;valid &amp;amp; OBD_MD_FLATIME) &amp;amp;&amp;amp;
--------------------------------------------------        ((request_body-&amp;gt;atime &amp;gt; LTIME_S(inode-&amp;gt;i_atime) + mds-&amp;gt;mds_atime_diff) )) {                      
                                                                 LTIME_S(iattr.ia_atime) = request_body-&amp;gt;atime;
                                                                 iattr.ia_valid |= ATTR_ATIME;
                                                      }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I don&apos;t think we need to force 2.x to understand the flags from the b18 client, so I have decided to add a Lustre version check that skips this test in interop testing.&lt;/p&gt;

&lt;p&gt;I will push a patch later.&lt;/p&gt;</comment>
                            <comment id="58120" author="emoly.liu" created="Fri, 10 May 2013 07:41:19 +0000"  >&lt;p&gt;patch for b18 is at &lt;a href=&quot;http://review.whamcloud.com/#change,6289&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,6289&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="58122" author="adilger" created="Fri, 10 May 2013 08:27:23 +0000"  >&lt;p&gt;Sorry, I don&apos;t understand your comment.  The 1.8 client has support for the 2.x protocol in &lt;tt&gt;mdc_close_pack_20()&lt;/tt&gt;, so it should be sending the same MDS_ATTR_* flags as the 2.x client.  The &lt;tt&gt;mdc_close_pack_18()&lt;/tt&gt; codepath is only used when the client is communicating with a 1.8 MDS.&lt;/p&gt;

&lt;p&gt;So, I don&apos;t think we should be so quick to just skip this test.  If the 1.8 client sends the same MDS_ATTR_* flags as the 2.x client, then the 2.x server should treat the atime update in the same way.&lt;/p&gt;</comment>
                            <comment id="58128" author="emoly.liu" created="Fri, 10 May 2013 08:50:48 +0000"  >&lt;p&gt;Probably the code path in my last comment misled you.&lt;/p&gt;

&lt;p&gt;The flags translation between 1.8 and 2.x has no problem. The problem is that they use different flags to trigger atime update.&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;2.x uses ATTR_ATIME_SET. From the code we can see, 2.x updates atime on close only if (la-&amp;gt;la_valid == LA_ATIME), and only ATTR_ATIME_SET can be translated into LA_ATIME.&lt;/li&gt;
	&lt;li&gt;but 1.8 uses ATTR_ATIME.&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="58209" author="adilger" created="Fri, 10 May 2013 22:40:26 +0000"  >&lt;p&gt;The protocol is &lt;em&gt;supposed&lt;/em&gt; to be that &lt;tt&gt;ATTR_ATIME&lt;/tt&gt; means the atime should be updated, but the server generates the timestamp if &lt;tt&gt;ATTR_ATIME_SET&lt;/tt&gt; is not also set.  See e.g. &lt;tt&gt;mds_fix_attr()&lt;/tt&gt; in 1.8 updating xtime locally if &lt;tt&gt;ATTR_xTIME_SET&lt;/tt&gt; is missing, and &lt;tt&gt;ll_setattr_raw()&lt;/tt&gt; on the client setting &lt;tt&gt;ATTR_xTIME_SET&lt;/tt&gt; if it is actually sending the atime value along in the RPC.&lt;/p&gt;

&lt;p&gt;Looking at the code more closely, I see that there are some more improvements that should be made.  The conversion in &lt;tt&gt;mdt_setattr_unpack_rec()&lt;/tt&gt; from &lt;tt&gt;MDS_ATTR_*&lt;/tt&gt; to &lt;tt&gt;ATTR_*&lt;/tt&gt; to &lt;tt&gt;LA_*&lt;/tt&gt; doesn&apos;t make sense.  &lt;tt&gt;mdt_attr_valid_xlate()&lt;/tt&gt; should be changed to convert from &lt;tt&gt;MDS_ATTR_*&lt;/tt&gt; directly to &lt;tt&gt;LA_*&lt;/tt&gt; and &lt;tt&gt;attr_unpack()&lt;/tt&gt; should just be removed (it is not used anywhere else).&lt;/p&gt;

&lt;p&gt;There are several patches that are needed to fix this issue properly (in order of decreasing priority):&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;2.1/2.4 MDS should check in &lt;tt&gt;mdt_setattr_unpack_rec()&lt;/tt&gt; if &lt;tt&gt;MDS_ATTR_xTIME&lt;/tt&gt; is set without &lt;tt&gt;MDS_ATTR_xTIME_SET&lt;/tt&gt;, and set &lt;tt&gt;MDS_ATTR_xTIME_SET&lt;/tt&gt; if the client does not have &lt;tt&gt;OBD_CONNECT_FULL20&lt;/tt&gt; (i.e. 1.8 client), before calling &lt;tt&gt;mdt_attr_valid_xlate()&lt;/tt&gt;.  This will fix old 1.8 clients with new 2.x servers.&lt;/li&gt;
	&lt;li&gt;2.1/2.4 clients should set both &lt;tt&gt;MDS_ATTR_xTIME | MDS_ATTR_xTIME_SET&lt;/tt&gt; for timestamps in &lt;tt&gt;ll_prepare_close()&lt;/tt&gt;.  This is for protocol correctness and allows us to fix the server-side timestamp setting in the future.&lt;/li&gt;
	&lt;li&gt;2.1/2.4 clients should add sanity.sh test_203() if this is checking something different than test_23().  This improves test coverage.&lt;/li&gt;
	&lt;li&gt;2.4 MDS should remove &lt;tt&gt;attr_unpack()&lt;/tt&gt;&lt;/li&gt;
	&lt;li&gt;1.8 clients should set both &lt;tt&gt;MDS_ATTR_xTIME | MDS_ATTR_xTIME_SET&lt;/tt&gt; when converting from &lt;tt&gt;OBD_MD_FLATIME&lt;/tt&gt; in &lt;tt&gt;mdc_close_pack_20()&lt;/tt&gt;.  This will fix new 1.8 clients with old 2.x servers.&lt;/li&gt;
	&lt;li&gt;1.8 sanity.sh test_203() should remove the &quot;skip&quot; check, since it &lt;em&gt;should&lt;/em&gt; be working properly with 2.x MDS.  This ensures interop is working again.&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="58221" author="emoly.liu" created="Sat, 11 May 2013 01:18:46 +0000"  >&lt;p&gt;Thanks, Andreas! I will prepare for these patches.&lt;/p&gt;

&lt;p&gt;BTW, do we keep supporting 1.8 client for 2.5 and later?&lt;/p&gt;</comment>
                            <comment id="58224" author="adilger" created="Sat, 11 May 2013 05:32:53 +0000"  >&lt;p&gt;No, interoperability with 1.8 clients will end with Lustre 2.4, and 2.5 will not need to interoperate with these older clients anymore. &lt;/p&gt;</comment>
                            <comment id="58276" author="emoly.liu" created="Mon, 13 May 2013 15:22:53 +0000"  >&lt;p&gt;patch for 2.4 is at &lt;a href=&quot;http://review.whamcloud.com/6327&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/6327&lt;/a&gt;. It includes the following fixes&lt;/p&gt;
&lt;blockquote&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;2.1/2.4 MDS should check in mdt_setattr_unpack_rec() if MDS_ATTR_xTIME is set without MDS_ATTR_xTIME_SET, and set MDS_ATTR_xTIME_SET if the client does not have OBD_CONNECT_FULL20 (i.e. 1.8 client), before calling mdt_attr_valid_xlate(). This will fix old 1.8 clients with new 2.x servers.&lt;/li&gt;
	&lt;li&gt;2.1/2.4 clients should set both MDS_ATTR_xTIME | MDS_ATTR_xTIME_SET for timestamps in ll_prepare_close(). This is for protocol correctness and allows us to fix the server-side timestamp setting in the future.&lt;/li&gt;
	&lt;li&gt;2.1/2.4 clients should add sanity.sh test_203() if this is checking something different than test_23(). This improves test coverage.&lt;/li&gt;
	&lt;li&gt;2.4 MDS should remove attr_unpack()&lt;/li&gt;
&lt;/ul&gt;
&lt;/blockquote&gt;

&lt;p&gt;BTW, sanityn.sh test_23 does the same check as sanity.sh test_203, so I didn&apos;t add test_203 but improved test_23 instead.&lt;/p&gt;</comment>
                            <comment id="58848" author="emoly.liu" created="Mon, 20 May 2013 01:03:13 +0000"  >&lt;p&gt;Patch landed for 2.4 and b18&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvmc7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7407</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>