<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:27:35 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2716] DNE on ZFS create remote directory suffers from long sync.</title>
                <link>https://jira.whamcloud.com/browse/LU-2716</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;The ZFS transaction sync operation will passively wait for a txg commit, instead of actively requesting TXG commit start before waiting.  This causes each synchronous operation to wait up to 5s for the normal txg flush.&lt;/p&gt;

&lt;p&gt;Until such a time that we have implemented Lustre ZIL support (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4009&quot; title=&quot;Add ZIL support to osd-zfs&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4009&quot;&gt;LU-4009&lt;/a&gt;), it makes sense that transaction handles that are marked as synchronous start a TXG commit instead of passively waiting.  This might impact aggregate performance, but DNE operations will be relatively rare, and should not impact normal operations noticeably.  That is especially true for flash-based ZFS devices, since the IOPS to update the &#252;berblock copies at the start of each block device do not need multiple mechanical full-platter seeks, so the cost of forcing a commit is relatively low.&lt;/p&gt;

&lt;p&gt;As an optimization, there could be a &quot;batch wait and reschedule&quot; for TXG sync, as there is for jbd2 since commit v2.6.28-5737-ge07f7183a486 &quot;&lt;tt&gt;jbd2: improve jbd2 fsync batching&lt;/tt&gt;&quot; so that it allows multiple active threads to join the same TXG before it is closed for commit but does not wait unnecessarily between commits.&lt;/p&gt;</description>
                <environment></environment>
        <key id="15895">LU-2716</key>
            <summary>DNE on ZFS create remote directory suffers from long sync.</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="di.wang">Di Wang</reporter>
                        <labels>
                            <label>dne</label>
                            <label>performance</label>
                            <label>zfs</label>
                    </labels>
                <created>Tue, 11 Sep 2012 22:21:10 +0000</created>
                <updated>Wed, 25 Oct 2023 19:40:24 +0000</updated>
                                            <version>Lustre 2.4.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="390442" author="bzzz" created="Wed, 25 Oct 2023 04:33:52 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; boolean_t
txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig)
{
	tx_state_t *tx = &amp;amp;dp-&amp;gt;dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&amp;amp;tx-&amp;gt;tx_sync_lock);
	ASSERT3U(tx-&amp;gt;tx_threads, ==, 2);
	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (txg == 0)
		txg = tx-&amp;gt;tx_open_txg + TXG_DEFER_SIZE;
	&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (tx-&amp;gt;tx_sync_txg_waiting &amp;lt; txg)
		tx-&amp;gt;tx_sync_txg_waiting = txg;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;...&lt;/p&gt;

&lt;p&gt;then the sync thread:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
		/*
		 * We sync when we&lt;span class=&quot;code-quote&quot;&gt;&apos;re scanning, there&apos;&lt;/span&gt;s someone waiting
		 * on us, or the quiesce thread has handed off a txg to
		 * us, or we have reached our timeout.
		 */
		timer = (delta &amp;gt;= timeout ? 0 : timeout - delta);
		&lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; (!dsl_scan_active(dp-&amp;gt;dp_scan) &amp;amp;&amp;amp;
		    !tx-&amp;gt;tx_exiting &amp;amp;&amp;amp; timer &amp;gt; 0 &amp;amp;&amp;amp;
		    tx-&amp;gt;tx_synced_txg &amp;gt;= tx-&amp;gt;tx_sync_txg_waiting &amp;amp;&amp;amp;
		    !txg_has_quiesced_to_sync(dp)) {
			dprintf(&lt;span class=&quot;code-quote&quot;&gt;&quot;waiting; tx_synced=%llu waiting=%llu dp=%p\n&quot;&lt;/span&gt;,
			    (u_longlong_t)tx-&amp;gt;tx_synced_txg,
			    (u_longlong_t)tx-&amp;gt;tx_sync_txg_waiting, dp);
			txg_thread_wait(tx, &amp;amp;cpr, &amp;amp;tx-&amp;gt;tx_sync_more_cv, timer);
			delta = ddi_get_lbolt() - start;
			timer = (delta &amp;gt; timeout ? 0 : timeout - delta);
		}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I don&apos;t think that we do wait passively&lt;/p&gt;</comment>
                            <comment id="390444" author="adilger" created="Wed, 25 Oct 2023 04:53:03 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=behlendorf&quot; class=&quot;user-hover&quot; rel=&quot;behlendorf&quot;&gt;behlendorf&lt;/a&gt;, I came across this issue again in a discussion about DIO performance from clients, which require server transaction commits before they complete.  In the absence of ZIL support, I think it would be fairly easy to change the code to frequently force TXG commits on flash VDEVs, and I think this would dramatically improve DNE operations for ZFS storage.  The algorithm in kernel commit v2.6.28-5737-ge07f7183a486 &quot;&lt;tt&gt;jbd2: improve jbd2 fsync batching&lt;/tt&gt;&quot; is a fairly simple way to decide how frequently to start a TXG commit based on the latency of the underlying storage.  However, instead of checking the PID of the thread doing the sync write, it probably makes sense to use the client NID or similar to detect serial writers.&lt;/p&gt;</comment>
                            <comment id="390602" author="behlendorf" created="Wed, 25 Oct 2023 19:40:24 +0000"  >&lt;p&gt;We do have a couple of options for forcing more frequent TXG syncs which I agree probably makes sense for Lustre on flash pools.&#160; The easiest option which exists today would be to reduce the zfs &quot;zfs_txg_timeout&quot; kmod option.&#160; This option ensures a txg sync happens at least every N seconds (default 5).&#160; It&apos;d be easy to tweak that option to allow a time in milliseconds to be set to experiment with.&#160; My major concern would be that a full TXG sync is a pretty heavyweight operation, and we want to allow as much async batching as possible.&#160; Even on flash.&#160; Forcing constant, frequent TXG syncs would also probably hurt resilver and scrub speeds which run as part of the txg_sync process and expect typical multi-second TXG sync times.&#160; While I&apos;m sure we could account for this, it seems like this could badly throw off the jbd2 algorithm.&lt;/p&gt;

&lt;p&gt;Another option might be for Lustre to call the ZFS txg_kick() function when it needs a TXG sync to happen immediately.&#160; You could call this for the relevant DNE operations, and even do your own intelligent batching at the Lustre layer.&#160; We don&apos;t currently export this symbol, but we could.&lt;/p&gt;

&lt;p&gt;You can always dump the /proc/spl/kstat/zfs/&amp;lt;pool&amp;gt;/txgs proc file to get an idea of how often and how long TXG syncs are taking.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="21123">LU-4009</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzusx3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2347</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>