Break up the large htt tx_lock critical section in tx completion

- The motivation is to reduce contention on tx_lock in the downlink path.
- The original tx completion path holds tx_lock while it processes MSDUs one by one. This blocks htt_tx from filling htt->pending_tx even when free space is available.
- This CL narrows the tx_lock critical section in tx completion so that only the MSDU-id-related operations run under tx_lock.

Performance comparisons:
- Setup: Desktop -> 1G Ethernet -> AP -> 802.11ac -> MacBook
- 802.11ac channel 52 with DFS, TCP traffic

- Performance reported by iperf -c ... -i 1 -P 3 -t 10 (best of 3 samples)
, ath10k (unchanged), ath10k (with this CL), LSDK
Downlink 620 Mbps, 719 Mbps, 884 Mbps
Uplink 380 Mbps, 400 Mbps, 750 Mbps

- Performance reported by www.speedtest.net on the MacBook (best of 3 samples)
, ath10k (unchanged), ath10k (with this CL), LSDK
Downlink 570 Mbps, 708 Mbps, 300 Mbps
Uplink 325 Mbps, 322 Mbps, 428 Mbps
(Not sure why the LSDK downlink result is so poor in this setup.)

Change-Id: I1ca723f77594b8e71729c604d7a20f84aa6fbb7e
diff --git a/drivers/net/wireless/ath/ath10k/htt.h b/drivers/net/wireless/ath/ath10k/htt.h
index 3b44217..cc32c77 100644
--- a/drivers/net/wireless/ath/ath10k/htt.h
+++ b/drivers/net/wireless/ath/ath10k/htt.h
@@ -1255,6 +1255,16 @@
 	unsigned long *used_msdu_ids; /* bitmap */
 	wait_queue_head_t empty_tx_wq;
 	struct dma_pool *tx_pool;
+	/* Circular FIFO of free msdu_ids, so that allocating or freeing
+	 * an msdu_id is O(1). free_msdu_ids_size entries are valid,
+	 * starting at index head and wrapping at max_num_pending_tx;
+	 * tail is the next slot to be written when an id is freed.
+	 * head == tail means the queue is either empty or full;
+	 * free_msdu_ids_size tells the two cases apart. */
+	int *free_msdu_ids;
+	int free_msdu_ids_head;
+	int free_msdu_ids_tail;
+	int free_msdu_ids_size;
 
 	/* set if host-fw communication goes haywire
 	 * used to avoid further failures */
diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c b/drivers/net/wireless/ath/ath10k/htt_rx.c
index 2cc15a1..dd362b3 100644
--- a/drivers/net/wireless/ath/ath10k/htt_rx.c
+++ b/drivers/net/wireless/ath/ath10k/htt_rx.c
@@ -1464,8 +1464,6 @@
 	__le16 msdu_id;
 	int i;
 
-	lockdep_assert_held(&htt->tx_lock);
-
 	switch (status) {
 	case HTT_DATA_TX_STATUS_NO_ACK:
 		tx_done.no_ack = true;
@@ -1631,15 +1629,11 @@
 			break;
 		}
 
-		spin_lock_bh(&htt->tx_lock);
 		ath10k_txrx_tx_unref(htt, &tx_done);
-		spin_unlock_bh(&htt->tx_lock);
 		break;
 	}
 	case HTT_T2H_MSG_TYPE_TX_COMPL_IND:
-		spin_lock_bh(&htt->tx_lock);
-		__skb_queue_tail(&htt->tx_compl_q, skb);
-		spin_unlock_bh(&htt->tx_lock);
+		skb_queue_tail(&htt->tx_compl_q, skb);
 		tasklet_schedule(&htt->txrx_compl_task);
 		return;
 	case HTT_T2H_MSG_TYPE_SEC_IND: {
@@ -1704,12 +1698,10 @@
 	struct htt_resp *resp;
 	struct sk_buff *skb;
 
-	spin_lock_bh(&htt->tx_lock);
-	while ((skb = __skb_dequeue(&htt->tx_compl_q))) {
+	while ((skb = skb_dequeue(&htt->tx_compl_q))) {
 		ath10k_htt_rx_frm_tx_compl(htt->ar, skb);
 		dev_kfree_skb_any(skb);
 	}
-	spin_unlock_bh(&htt->tx_lock);
 
 	spin_lock_bh(&htt->rx_ring.lock);
 	while ((skb = __skb_dequeue(&htt->rx_compl_q))) {
diff --git a/drivers/net/wireless/ath/ath10k/htt_tx.c b/drivers/net/wireless/ath/ath10k/htt_tx.c
index 49bb04d..b3fdf07 100644
--- a/drivers/net/wireless/ath/ath10k/htt_tx.c
+++ b/drivers/net/wireless/ath/ath10k/htt_tx.c
@@ -63,11 +63,15 @@
 
 	lockdep_assert_held(&htt->tx_lock);
 
-	msdu_id = find_first_zero_bit(htt->used_msdu_ids,
-				      htt->max_num_pending_tx);
-	if (msdu_id == htt->max_num_pending_tx)
+	if (htt->free_msdu_ids_size == 0)
 		return -ENOBUFS;
 
+	msdu_id = htt->free_msdu_ids[htt->free_msdu_ids_head];
+	++htt->free_msdu_ids_head;
+	--htt->free_msdu_ids_size;
+	if (htt->free_msdu_ids_head == htt->max_num_pending_tx)
+		htt->free_msdu_ids_head = 0;
+
 	ath10k_dbg(ar, ATH10K_DBG_HTT, "htt tx alloc msdu_id %d\n", msdu_id);
 	__set_bit(msdu_id, htt->used_msdu_ids);
 	return msdu_id;
@@ -79,9 +83,16 @@
 
 	lockdep_assert_held(&htt->tx_lock);
 
-	if (!test_bit(msdu_id, htt->used_msdu_ids))
+	if (!test_bit(msdu_id, htt->used_msdu_ids)) {
 		ath10k_warn(ar, "trying to free unallocated msdu_id %d\n",
 			    msdu_id);
+	} else {
+		htt->free_msdu_ids[htt->free_msdu_ids_tail] = msdu_id;
+		++htt->free_msdu_ids_tail;
+		++htt->free_msdu_ids_size;
+		if (htt->free_msdu_ids_tail == htt->max_num_pending_tx)
+			htt->free_msdu_ids_tail = 0;
+	}
 
 	ath10k_dbg(ar, ATH10K_DBG_HTT, "htt tx free msdu_id %hu\n", msdu_id);
 	__clear_bit(msdu_id, htt->used_msdu_ids);
@@ -90,7 +101,7 @@
 int ath10k_htt_tx_alloc(struct ath10k_htt *htt)
 {
 	struct ath10k *ar = htt->ar;
-
+	int tx_idx;
 	spin_lock_init(&htt->tx_lock);
 	init_waitqueue_head(&htt->empty_tx_wq);
 
@@ -123,6 +134,24 @@
 		return -ENOMEM;
 	}
 
+	htt->free_msdu_ids = kmalloc(sizeof(int) *
+					 htt->max_num_pending_tx,
+					 GFP_KERNEL|__GFP_REPEAT|__GFP_HIGH);
+
+	if (!htt->free_msdu_ids) {
+		kfree(htt->used_msdu_ids);
+		kfree(htt->pending_tx);
+		dma_pool_destroy(htt->tx_pool);
+		return -ENOMEM;
+	}
+
+	for (tx_idx = 0; tx_idx < htt->max_num_pending_tx; ++tx_idx) {
+		htt->free_msdu_ids[tx_idx] = tx_idx;
+	}
+	htt->free_msdu_ids_head = 0;
+	htt->free_msdu_ids_tail = 0;
+	htt->free_msdu_ids_size = htt->max_num_pending_tx;
+
 	return 0;
 }
 
@@ -143,7 +172,10 @@
 		tx_done.discard = 1;
 		tx_done.msdu_id = msdu_id;
 
+		/* Drop tx_lock here: ath10k_txrx_tx_unref() acquires it itself. */
+		spin_unlock_bh(&htt->tx_lock);
 		ath10k_txrx_tx_unref(htt, &tx_done);
+		spin_lock_bh(&htt->tx_lock);
 	}
 	spin_unlock_bh(&htt->tx_lock);
 }
@@ -153,6 +185,7 @@
 	ath10k_htt_tx_free_pending(htt);
 	kfree(htt->pending_tx);
 	kfree(htt->used_msdu_ids);
+	kfree(htt->free_msdu_ids);
 	dma_pool_destroy(htt->tx_pool);
 }
 
diff --git a/drivers/net/wireless/ath/ath10k/txrx.c b/drivers/net/wireless/ath/ath10k/txrx.c
index d498fe3..6ed6a9a 100644
--- a/drivers/net/wireless/ath/ath10k/txrx.c
+++ b/drivers/net/wireless/ath/ath10k/txrx.c
@@ -53,7 +53,6 @@
 	struct ath10k_skb_cb *skb_cb;
 	struct sk_buff *msdu;
 
-	lockdep_assert_held(&htt->tx_lock);
 
 	ath10k_dbg(ar, ATH10K_DBG_HTT, "htt tx completion msdu_id %u discard %d no_ack %d\n",
 		   tx_done->msdu_id, !!tx_done->discard, !!tx_done->no_ack);
@@ -64,7 +63,9 @@
 		return;
 	}
 
+	spin_lock_bh(&htt->tx_lock);
 	msdu = htt->pending_tx[tx_done->msdu_id];
+	spin_unlock_bh(&htt->tx_lock);
 	skb_cb = ATH10K_SKB_CB(msdu);
 
 	dma_unmap_single(dev, skb_cb->paddr, msdu->len, DMA_TO_DEVICE);
@@ -94,11 +95,13 @@
 	/* we do not own the msdu anymore */
 
 exit:
+	spin_lock_bh(&htt->tx_lock);
 	htt->pending_tx[tx_done->msdu_id] = NULL;
 	ath10k_htt_tx_free_msdu_id(htt, tx_done->msdu_id);
 	__ath10k_htt_tx_dec_pending(htt);
 	if (htt->num_pending_tx == 0)
 		wake_up(&htt->empty_tx_wq);
+	spin_unlock_bh(&htt->tx_lock);
 }
 
 /* hold conf_mutex for simple iteration, or conf_mutex+data_lock for