Skip to content

Commit 8afefc3

Browse files
Souradeep Chakrabarti authored and Paolo Abeni committed
net: mana: Assigning IRQ affinity on HT cores
The existing MANA design assigns an IRQ to every CPU, including sibling hyper-threads. This may cause multiple IRQs to be active simultaneously in the same core and may reduce network performance. Improve the performance by assigning IRQs to non-sibling CPUs in the local NUMA node. The performance improvement we are getting using ntttcp with the following patch is around 15 percent against the existing design, and approximately 11 percent when trying to assign one IRQ to each core across NUMA nodes, if enough cores are present. The change will improve the performance for systems with a high number of CPUs, where the number of CPUs in a node is more than 64. Nodes with 64 or fewer CPUs will not be affected by this change. The performance study was done using the ntttcp tool in Azure. The system had 2 NUMA nodes with 32 cores each, 128 vCPUs in total, and the number of channels was 32 for 32 RX rings. The table below shows a comparison between the existing design and the new design: IRQ node-num core-num CPU performance(%) 1 0 | 0 0 | 0 0 | 0-1 0 2 0 | 0 0 | 1 1 | 2-3 3 3 0 | 0 1 | 2 2 | 4-5 10 4 0 | 0 1 | 3 3 | 6-7 15 5 0 | 0 2 | 4 4 | 8-9 15 ... ... 25 0 | 0 12| 24 24| 48-49 12 ... 32 0 | 0 15| 31 31| 62-63 12 33 0 | 0 16| 0 32| 0-1 10 ... 64 0 | 0 31| 31 63| 62-63 0 Signed-off-by: Souradeep Chakrabarti <[email protected]> Signed-off-by: Paolo Abeni <[email protected]>
1 parent 91bfe21 commit 8afefc3

File tree

1 file changed

+50
-11
lines changed

1 file changed

+50
-11
lines changed

drivers/net/ethernet/microsoft/mana/gdma_main.c

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1249,7 +1249,7 @@ void mana_gd_free_res_map(struct gdma_resource *r)
12491249
r->size = 0;
12501250
}
12511251

1252-
static __maybe_unused int irq_setup(unsigned int *irqs, unsigned int len, int node)
1252+
static int irq_setup(unsigned int *irqs, unsigned int len, int node)
12531253
{
12541254
const struct cpumask *next, *prev = cpu_none_mask;
12551255
cpumask_var_t cpus __free(free_cpumask_var);
@@ -1280,22 +1280,35 @@ static __maybe_unused int irq_setup(unsigned int *irqs, unsigned int len, int no
12801280

12811281
static int mana_gd_setup_irqs(struct pci_dev *pdev)
12821282
{
1283-
unsigned int max_queues_per_port = num_online_cpus();
12841283
struct gdma_context *gc = pci_get_drvdata(pdev);
1284+
unsigned int max_queues_per_port;
12851285
struct gdma_irq_context *gic;
12861286
unsigned int max_irqs, cpu;
1287-
int nvec, irq;
1287+
int start_irq_index = 1;
1288+
int nvec, *irqs, irq;
12881289
int err, i = 0, j;
12891290

1291+
cpus_read_lock();
1292+
max_queues_per_port = num_online_cpus();
12901293
if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
12911294
max_queues_per_port = MANA_MAX_NUM_QUEUES;
12921295

12931296
/* Need 1 interrupt for the Hardware communication Channel (HWC) */
12941297
max_irqs = max_queues_per_port + 1;
12951298

12961299
nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
1297-
if (nvec < 0)
1300+
if (nvec < 0) {
1301+
cpus_read_unlock();
12981302
return nvec;
1303+
}
1304+
if (nvec <= num_online_cpus())
1305+
start_irq_index = 0;
1306+
1307+
irqs = kmalloc_array((nvec - start_irq_index), sizeof(int), GFP_KERNEL);
1308+
if (!irqs) {
1309+
err = -ENOMEM;
1310+
goto free_irq_vector;
1311+
}
12991312

13001313
gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
13011314
GFP_KERNEL);
@@ -1323,17 +1336,41 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
13231336
goto free_irq;
13241337
}
13251338

1326-
err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
1327-
if (err)
1328-
goto free_irq;
1329-
1330-
cpu = cpumask_local_spread(i, gc->numa_node);
1331-
irq_set_affinity_and_hint(irq, cpumask_of(cpu));
1339+
if (!i) {
1340+
err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
1341+
if (err)
1342+
goto free_irq;
1343+
1344+
/* If number of IRQ is one extra than number of online CPUs,
1345+
* then we need to assign IRQ0 (hwc irq) and IRQ1 to
1346+
* same CPU.
1347+
* Else we will use different CPUs for IRQ0 and IRQ1.
1348+
* Also we are using cpumask_local_spread instead of
1349+
* cpumask_first for the node, because the node can be
1350+
* mem only.
1351+
*/
1352+
if (start_irq_index) {
1353+
cpu = cpumask_local_spread(i, gc->numa_node);
1354+
irq_set_affinity_and_hint(irq, cpumask_of(cpu));
1355+
} else {
1356+
irqs[start_irq_index] = irq;
1357+
}
1358+
} else {
1359+
irqs[i - start_irq_index] = irq;
1360+
err = request_irq(irqs[i - start_irq_index], mana_gd_intr, 0,
1361+
gic->name, gic);
1362+
if (err)
1363+
goto free_irq;
1364+
}
13321365
}
13331366

1367+
err = irq_setup(irqs, (nvec - start_irq_index), gc->numa_node);
1368+
if (err)
1369+
goto free_irq;
1370+
13341371
gc->max_num_msix = nvec;
13351372
gc->num_msix_usable = nvec;
1336-
1373+
cpus_read_unlock();
13371374
return 0;
13381375

13391376
free_irq:
@@ -1346,8 +1383,10 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
13461383
}
13471384

13481385
kfree(gc->irq_contexts);
1386+
kfree(irqs);
13491387
gc->irq_contexts = NULL;
13501388
free_irq_vector:
1389+
cpus_read_unlock();
13511390
pci_free_irq_vectors(pdev);
13521391
return err;
13531392
}

0 commit comments

Comments
 (0)