如何用上Intel 82576中的Allocation of Tx Bandwidth to VMs
最近在看Linux 2.6.32.13中igb的代码,现在有个问题,怎么使用Intel 82576中的Allocation of Tx Bandwidth to VMs功能?
该功能可以对分配给虚拟机的虚拟网卡做带宽分配,有大侠用过吗?
igb驱动怎么改可以用上这个功能呢?以下是在网上搜的:
http://sourceforge.net/mailarchi ... um_name=e1000-devel
上述链接中相关讨论邮件提到的方法,我做了实验,都没能用上,甚至连VF都没有虚拟出来,不知哪位大侠了解,一起讨论讨论?
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(3)
[E1000-devel] [rfc 3/3 v3] [rfc 4/4] igb: expose 82576 bandwidth allocation
From: Simon Horman <horms@ve...> - 2009-11-26 00:42
The 82576 has support for bandwidth allocation to VFs.
Contrary to the documentation in the 82576 datasheet v2.41 this
appears to work as follows:
* The ratio supplied is always proportional to 1Gbit/s,
regardless of the link speed.
* The ratio supplied is an upper-bound on bandwidth available
to the VF, not a minimum guarantee
This patch exposes bandwidth control to userspace through a simple
per-device (PF) sysfs file, bandwidth_allocation.
* The file contains a whitespace delimited list of values, one per VF.
* The first value corresponds to the first VF and so on.
* Valid values are integers from 0 to 1000
* A value of 0 indicates that bandwidth_allocation is disabled.
* Other values indicate the allocated bandwidth, in 1/1000ths of a gigabit/s
e.g. The following for a PF with 4 VFs allocates ~20Mbit/s to VF 1,
~100Mbit/s to VF 2, and leaves the other 2 VFs with no allocation.
echo "20 100 0 0" > /sys/class/net/eth3/device/bandwidth_allocation
This interface is intended to allow testing of the hardware feature.
There are ongoing discussions about how to expose this feature
to user-space in a more generic way.
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
Thu, 05 Nov 2009 11:58:51 +1100
* Initial post
Wed, 25 Nov 2009 16:58:23 +1100
* Refresh for changes to preceding patches in series
* Up-port to latest net-next
Index: net-next-2.6/drivers/net/igb/igb_main.c
===================================================================
--- net-next-2.6.orig/drivers/net/igb/igb_main.c 2009-11-26 10:33:01.000000000 +1100
+++ net-next-2.6/drivers/net/igb/igb_main.c 2009-11-26 10:33:01.000000000 +1100
@@ -47,6 +47,9 @@
#ifdef CONFIG_IGB_DCA
#include <linux/dca.h>
#endif
+#ifdef CONFIG_PCI_IOV
+#include <linux/ctype.h>
+#endif
#include "igb.h"
#define DRV_VERSION "2.1.0-k2"
@@ -157,6 +160,15 @@ static unsigned int max_vfs = 0;
module_param(max_vfs, uint, 0);
MODULE_PARM_DESC(max_vfs, "Maximum number of virtual functions to allocate "
"per physical function");
+
+/*
+ * sysfs attribute "bandwidth_allocation" on the PF device: world-readable,
+ * writable by the owner only.  The attribute is only referenced from this
+ * file, so it is declared static to keep dev_attr_bandwidth_allocation out
+ * of the global symbol namespace.
+ */
+static ssize_t igb_set_bandwidth_allocation(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t count);
+static ssize_t igb_show_bandwidth_allocation(struct device *dev,
+					     struct device_attribute *attr,
+					     char *buf);
+static DEVICE_ATTR(bandwidth_allocation, S_IRUGO | S_IWUSR,
+		   igb_show_bandwidth_allocation, igb_set_bandwidth_allocation);
#endif /* CONFIG_PCI_IOV */
static pci_ers_result_t igb_io_error_detected(struct pci_dev *,
@@ -1760,6 +1772,19 @@ static void __devinit igb_init_vf(struct
if (pci_enable_sriov(pdev, adapter->vfs_allocated_count))
goto err_free;
+ if (device_create_file(&pdev->dev, &dev_attr_bandwidth_allocation))
+ goto err_sriov;
+
+ adapter->bandwidth_allocation = kcalloc(adapter->vfs_allocated_count,
+ sizeof(unsigned int),
+ GFP_KERNEL);
+ if (!adapter->bandwidth_allocation)
+ goto err_file;
+ memset(adapter->bandwidth_allocation,
+ adapter->vfs_allocated_count * sizeof(unsigned int), 0);
+
+ spin_lock_init(&adapter->bandwidth_allocation_lock);
+
dev_info(&pdev->dev, "%d vfs allocated\n",
adapter->vfs_allocated_count);
for (i = 0; i < adapter->vfs_allocated_count; i++) {
@@ -1768,6 +1793,10 @@ static void __devinit igb_init_vf(struct
}
return;
+err_file:
+ device_remove_file(&pdev->dev, &dev_attr_bandwidth_allocation);
+err_sriov:
+ pci_disable_sriov(pdev);
err_free:
kfree(adapter->vf_data);
err_zero:
@@ -1892,6 +1921,7 @@ static void igb_init_hw_timer(struct igb
static void igb_cleanup_vf(struct igb_adapter * adapter)
{
#ifdef CONFIG_PCI_IOV
+ struct pci_dev *pdev = adapter->pdev;
struct e1000_hw *hw = &adapter->hw;
if (!adapter->vf_data)
@@ -1908,6 +1938,9 @@ static void igb_cleanup_vf(struct igb_ad
wr32(E1000_IOVCTL, E1000_IOVCTL_REUSE_VFQ);
msleep(100);
dev_info(&adapter->pdev->dev, "IOV Disabled\n");
+
+ device_remove_file(&pdev->dev, &dev_attr_bandwidth_allocation);
+ kfree(adapter->bandwidth_allocation);
#endif
}
@@ -2216,6 +2249,123 @@ void igb_configure_tx_ring(struct igb_ad
wr32(E1000_TXDCTL(reg_idx), txdctl);
}
+#ifdef CONFIG_PCI_IOV
+/*
+ * Switch off Tx bandwidth allocation for a single VF.
+ *
+ * The VF index is written to the select register first, then the
+ * configuration register is cleared.  The order of the two writes matters.
+ */
+static void igb_disable_bandwidth_allocation_vf(struct e1000_hw *hw, int vf)
+{
+	const u32 cleared_config = 0;
+
+	wr32(E1000_VMBASEL, vf);
+	wr32(E1000_VMBAC, cleared_config);
+}
+
+/*
+ * Switch off Tx bandwidth allocation for every allocated VF of the adapter.
+ */
+static void igb_disable_bandwidth_allocation(struct igb_adapter *adapter)
+{
+	struct e1000_hw *hw = &adapter->hw;
+	int vf = 0;
+
+	while (vf < adapter->vfs_allocated_count) {
+		igb_disable_bandwidth_allocation_vf(hw, vf);
+		vf++;
+	}
+}
+
+/*
+ * Enable Tx rate control for one VF.
+ *
+ * @hw:         hardware handle used by wr32()
+ * @vf:         index of the VF, written to VMBASEL
+ * @allocation: bandwidth in 1/1000ths of 1Gbit/s; 0 means "no limit" and
+ *              disables rate control for the VF
+ */
+static void igb_enable_bandwidth_allocation_vf(struct e1000_hw *hw, int vf,
+					       unsigned int allocation)
+{
+	u32 rq;
+
+	/* 0 must never reach the division below */
+	if (!allocation) {
+		igb_disable_bandwidth_allocation_vf(hw, vf);
+		return;
+	}
+
+	/* Allocation is expressed as 1000ths of link speed [+]
+	 *
+	 * rq is calculated as 1 / (allocation / 1000) = 1000 / allocation
+	 *
+	 * E1000_VMBAC_RF_INT_SHIFT and E1000_VMBAC_RF_MASK are used
+	 * to marshal the result into the desired format: 23 bits of
+	 * which 14 are to the right of the decimal point.
+	 *
+	 * [+] According to the 82576 v2.41 datasheet rq should
+	 *     be a ratio of the link speed, however, empirically
+	 *     it appears to always be a ratio of 1Gbit/s,
+	 *     even when the link is 100Mbit/s.
+	 *
+	 * NOTE(review): for allocation == 1 the quotient is 1000, which
+	 * does not fit in the 9 integer bits left by the 23-bit mask and
+	 * is silently truncated; confirm the field width in the datasheet.
+	 */
+	rq = ((1000 << E1000_VMBAC_RF_INT_SHIFT) / allocation) &
+	     E1000_VMBAC_RF_MASK;
+
+	/* Select the VF first, then program its rate factor and enable bit */
+	wr32(E1000_VMBASEL, vf);
+	wr32(E1000_VMBAC, rq | E1000_VMBAC_RC_ENA);
+}
+
+/*
+ * Push the per-VF limits stored in adapter->bandwidth_allocation[] to the
+ * hardware.
+ *
+ * If the table has not been allocated, or the link speed is neither
+ * 100Mbit/s nor 1Gbit/s, bandwidth allocation is disabled for every VF
+ * instead.  Both callers in this patch hold bandwidth_allocation_lock.
+ */
+static void igb_enable_bandwidth_allocation(struct igb_adapter *adapter)
+{
+	struct e1000_hw *hw = &adapter->hw;
+	u32 i;
+
+	/* Only enable bandwidth_allocation if it has been set
+	 * and the link speed is 100Mbit/s or 1Gbit/s */
+	if (!adapter->bandwidth_allocation ||
+	    (adapter->link_speed != SPEED_100 &&
+	     adapter->link_speed != SPEED_1000)) {
+		igb_disable_bandwidth_allocation(adapter);
+		return;
+	}
+
+	for (i = 0; i < adapter->vfs_allocated_count; i++) {
+		/* Per-VF entry; a value of 0 means "no limit" for this VF */
+		unsigned int allocation = adapter->bandwidth_allocation[i];
+
+		/* Both helpers write VMBASEL themselves before VMBAC */
+		if (allocation)
+			igb_enable_bandwidth_allocation_vf(hw, i, allocation);
+		else
+			igb_disable_bandwidth_allocation_vf(hw, i);
+
+		/* XXX:
+		 *
+		 * The 82576 datasheet, section 4.5.11.1.5.1 "Configuring Tx
+		 * Bandwidth to VMs" states that the desired setting is:
+		 * VMBAMMW.MMW_SIZE = 16 * MSS
+		 *
+		 * But isn't MSS a property of skbs that are using tso
+		 * rather than adapters?
+		 *
+		 * If so, should we use the maximum value here? */
+		/* XXX: Should this go inside or outside the for loop ? */
+		wr32(E1000_VMBAMMW, 64 * 16);
+	}
+}
+#endif
+
+/*
+ * Re-apply bandwidth allocation; invoked from the watchdog task.
+ *
+ * Does nothing when the kernel is built without CONFIG_PCI_IOV or when
+ * the adapter has no VF data.
+ */
+static void igb_check_bandwidth_allocation(struct igb_adapter *adapter)
+{
+#ifdef CONFIG_PCI_IOV
+	struct e1000_hw *hw = &adapter->hw;
+	u32 status;
+
+	if (!adapter->vf_data)
+		return;
+
+	/* The 82576 datasheet, section 4.5.11.1.5.2 "Link Speed Change
+	 * Procedure" describes the sequence below. However the
+	 * SPEED_CHG never seems to be set.
+	 */
+	status = rd32(E1000_VMBACS);
+	if (status & E1000_VMBACS_SPEED_CHG) {
+		/* XXX: Never seem to get here */
+		const bool was_set = (status & E1000_VMBACS_VMBA_SET) != 0;
+
+		if (was_set)
+			igb_disable_bandwidth_allocation(adapter);
+
+		/* Write the status back with SPEED_CHG cleared */
+		wr32(E1000_VMBACS, status & ~E1000_VMBACS_SPEED_CHG);
+
+		/* Allocation was just disabled; do not re-enable it now */
+		if (was_set)
+			return;
+	}
+
+	spin_lock(&adapter->bandwidth_allocation_lock);
+	igb_enable_bandwidth_allocation(adapter);
+	spin_unlock(&adapter->bandwidth_allocation_lock);
+#endif
+}
+
/**
* igb_configure_tx - Configure transmit Unit after Reset
* @adapter: board private structure
@@ -3100,6 +3250,8 @@ static void igb_watchdog_task(struct wor
break;
}
+ igb_check_bandwidth_allocation(adapter);
+
netif_carrier_on(netdev);
igb_ping_all_vfs(adapter);
@@ -5999,4 +6151,101 @@ static void igb_vmm_control(struct igb_a
}
}
+#ifdef CONFIG_PCI_IOV
+/*
+ * sysfs show handler for "bandwidth_allocation".
+ *
+ * Writes the per-VF allocation values as a single space-separated line
+ * terminated by a newline, first VF first.
+ *
+ * Returns the number of bytes written to @buf, or -ENOENT when the
+ * adapter has no VF data.
+ */
+static ssize_t igb_show_bandwidth_allocation(struct device *dev,
+					     struct device_attribute *attr,
+					     char *buf)
+{
+	struct net_device *netdev = dev_get_drvdata(dev);
+	struct igb_adapter *adapter = netdev_priv(netdev);
+	ssize_t len = 0;
+	int i;
+
+	if (!adapter->vf_data)
+		return -ENOENT;
+
+	/*
+	 * A sysfs show buffer is one page; scnprintf() keeps every write
+	 * inside it and returns the number of characters actually stored,
+	 * so the running length can be accumulated directly.
+	 */
+	for (i = 0; i < adapter->vfs_allocated_count; i++)
+		len += scnprintf(buf + len, PAGE_SIZE - len, "%s%u",
+				 i ? " " : "",
+				 adapter->bandwidth_allocation[i]);
+	len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+
+	return len;
+}
+
+/*
+ * simple_strtoul() wrapper that skips leading whitespace.
+ *
+ * When no characters are consumed by the conversion, *endp is set back to
+ * the original @cp (before the skipped whitespace), so callers can detect
+ * "nothing parsed" by comparing *endp with the pointer they passed in.
+ */
+static unsigned long igb_strtoul(const char *cp, char **endp, unsigned int base)
+{
+	const char *digits = cp;
+	unsigned long value;
+
+	for (; isspace(*digits); digits++)
+		;
+
+	value = simple_strtoul(digits, endp, base);
+	if (*endp == digits)
+		*endp = (char *)cp;
+
+	return value;
+}
+
+/*
+ * sysfs store handler for "bandwidth_allocation".
+ *
+ * Expects exactly one whitespace-separated decimal value per allocated VF,
+ * each in the range 0..1000.  The values are parsed into a scratch array
+ * first, so the adapter state is only touched once the whole line has been
+ * validated.
+ *
+ * Returns @count on success, -ENOMEM if the scratch array cannot be
+ * allocated, and -ENOENT when the adapter has no VF data or the input is
+ * rejected.
+ * NOTE(review): -EINVAL would describe malformed input better; -ENOENT is
+ * kept so the errno seen by user space does not change.
+ */
+static ssize_t igb_set_bandwidth_allocation(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t count)
+{
+	struct net_device *netdev = dev_get_drvdata(dev);
+	struct igb_adapter *adapter = netdev_priv(netdev);
+	ssize_t status = -ENOENT;
+	unsigned int *new;
+	unsigned long x;
+	const char *p = buf;
+	char *next_p;
+	size_t len;
+	int i;
+
+	if (!adapter->vf_data)
+		return -ENOENT;
+
+	len = adapter->vfs_allocated_count * sizeof(*new);
+
+	new = kmalloc(len, GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	for (i = 0; i < adapter->vfs_allocated_count; i++) {
+		x = igb_strtoul(p, &next_p, 10);
+		if (p == next_p) {
+			dev_err(dev, "not enough values\n");
+			goto err;
+		}
+		if (x > 1000) {
+			dev_err(dev, "value is too large\n");
+			goto err;
+		}
+		/* Store into the element, never into the pointer itself */
+		new[i] = x;
+		p = next_p;
+	}
+
+	/* Check for trailing rubbish: igb_strtoul() only advances when it
+	 * parses another number, so this catches surplus values */
+	igb_strtoul(p, &next_p, 10);
+	if (p != next_p) {
+		dev_err(dev, "trailing rubbish\n");
+		goto err;
+	}
+
+	/* Publish the new table and program the hardware under the lock */
+	spin_lock(&adapter->bandwidth_allocation_lock);
+	memcpy(adapter->bandwidth_allocation, new, len);
+	igb_enable_bandwidth_allocation(adapter);
+	spin_unlock(&adapter->bandwidth_allocation_lock);
+
+	status = count;
+err:
+	kfree(new);
+	return status;
+}
+#endif /* CONFIG_PCI_IOV */
/* igb_main.c */
Index: net-next-2.6/drivers/net/igb/e1000_regs.h
===================================================================
--- net-next-2.6.orig/drivers/net/igb/e1000_regs.h 2009-11-26 10:32:02.000000000 +1100
+++ net-next-2.6/drivers/net/igb/e1000_regs.h 2009-11-26 10:33:01.000000000 +1100
@@ -311,6 +311,16 @@
#define E1000_VLVF(_n) (0x05D00 + (4 * (_n))) /* VLAN Virtual Machine
* Filter - RW */
+/* Tx Bandwidth Allocation to VM Registers */
+#define E1000_VMBACS 0x03600 /* VM Bandwidth Allocation
+ * Control & Status - RW; holds the SPEED_CHG and VMBA_SET bits */
+#define E1000_VMBAMMW 0x03670 /* VM Bandwidth Allocation
+ * Max Memory Window - RW; the driver writes 64 * 16 here */
+#define E1000_VMBASEL 0x03604 /* VM Bandwidth Allocation
+ * Select - RW; written with the VF index before VMBAC is written */
+#define E1000_VMBAC 0x03608 /* VM Bandwidth Allocation
+ * Config - RW; written right after VMBASEL: rate factor plus RC_ENA, or 0 */
+
#define wr32(reg, value) (writel(value, hw->hw_addr + reg))
#define rd32(reg) (readl(hw->hw_addr + reg))
#define wrfl() ((void)rd32(E1000_STATUS))
Index: net-next-2.6/drivers/net/igb/e1000_defines.h
===================================================================
--- net-next-2.6.orig/drivers/net/igb/e1000_defines.h 2009-11-26 10:32:02.000000000 +1100
+++ net-next-2.6/drivers/net/igb/e1000_defines.h 2009-11-26 10:33:01.000000000 +1100
@@ -724,4 +724,13 @@
#define E1000_PCIEMISC_LX_DECISION 0x00000080 /* Lx power decision based
on DMA coal */
+/* VM Bandwidth Allocation Control & Status */
+#define E1000_VMBACS_VMBA_SET 0x00001000 /* bit 12; tested once SPEED_CHG is seen */
+#define E1000_VMBACS_SPEED_CHG 0x80000000 /* bit 31; the driver writes it back as 0 */
+
+/* VM Bandwidth Allocation Config */
+#define E1000_VMBAC_RF_INT_SHIFT 14 /* 14 fractional bits sit below the integer part */
+#define E1000_VMBAC_RF_MASK ((1<<23)-1) /* RF_DEC and RF_INT */
+#define E1000_VMBAC_RC_ENA 0x80000000 /* bit 31; OR-ed into VMBAC to enable a limit */
+
#endif
Index: net-next-2.6/drivers/net/igb/igb.h
===================================================================
--- net-next-2.6.orig/drivers/net/igb/igb.h 2009-11-26 10:32:02.000000000 +1100
+++ net-next-2.6/drivers/net/igb/igb.h 2009-11-26 10:33:01.000000000 +1100
@@ -312,6 +312,10 @@ struct igb_adapter {
unsigned int vfs_allocated_count;
struct vf_data_storage *vf_data;
u32 rss_queues;
+#ifdef CONFIG_PCI_IOV
+ unsigned int *bandwidth_allocation;
+ spinlock_t bandwidth_allocation_lock;
+#endif
};
#define IGB_FLAG_HAS_MSI (1 << 0)
分析上述代码,实际上主要的思想是,在igb_probe中在/sys目录下创建一个可读写的文件,
以该文件进行用户态与内核态之间信息的交互,向该文件中写入要设置的分配给VF的带宽大小。
然后驱动程序读取该文件中的值,设置82576相对应的寄存器,即可。
本帖最后由 wangpeng168 于 2011-03-29 17:31 编辑
不过上述代码参考的是Intel 82576的Datasheet版本为2.41,我看的为2.61的,2.61规范上说的设置寄存器步骤与实现的代码好像有点不一样,有看过Intel 82576的Datasheet版本为2.61的大侠吗?但是根据2.61的也做了实验,VF是正常出现了,但是分配给虚拟机时,虚拟机发现不了VF。
另外我读取 VMBACS (0x3600; RW)时,
vmbacs = rd32(E1000_VMBACS);
printk(KERN_INFO "TEST: 0x%08x\n", vmbacs);
打印出来的值一直是0x00c00804
按2.61版文档的说法,初始值应该是0x00400804,不知道为什么不一样?文档说保留位21~23必须设置为010b的啊?
有知道的么?跟这个关系大么?
另外,我设置wr32(E1000_VMBACS, 0x0F470804);
开启带宽分配功能,执行到igb_probe的末尾又变成0x00c00804
所有操作是在igb_probe函数的如下宏定义
#ifdef CONFIG_PCI_IOV
...
#endif
中添加的