學(xué)習(xí)啦 > 學(xué)習(xí)電腦 > 操作系統(tǒng) > Linux教程 > 數(shù)據(jù)包接收詳解

數(shù)據(jù)包接收詳解

時(shí)間: 春健736 分享

數(shù)據(jù)包接收詳解

  學(xué)習(xí)啦小編為大家分享了Linux內(nèi)核數(shù)據(jù)包處理流程-數(shù)據(jù)包接收的詳細(xì)說明,有需要的可以參考下

  數(shù)據(jù)包接收

  一、從網(wǎng)卡說起

  這並非是一個網卡驅動分析的專門文檔,只是對網卡處理數據包的流程進行一個重點的分析。這裡以Intel的e100驅動為例進行分析。

  大多數網卡都是一個PCI設備,PCI設備都包含了一個標準的配置寄存器,寄存器中,包含了PCI設備的廠商ID、設備ID等等信息,驅動

  程序使用struct pci_device_id來描述這些寄存器的標識符。如下:

  CODE:

  /*
   * Describes one PCI device (or family of devices) by its ID registers.
   * Drivers build arrays of these entries to list the devices they support
   * (see e100_id_table).
   */
  struct pci_device_id {

  __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/

  __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */

  __u32 class, class_mask; /* (class,subclass,prog-if) triplet */

  kernel_ulong_t driver_data; /* Data private to the driver */

  };

  這樣,在驅(qū)動(dòng)程序中,常常就可以看到定義一個(gè)struct pci_device_id 類型的數(shù)組,告訴內(nèi)核支持不同類型的

  PCI設(shè)備的列表,以e100驅(qū)動(dòng)為例:

  /*
   * Expands to one struct pci_device_id initializer, in field order:
   * Intel vendor ID, the given device ID, any subsystem vendor/device,
   * the Ethernet network class with mask 0xFFFF00, and `ich` stored in
   * driver_data (e100_probe reads it back to set or clear the ich flag).
   */
  #define INTEL_8255X_ETHERNET_DEVICE(device_id, ich) {\

  PCI_VENDOR_ID_INTEL, device_id, PCI_ANY_ID, PCI_ANY_ID, \

  PCI_CLASS_NETWORK_ETHERNET << 8, 0xFFFF00, ich }

  /* Table of PCI devices handled by the e100 driver. */
  static struct pci_device_id e100_id_table[] = {

  INTEL_8255X_ETHERNET_DEVICE(0x1029, 0),

  INTEL_8255X_ETHERNET_DEVICE(0x1030, 0),

  INTEL_8255X_ETHERNET_DEVICE(0x1031, 3),

  ……/* many more supported devices omitted here */

  /* all-zero entry; presumably the end-of-table sentinel -- confirm against the PCI core */
  { 0, }

  };

  在內核中,一個PCI設備的驅動程序,使用struct pci_driver結構來描述:

  /*
   * Describes a PCI driver: its name, the table of device IDs it handles,
   * and the callbacks invoked for probe/remove, power management and shutdown.
   */
  struct pci_driver {

  struct list_head node;

  char *name;

  struct module *owner;

  const struct pci_device_id *id_table; /* must be non-NULL for probe to be called */

  int (*probe) (struct pci_dev *dev, const struct pci_device_id *id); /* New device inserted */

  void (*remove) (struct pci_dev *dev); /* Device removed (NULL if not a hot-plug capable driver) */

  int (*suspend) (struct pci_dev *dev, pm_message_t state); /* Device suspended */

  int (*resume) (struct pci_dev *dev); /* Device woken up */

  int (*enable_wake) (struct pci_dev *dev, pci_power_t state, int enable); /* Enable wake event */

  void (*shutdown) (struct pci_dev *dev);

  struct device_driver driver;

  struct pci_dynids dynids;

  };

  因?yàn)樵谙到y(tǒng)引導(dǎo)的時(shí)候,PCI設(shè)備已經(jīng)被識(shí)別,當(dāng)內(nèi)核發(fā)現(xiàn)一個(gè)已經(jīng)檢測(cè)到的設(shè)備同驅(qū)動(dòng)注冊(cè)的id_table中的信息相匹配時(shí),

  它就會(huì)觸發(fā)驅(qū)動(dòng)的probe函數(shù),以e100為例:

  /*
   * PCI driver descriptor for e100, named e100_driver:
   * 1. the probe callback is e100_probe;
   * 2. the device ID table is e100_id_table.
   * The suspend/resume callbacks are only set when CONFIG_PM is defined.
   */
  static struct pci_driver e100_driver = {

  .name = DRV_NAME,

  .id_table = e100_id_table,

  .probe = e100_probe,

  .remove = __devexit_p(e100_remove),

  #ifdef CONFIG_PM

  .suspend = e100_suspend,

  .resume = e100_resume,

  #endif

  .driver = {

  .shutdown = e100_shutdown,

  }

  };

  這樣,如果系統(tǒng)檢測(cè)到有與id_table中對(duì)應(yīng)的設(shè)備時(shí),就調(diào)用驅(qū)動(dòng)的probe函數(shù)。

  驅(qū)動(dòng)設(shè)備在init函數(shù)中,調(diào)用pci_module_init函數(shù)初始化PCI設(shè)備e100_driver:

  /*
   * Module entry point: optionally prints the driver banner, then registers
   * e100_driver with the PCI core and returns that call's result.
   */
  static int __init e100_init_module(void)
  {
      /* Message-level bitmask derived from the `debug` setting. */
      const int msg_mask = (1 << debug) - 1;

      if ((msg_mask & NETIF_MSG_DRV) != 0) {
          printk(KERN_INFO PFX "%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
          printk(KERN_INFO PFX "%s\n", DRV_COPYRIGHT);
      }

      return pci_module_init(&e100_driver);
  }

  一切順利的話,注冊(cè)的e100_probe函數(shù)將被內(nèi)核調(diào)用,這個(gè)函數(shù)完成兩個(gè)重要的工作:

  1、分配/初始化/注冊(cè)網(wǎng)絡(luò)設(shè)備;

  2、完成PCI設(shè)備的I/O區(qū)域的分配和映射,以及完成硬件的其它初始化工作;

  網(wǎng)絡(luò)設(shè)備使用struct net_device結(jié)構(gòu)來描述,這個(gè)結(jié)構(gòu)非常之大,許多重要的參考書籍對(duì)它都有較為深入的描述,可以參考《Linux設(shè)備驅(qū)動(dòng)程序》中網(wǎng)卡驅(qū)動(dòng)設(shè)計(jì)的相關(guān)章節(jié)。我會(huì)在后面的內(nèi)容中,對(duì)其重要的成員進(jìn)行注釋;

  當probe函數被調用,證明已經發現了我們所支持的網卡,這樣,就可以調用register_netdev函數向內核注冊網絡設備了,注冊之前,一般會調用alloc_etherdev為以太網分配一個net_device,然後初始化它的重要成員。

  除了向內(nèi)核注冊(cè)網(wǎng)絡(luò)設(shè)備之外,探測(cè)函數(shù)另一項(xiàng)重要的工作就是需要對(duì)硬件進(jìn)行初始化,比如,要訪問其I/O區(qū)域,需要為I/O區(qū)域分配內(nèi)存區(qū)域,然后進(jìn)行映射,這一步一般的流程是:

  1、request_mem_region()

  2、ioremap()

  對(duì)于一般的PCI設(shè)備而言,可以調(diào)用:

  1、pci_request_regions()

  2、ioremap()

  pci_request_regions函數(shù)對(duì)PCI的6個(gè)寄存器都會(huì)調(diào)用資源分配函數(shù)進(jìn)行申請(qǐng)(需要判斷是I/O端口還是I/O內(nèi)存),例如:

  CODE:

  int pci_request_regions(struct pci_dev *pdev, char *res_name)

  {

  int i;

  for (i = 0; i < 6; i++)

  if(pci_request_region(pdev, i, res_name))

  goto err_out;

  return 0;

  CODE:

  int pci_request_region(struct pci_dev *pdev, int bar, char *res_name)

  {

  if (pci_resource_len(pdev, bar) == 0)

  return 0;

  if (pci_resource_flags(pdev, bar) & IORESOURCE_IO) {

  if (!request_region(pci_resource_start(pdev, bar),

  pci_resource_len(pdev, bar), res_name))

  goto err_out;

  }

  else if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) {

  if (!request_mem_region(pci_resource_start(pdev, bar),

  pci_resource_len(pdev, bar), res_name))

  goto err_out;

  }

  return 0;

  有了這些基礎(chǔ),我們來看設(shè)備的探測(cè)函數(shù):

  /*
   * PCI probe callback for the e100 driver.
   *
   * Allocates and fills in the net_device, enables the PCI device, claims and
   * maps its resources, resets the hardware, loads the EEPROM (which supplies
   * the MAC address) and finally registers the network device.
   *
   * @pdev: the PCI device being probed
   * @ent:  the matching id-table entry; its driver_data selects the ich flag
   *
   * Returns 0 on success or a negative errno. On failure, everything acquired
   * so far is released in reverse order through the err_out_* labels.
   */
  static int __devinit e100_probe(struct pci_dev *pdev,

  const struct pci_device_id *ent)

  {

  struct net_device *netdev;

  struct nic *nic;

  int err;

  /* Allocate the network device with room for the private struct nic */

  if(!(netdev = alloc_etherdev(sizeof(struct nic)))) {

  if(((1 << debug) - 1) & NETIF_MSG_PROBE)

  printk(KERN_ERR PFX "Etherdev alloc failed, abort.\n");

  return -ENOMEM;

  }

  /* Install the driver's callbacks on the net_device */

  netdev->open = e100_open;

  netdev->stop = e100_close;

  netdev->hard_start_xmit = e100_xmit_frame;

  netdev->get_stats = e100_get_stats;

  netdev->set_multicast_list = e100_set_multicast_list;

  netdev->set_mac_address = e100_set_mac_address;

  netdev->change_mtu = e100_change_mtu;

  netdev->do_ioctl = e100_do_ioctl;

  SET_ETHTOOL_OPS(netdev, &e100_ethtool_ops);

  netdev->tx_timeout = e100_tx_timeout;

  netdev->watchdog_timeo = E100_WATCHDOG_PERIOD;

  netdev->poll = e100_poll;

  netdev->weight = E100_NAPI_WEIGHT;

  #ifdef CONFIG_NET_POLL_CONTROLLER

  netdev->poll_controller = e100_netpoll;

  #endif

  /* Name the device after its PCI name for now; overwritten with "eth%d" before registration */

  strcpy(netdev->name, pci_name(pdev));

  /* Fetch the driver-private data area of the net_device */

  nic = netdev_priv(netdev);

  /* Back-pointer from the private data to its net_device */

  nic->netdev = netdev;

  /* Pointer to the underlying PCI device */

  nic->pdev = pdev;

  nic->msg_enable = (1 << debug) - 1;

  /* Store the net_device as the PCI device's driver data */

  pci_set_drvdata(pdev, netdev);

  /* Enable the PCI device */

  if((err = pci_enable_device(pdev))) {

  DPRINTK(PROBE, ERR, "Cannot enable PCI device, aborting.\n");

  goto err_out_free_dev;

  }

  /* Resource 0 must be I/O memory; otherwise report the error and bail out */

  if(!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {

  DPRINTK(PROBE, ERR, "Cannot find proper PCI device "

  "base address, aborting.\n");

  err = -ENODEV;

  goto err_out_disable_pdev;

  }

  /* Reserve the device's PCI regions under the driver's name */

  if((err = pci_request_regions(pdev, DRV_NAME))) {

  DPRINTK(PROBE, ERR, "Cannot obtain PCI resources, aborting.\n");

  goto err_out_disable_pdev;

  }

  /*
   * Tell the kernel the device's DMA addressing capability: a 32-bit mask
   * (0xFFFFFFFF).
   */

  if((err = pci_set_dma_mask(pdev, 0xFFFFFFFFULL))) {

  DPRINTK(PROBE, ERR, "No usable DMA configuration, aborting.\n");

  goto err_out_free_res;

  }

  SET_MODULE_OWNER(netdev);

  SET_NETDEV_DEV(netdev, &pdev->dev);

  /* With the regions reserved, map the control/status registers (resource 0) */

  nic->csr = ioremap(pci_resource_start(pdev, 0), sizeof(struct csr));

  if(!nic->csr) {

  DPRINTK(PROBE, ERR, "Cannot map device registers, aborting.\n");

  err = -ENOMEM;

  goto err_out_free_res;

  }

  /* driver_data comes from the id table (second macro argument, `ich`) */
  if(ent->driver_data)

  nic->flags |= ich;

  else

  nic->flags &= ~ich;

  /* Fill in most of the default parameters of the private structure */

  e100_get_defaults(nic);

  /* Initialise the spinlocks; this must be done before calling e100_hw_reset() */

  spin_lock_init(&nic->cb_lock);

  spin_lock_init(&nic->cmd_lock);

  /* Reset the hardware (per the original note: by issuing a reset command to the device) */

  e100_hw_reset(nic);

  /*
   * After the BIOS has configured a PCI card some features may be disabled;
   * most BIOSes clear the "master" bit, which prevents the card from copying
   * data into main memory on its own. pci_set_master() checks whether the
   * bit needs to be set and sets it if so.
   *
   * Background on PCI bus mastering: unlike ISA, the PCI bus time-multiplexes
   * its address and data lines, which saves connector pins and makes burst
   * transfers easier. For each transfer one PCI device acts as the initiator
   * (master) and another as the target (slave); all bus timing is generated
   * and controlled by the master, and only one pair of devices can transfer
   * at any given moment.
   */

  pci_set_master(pdev);

  /* Initialise two kernel timers, watchdog and blink_timer (not armed here) */

  init_timer(&nic->watchdog);

  nic->watchdog.function = e100_watchdog;

  nic->watchdog.data = (unsigned long)nic;

  init_timer(&nic->blink_timer);

  nic->blink_timer.function = e100_blink_led;

  nic->blink_timer.data = (unsigned long)nic;

  INIT_WORK(&nic->tx_timeout_task,

  (void (*)(void *))e100_tx_timeout_task, netdev);

  if((err = e100_alloc(nic))) {

  DPRINTK(PROBE, ERR, "Cannot alloc driver memory, aborting.\n");

  goto err_out_iounmap;

  }

  /* Initialise the PHY registers */

  e100_phy_init(nic);

  if((err = e100_eeprom_load(nic)))

  goto err_out_free;

  /* The MAC address is the first ETH_ALEN bytes of the loaded EEPROM image */
  memcpy(netdev->dev_addr, nic->eeprom, ETH_ALEN);

  if(!is_valid_ether_addr(netdev->dev_addr)) {

  DPRINTK(PROBE, ERR, "Invalid MAC address from "

  "EEPROM, aborting.\n");

  err = -EAGAIN;

  goto err_out_free;

  }

  /* Wol magic packet can be enabled from eeprom */

  if((nic->mac >= mac_82558_D101_A4) &&

  (nic->eeprom[eeprom_id] & eeprom_id_wol))

  nic->flags |= wol_magic;

  /* ack any pending wake events, disable PME */

  pci_enable_wake(pdev, 0, 0);

  /* Register the network device (its name is set to "eth%d" first) */

  strcpy(netdev->name, "eth%d");

  if((err = register_netdev(netdev))) {

  DPRINTK(PROBE, ERR, "Cannot register net device, aborting.\n");

  goto err_out_free;

  }

  DPRINTK(PROBE, INFO, "addr 0x%lx, irq %d, "

  "MAC addr %02X:%02X:%02X:%02X:%02X:%02X\n",

  pci_resource_start(pdev, 0), pdev->irq,

  netdev->dev_addr[0], netdev->dev_addr[1], netdev->dev_addr[2],

  netdev->dev_addr[3], netdev->dev_addr[4], netdev->dev_addr[5]);

  return 0;

  /* Error unwinding: each label undoes one acquisition and falls through to the next */

  err_out_free:

  e100_free(nic);

  err_out_iounmap:

  iounmap(nic->csr);

  err_out_free_res:

  pci_release_regions(pdev);

  err_out_disable_pdev:

  pci_disable_device(pdev);

  err_out_free_dev:

  pci_set_drvdata(pdev, NULL);

  free_netdev(netdev);

  return err;

  }

  執(zhí)行到這里,探測(cè)函數(shù)的使命就完成了,在對(duì)網(wǎng)絡(luò)設(shè)備重要成員初始化時(shí),有:

  netdev->open = e100_open;

  指定了設(shè)備的open函數(shù)為e100_open,這樣,當(dāng)?shù)谝淮问褂迷O(shè)備,比如使用ifconfig工具的時(shí)候,open函數(shù)將被調(diào)用。

  二、打開設(shè)備

  在探測(cè)函數(shù)中,設(shè)置了netdev->open = e100_open; 指定了設(shè)備的open函數(shù)為e100_open:

  CODE:

  /*
   * net_device open callback: marks the carrier as off, then brings the
   * adapter up via e100_up(). Returns e100_up()'s result (0 on success).
   */
  static int e100_open(struct net_device *netdev)
  {
      struct nic *nic = netdev_priv(netdev);
      int err;

      netif_carrier_off(netdev);

      err = e100_up(nic);
      if (err != 0)
          DPRINTK(IFUP, ERR, "Cannot open interface, aborting.\n");

      return err;
  }

  大多數物理設備可以感知信號載波(carrier)的存在,載波的存在意味著設備可以工作。

  舉個例子來講:當一個用戶拔掉了網線,也就意味著信號載波的消失。

  netif_carrier_off:關(guān)閉載波信號(hào);

  netif_carrier_on:打開載波信號(hào);

  netif_carrier_ok:檢測(cè)載波信號(hào);

  對(duì)于探測(cè)網(wǎng)卡網(wǎng)線是否連接,這一組函數(shù)被使用得較多;

  接著,調(diào)用e100_up函數(shù)啟動(dòng)網(wǎng)卡,這個(gè)“啟動(dòng)”的過程,最重要的步驟有:

  1、調(diào)用request_irq向內(nèi)核注冊(cè)中斷;

  2、調用netif_wake_queue函數來重新啟動傳輸隊列;

  CODE:

  /*
   * Bring the adapter up: allocate the receive list and command blocks,
   * initialise the hardware, start the receiver and watchdog, install the
   * interrupt handler, then enable the transmit queue, polling and device
   * interrupts.
   *
   * Returns 0 on success or the negative errno of the step that failed;
   * the steps completed before the failure are undone.
   */
  static int e100_up(struct nic *nic)
  {
      int err;

      err = e100_rx_alloc_list(nic);
      if (err)
          return err;

      err = e100_alloc_cbs(nic);
      if (err)
          goto undo_rx_list;

      err = e100_hw_init(nic);
      if (err)
          goto undo_cbs;

      e100_set_multicast_list(nic->netdev);
      e100_start_receiver(nic, 0);
      mod_timer(&nic->watchdog, jiffies);

      err = request_irq(nic->pdev->irq, e100_intr, SA_SHIRQ,
                        nic->netdev->name, nic->netdev);
      if (err)
          goto undo_watchdog;

      netif_wake_queue(nic->netdev);
      netif_poll_enable(nic->netdev);

      /* Device interrupts are enabled only _after_ polling is enabled, to
       * avoid a race between disabling interrupts and scheduling the poll. */
      e100_enable_irq(nic);

      return 0;

  undo_watchdog:
      del_timer_sync(&nic->watchdog);
  undo_cbs:
      e100_clean_cbs(nic);
  undo_rx_list:
      e100_rx_clean_list(nic);
      return err;
  }

  這樣,中斷函數(shù)e100_intr將被調(diào)用;

  三、網(wǎng)卡中斷

  從本質(zhì)上來講,中斷,是一種電信號(hào),當(dāng)設(shè)備有某種事件發(fā)生的時(shí)候,它就會(huì)產(chǎn)生中斷,通過總線把電信號(hào)發(fā)送給中斷控制器,如果中斷的線是激活的,中斷控制器就把電信號(hào)發(fā)送給處理器的某個(gè)特定引腳。處理器于是立即停止自己正在做的事,跳到內(nèi)存中內(nèi)核設(shè)置的中斷處理程序的入口點(diǎn),進(jìn)行中斷處理。

  在內(nèi)核中斷處理中,會(huì)檢測(cè)中斷與我們剛才注冊(cè)的中斷號(hào)匹配,于是,注冊(cè)的中斷處理函數(shù)就被調(diào)用了。

  當(dāng)需要發(fā)/收數(shù)據(jù),出現(xiàn)錯(cuò)誤,連接狀態(tài)變化等,網(wǎng)卡的中斷信號(hào)會(huì)被觸發(fā)。當(dāng)接收到中斷后,中斷函數(shù)讀取中斷狀態(tài)位,進(jìn)行合法性判斷,如判斷中斷信號(hào)是否是自己的等,然后,應(yīng)答設(shè)備中斷——OK,我已經(jīng)知道了,你回去繼續(xù)工作吧……

  接著,它就屏蔽此中斷,然後調用netif_rx_schedule函數進行接收調度,內核會在未來某一時刻調用設備的poll函數(對這裡而言,注冊的是e100_poll)實現設備的輪詢:

  CODE:

  /*
   * Interrupt handler. Reads the status/ack byte, ignores interrupts that
   * are not ours, acknowledges the pending causes, then masks the device
   * interrupt and schedules the device for polling.
   *
   * Returns IRQ_NONE when the interrupt is not for this device (or the
   * hardware is gone), IRQ_HANDLED otherwise.
   */
  static irqreturn_t e100_intr(int irq, void *dev_id, struct pt_regs *regs)
  {
      struct net_device *netdev = dev_id;
      struct nic *nic = netdev_priv(netdev);
      u8 stat_ack = readb(&nic->csr->scb.stat_ack);

      DPRINTK(INTR, DEBUG, "stat_ack = 0x%02X\n", stat_ack);

      /* Not our interrupt */
      if (stat_ack == stat_ack_not_ours)
          return IRQ_NONE;

      /* Hardware is ejected */
      if (stat_ack == stat_ack_not_present)
          return IRQ_NONE;

      /* Acknowledge the interrupt cause(s) by writing the bits back */
      writeb(stat_ack, &nic->csr->scb.stat_ack);

      /* Receive No Resource (RNR) was hit; the receive unit is restarted
       * after cleaning */
      if ((stat_ack & stat_ack_rnr) != 0)
          nic->ru_running = RU_SUSPENDED;

      e100_disable_irq(nic);
      netif_rx_schedule(netdev);

      return IRQ_HANDLED;
  }

  對(duì)于數(shù)據(jù)包的接收而言,我們關(guān)注的是poll函數(shù)中,調(diào)用e100_rx_clean進(jìn)行數(shù)據(jù)的接收:

  CODE:

  /*
   * Poll callback. Receives up to min(netdev->quota, *budget) packets and
   * cleans completed transmits.
   *
   * netdev->quota is the per-device allowance (refilled from dev->weight in
   * __netif_rx_schedule); *budget is the limit handed in by the caller.
   *
   * Returns 0 after leaving polling mode (nothing was done, or the interface
   * is no longer running) with device interrupts re-enabled; returns 1 when
   * work was done, after charging it against *budget and netdev->quota.
   */
  static int e100_poll(struct net_device *netdev, int *budget)
  {
      struct nic *nic = netdev_priv(netdev);
      unsigned int work_to_do = min(netdev->quota, *budget);
      unsigned int work_done = 0;
      int tx_cleaned;
      int idle;

      /* Do the receive and transmit-completion work */
      e100_rx_clean(nic, &work_done, work_to_do);
      tx_cleaned = e100_tx_clean(nic);

      idle = (work_done == 0) && !tx_cleaned;

      if (!idle && netif_running(netdev)) {
          *budget -= work_done;
          netdev->quota -= work_done;
          return 1;
      }

      /* No Rx and no Tx cleanup work was done (or the interface is down):
       * exit polling mode and re-enable interrupts. */
      netif_rx_complete(netdev);
      e100_enable_irq(nic);
      return 0;
  }

  /*
   * Walk the receive ring: hand completed buffers up via e100_rx_indicate(),
   * refill emptied slots with fresh skbs, and restart the receive unit if
   * it was suspended when this function was entered.
   */
  static inline void e100_rx_clean(struct nic *nic, unsigned int *work_done,
      unsigned int work_to_do)
  {
      struct rx *rx;
      struct rx *rx_to_start = NULL;
      /* Already in the RNR (suspended) state on entry? Tracking this ensures
       * the receiver is never started on a partially cleaned list, avoiding
       * a race between the hardware and rx_to_clean in NAPI mode. */
      int restart_required = (nic->ru_running == RU_SUSPENDED);

      /* Indicate newly arrived packets */
      rx = nic->rx_to_clean;
      while (rx->skb) {
          int err = e100_rx_indicate(nic, rx, work_done, work_to_do);

          if (err == -EAGAIN) {
              /* Quota hit: more work remains, so restart only once
               * cleanup is complete */
              restart_required = 0;
              break;
          }
          if (err == -ENODATA)
              break; /* No more to clean */

          rx = rx->next;
          nic->rx_to_clean = rx;
      }

      /* Remember where cleaning stopped; the receiver restarts from there */
      if (restart_required)
          rx_to_start = nic->rx_to_clean;

      /* Allocate new skbs to refill the list */
      rx = nic->rx_to_use;
      while (!rx->skb) {
          if (unlikely(e100_rx_alloc_skb(nic, rx)))
              break; /* Better luck next time (see watchdog) */

          rx = rx->next;
          nic->rx_to_use = rx;
      }

      if (!restart_required)
          return;

      /* ack the rnr? */
      writeb(stat_ack_rnr, &nic->csr->scb.stat_ack);
      e100_start_receiver(nic, rx_to_start);
      if (work_done)
          (*work_done)++;
  }

  四、網(wǎng)卡的數(shù)據(jù)接收

  內(nèi)核如何從網(wǎng)卡接受數(shù)據(jù),傳統(tǒng)的經(jīng)典過程:

  1、數(shù)據(jù)到達(dá)網(wǎng)卡;

  2、網(wǎng)卡產(chǎn)生一個(gè)中斷給內(nèi)核;

  3、內(nèi)核使用I/O指令,從網(wǎng)卡I/O區(qū)域中去讀取數(shù)據(jù);

  我們?cè)谠S多網(wǎng)卡驅(qū)動(dòng)中,都可以在網(wǎng)卡的中斷函數(shù)中見到這一過程。

  但是,這一種方法,有一種重要的問題,就是大流量的數(shù)據(jù)來到,網(wǎng)卡會(huì)產(chǎn)生大量的中斷,內(nèi)核在中斷上下文中,會(huì)浪費(fèi)大量的資源來處理中斷本身。所以,一個(gè)問題是,“可不可以不使用中斷”,這就是輪詢技術(shù),所謂NAPI技術(shù),說來也不神秘,就是說,內(nèi)核屏蔽中斷,然后隔一會(huì)兒就去問網(wǎng)卡,“你有沒有數(shù)據(jù)啊?”……

  從這個描述本身可以看到,如果數據量少,輪詢同樣占用大量的不必要的CPU資源,大家各有所長吧,呵呵……

  OK,另一個(gè)問題,就是從網(wǎng)卡的I/O區(qū)域,包括I/O寄存器或I/O內(nèi)存中去讀取數(shù)據(jù),這都要CPU去讀,也要占用CPU資源,“CPU從I/O區(qū)域讀,然后把它放到內(nèi)存(這個(gè)內(nèi)存指的是系統(tǒng)本身的物理內(nèi)存,跟外設(shè)的內(nèi)存不相干,也叫主內(nèi)存)中”。于是自然地,就想到了DMA技術(shù)——讓網(wǎng)卡直接從主內(nèi)存之間讀寫它們的I/O數(shù)據(jù),CPU,這兒不干你事,自己找樂子去:

  1、首先,內(nèi)核在主內(nèi)存中為收發(fā)數(shù)據(jù)建立一個(gè)環(huán)形的緩沖隊(duì)列(通常叫DMA環(huán)形緩沖區(qū))。

  2、內(nèi)核將這個(gè)緩沖區(qū)通過DMA映射,把這個(gè)隊(duì)列交給網(wǎng)卡;

  3、網(wǎng)卡收到數(shù)據(jù),就直接放進(jìn)這個(gè)環(huán)形緩沖區(qū)了——也就是直接放進(jìn)主內(nèi)存了;然后,向系統(tǒng)產(chǎn)生一個(gè)中斷;

  4、內(nèi)核收到這個(gè)中斷,就取消DMA映射,這樣,內(nèi)核就直接從主內(nèi)存中讀取數(shù)據(jù);

  ——呵呵,這一個(gè)過程比傳統(tǒng)的過程少了不少工作,因?yàn)樵O(shè)備直接把數(shù)據(jù)放進(jìn)了主內(nèi)存,不需要CPU的干預(yù),效率是不是提高不少?

  對(duì)應(yīng)以上4步,來看它的具體實(shí)現(xiàn):

  1、分配環(huán)形DMA緩沖區(qū)

  Linux內(nèi)核中,用skb來描述一個(gè)緩存,所謂分配,就是建立一定數(shù)量的skb,然后把它們組織成一個(gè)雙向鏈表;

  2、建立DMA映射

  內(nèi)核通過調(diào)用

  dma_map_single(struct device *dev,void *buffer,size_t size,enum dma_data_direction direction)

  建立映射關(guān)系。

  struct device *dev,描述一個(gè)設(shè)備;

  buffer:把哪個(gè)地址映射給設(shè)備;也就是某一個(gè)skb——要映射全部,當(dāng)然是做一個(gè)雙向鏈表的循環(huán)即可;

  size:緩存大小;

  direction:映射方向——誰傳給誰:一般來說,是“雙向”映射,數(shù)據(jù)在設(shè)備和內(nèi)存之間雙向流動(dòng);

  對(duì)于PCI設(shè)備而言(網(wǎng)卡一般是PCI的),通過另一個(gè)包裹函數(shù)pci_map_single,這樣,就把buffer交給設(shè)備了!設(shè)備可以直接從里邊讀/取數(shù)據(jù)。

  3、這一步由硬件完成;

  4、取消映射

  dma_unmap_single,對(duì)PCI而言,大多調(diào)用它的包裹函數(shù)pci_unmap_single,不取消的話,緩存控制權(quán)還在設(shè)備手里,要調(diào)用它,把主動(dòng)權(quán)掌握在CPU手里——因?yàn)槲覀円呀?jīng)接收到數(shù)據(jù)了,應(yīng)該由CPU把數(shù)據(jù)交給上層網(wǎng)絡(luò)棧;

  當(dāng)然,不取消之前,通常要讀一些狀態(tài)位信息,諸如此類,一般是調(diào)用

  dma_sync_single_for_cpu()

  讓CPU在取消映射前,就可以訪問DMA緩沖區(qū)中的內(nèi)容。

  關(guān)于DMA映射的更多內(nèi)容,可以參考《Linux設(shè)備驅(qū)動(dòng)程序》“內(nèi)存映射和DMA”章節(jié)相關(guān)內(nèi)容!

  OK,有了這些知識(shí),我們就可以來看e100的代碼了,它跟上面講的步驟基本上一樣的——繞了這么多圈子,就是想繞到e100上面了,呵呵!

  在e100_open函數(shù)中,調(diào)用e100_up,我們前面分析它時(shí),略過了一個(gè)重要的東東,就是環(huán)形緩沖區(qū)的建立,這一步,是通過

  e100_rx_alloc_list函數(shù)調(diào)用完成的:

  CODE:

  /*
   * Build the receive ring.
   *
   * Allocates nic->params.rfds.count zeroed struct rx nodes as one array,
   * links them into a circular doubly linked list, and gives every node an
   * skb via e100_rx_alloc_skb(). On success rx_to_use and rx_to_clean both
   * point at the first node and the receive unit is marked RU_SUSPENDED.
   *
   * Returns 0 on success, -ENOMEM if the node array or any skb cannot be
   * allocated (the partially built list is torn down with
   * e100_rx_clean_list()).
   */
  static int e100_rx_alloc_list(struct nic *nic)
  {
      struct rx *rx;
      unsigned int i, count = nic->params.rfds.count;

      nic->rx_to_use = nic->rx_to_clean = NULL;
      nic->ru_running = RU_UNINITIALIZED;

      /* kcalloc() zeroes the array and rejects a count * size product that
       * would overflow, unlike an open-coded kmalloc() + memset(). The nodes
       * must start zeroed: e100_rx_alloc_skb() tests rx->prev->skb. */
      nic->rxs = kcalloc(count, sizeof(struct rx), GFP_ATOMIC);
      if (!nic->rxs)
          return -ENOMEM;

      /* The nodes are contiguous, but are still linked into a ring so that
       * the last node's next wraps to the first and vice versa. */
      for (rx = nic->rxs, i = 0; i < count; rx++, i++) {
          rx->next = (i + 1 < count) ? rx + 1 : nic->rxs;
          rx->prev = (i == 0) ? nic->rxs + count - 1 : rx - 1;
          if (e100_rx_alloc_skb(nic, rx)) { /* allocate the buffer */
              e100_rx_clean_list(nic);
              return -ENOMEM;
          }
      }

      nic->rx_to_use = nic->rx_to_clean = nic->rxs;
      nic->ru_running = RU_SUSPENDED;

      return 0;
  }

  CODE:

  #define RFD_BUF_LEN (sizeof(struct rfd) + VLAN_ETH_FRAME_LEN)

  static inline int e100_rx_alloc_skb(struct nic *nic, struct rx *rx)

  {

  /*skb緩存的分配,是通過調(diào)用系統(tǒng)函數(shù)dev_alloc_skb來完成的,它同內(nèi)核棧中通常調(diào)用alloc_skb的區(qū)別在于,

  它是原子的,所以,通常在中斷上下文中使用*/

  if(!(rx->skb = dev_alloc_skb(RFD_BUF_LEN + NET_IP_ALIGN)))

  return -ENOMEM;

  /*初始化必要的成員 */

  rx->skb->dev = nic->netdev;

  skb_reserve(rx->skb, NET_IP_ALIGN);

  /*這里在數(shù)據(jù)區(qū)之前,留了一塊sizeof(struct rfd) 這么大的空間,該結(jié)構(gòu)的

  一個(gè)重要作用,用來保存一些狀態(tài)信息,比如,在接收數(shù)據(jù)之前,可以先通過

  它,來判斷是否真有數(shù)據(jù)到達(dá)等,諸如此類*/

  memcpy(rx->skb->data, &nic->blank_rfd, sizeof(struct rfd));

  /*這是最關(guān)鍵的一步,建立DMA映射,把每一個(gè)緩沖區(qū)rx->skb->data都映射給了設(shè)備,緩存區(qū)節(jié)點(diǎn)

  rx利用dma_addr保存了每一次映射的地址,這個(gè)地址后面會(huì)被用到*/

  rx->dma_addr = pci_map_single(nic->pdev, rx->skb->data,

  RFD_BUF_LEN, PCI_DMA_BIDIRECTIONAL);

  if(pci_dma_mapping_error(rx->dma_addr)) {

  dev_kfree_skb_any(rx->skb);

  rx->skb = 0;

  rx->dma_addr = 0;

  return -ENOMEM;

  }

  /* Link the RFD to end of RFA by linking previous RFD to

  * this one, and clearing EL bit of previous. */

  if(rx->prev->skb) {

  struct rfd *prev_rfd = (struct rfd *)rx->prev->skb->data;

  /*put_unaligned(val,ptr);用到把var放到ptr指針的地方,它能處理處理內(nèi)存對(duì)齊的問題

  prev_rfd是在緩沖區(qū)開始處保存的一點(diǎn)空間,它的link成員,也保存了映射后的地址*/

  put_unaligned(cpu_to_le32(rx->dma_addr),

  (u32 *)&prev_rfd->link);

  wmb();

  prev_rfd->command &= ~cpu_to_le16(cb_el);

  pci_dma_sync_single_for_device(nic->pdev, rx->prev->dma_addr,

  sizeof(struct rfd), PCI_DMA_TODEVICE);

  }

  return 0;

  }

  e100_rx_alloc_list函數(shù)在一個(gè)循環(huán)中,建立了環(huán)形緩沖區(qū),并調(diào)用e100_rx_alloc_skb為每個(gè)緩沖區(qū)分配了空間,并做了

  DMA映射。這樣,我們就可以來看接收數(shù)據(jù)的過程了。

  前面我們講過,中斷函數(shù)中,調(diào)用netif_rx_schedule,表明使用輪詢技術(shù),系統(tǒng)會(huì)在未來某一時(shí)刻,調(diào)用設(shè)備的poll函數(shù):

  CODE:

  /*
   * Poll callback: receive up to min(netdev->quota, *budget) packets and
   * clean completed transmits. Returns 0 after leaving polling mode (with
   * interrupts re-enabled), or 1 when work was done and polling continues.
   */
  static int e100_poll(struct net_device *netdev, int *budget)
  {
      struct nic *nic = netdev_priv(netdev);
      unsigned int work_to_do = min(netdev->quota, *budget);
      unsigned int work_done = 0;
      int tx_cleaned;
      int idle;

      e100_rx_clean(nic, &work_done, work_to_do);
      tx_cleaned = e100_tx_clean(nic);

      idle = (work_done == 0) && !tx_cleaned;

      if (!idle && netif_running(netdev)) {
          /* Charge the work done against both allowances */
          *budget -= work_done;
          netdev->quota -= work_done;
          return 1;
      }

      /* If no Rx and Tx cleanup work was done, exit polling mode. */
      netif_rx_complete(netdev);
      e100_enable_irq(nic);
      return 0;
  }

  目前,我們只關(guān)心rx,所以,e100_rx_clean函數(shù)就成了我們關(guān)注的對(duì)像,它用來從緩沖隊(duì)列中接收全部數(shù)據(jù)(這或許是取名為clean的原因吧!):

  CODE:

  /*
   * Drain the receive ring through e100_rx_indicate(), refill the emptied
   * slots, and restart the receive unit if it was suspended on entry.
   */
  static inline void e100_rx_clean(struct nic *nic, unsigned int *work_done,
      unsigned int work_to_do)
  {
      struct rx *rx;
      struct rx *rx_to_start = NULL;
      /* Already rnr on entry? This guarantees the state machine never
       * starts the receiver with a partially cleaned list, avoiding a race
       * between the hardware and rx_to_clean in NAPI mode. */
      int restart_required = (nic->ru_running == RU_SUSPENDED);

      /* Main job: walk the ring buffer and receive the data */
      rx = nic->rx_to_clean;
      while (rx->skb) {
          int err = e100_rx_indicate(nic, rx, work_done, work_to_do);

          if (err == -EAGAIN) {
              /* Hit quota, so there is more work to do; restart once
               * cleanup is complete */
              restart_required = 0;
              break;
          }
          if (err == -ENODATA)
              break; /* No more to clean */

          rx = rx->next;
          nic->rx_to_clean = rx;
      }

      /* Save the current position as the place to restart the receiver */
      if (restart_required)
          rx_to_start = nic->rx_to_clean;

      /* Alloc new skbs to refill the list */
      rx = nic->rx_to_use;
      while (!rx->skb) {
          if (unlikely(e100_rx_alloc_skb(nic, rx)))
              break; /* Better luck next time (see watchdog) */

          rx = rx->next;
          nic->rx_to_use = rx;
      }

      if (!restart_required)
          return;

      /* ack the rnr? */
      writeb(stat_ack_rnr, &nic->csr->scb.stat_ack);
      e100_start_receiver(nic, rx_to_start);
      if (work_done)
          (*work_done)++;
  }

  CODE:

  static inline int e100_rx_indicate(struct nic *nic, struct rx *rx,

  unsigned int *work_done, unsigned int work_to_do)

  {

  struct sk_buff *skb = rx->skb;

  struct rfd *rfd = (struct rfd *)skb->data;

  u16 rfd_status, actual_size;

  if(unlikely(work_done && *work_done >= work_to_do))

  return -EAGAIN;

  /* 讀取數(shù)據(jù)之前,也就是取消DMA映射之前,需要先讀取cb_complete 狀態(tài)位,

  以確定數(shù)據(jù)是否真的準(zhǔn)備好了,并且,rfd的actual_size中,也包含了真實(shí)的數(shù)據(jù)大小

  pci_dma_sync_single_for_cpu函數(shù)前面已經(jīng)介紹過,它讓CPU在取消DMA映射之前,具備

  訪問DMA緩存的能力*/

  pci_dma_sync_single_for_cpu(nic->pdev, rx->dma_addr,

  sizeof(struct rfd), PCI_DMA_FROMDEVICE);

  rfd_status = le16_to_cpu(rfd->status);

  DPRINTK(RX_STATUS, DEBUG, "status=0x%04X\n", rfd_status);

  /* If data isn't ready, nothing to indicate */

  if(unlikely(!(rfd_status & cb_complete)))

  return -ENODATA;

  /* Get actual data size */

  actual_size = le16_to_cpu(rfd->actual_size) & 0x3FFF;

  if(unlikely(actual_size > RFD_BUF_LEN - sizeof(struct rfd)))

  actual_size = RFD_BUF_LEN - sizeof(struct rfd);

  /* 取消映射,因?yàn)橥ㄟ^DMA,網(wǎng)卡已經(jīng)把數(shù)據(jù)放在了主內(nèi)存中,這里一取消,也就意味著,

  CPU可以處理主內(nèi)存中的數(shù)據(jù)了 */

  pci_unmap_single(nic->pdev, rx->dma_addr,

  RFD_BUF_LEN, PCI_DMA_FROMDEVICE);

  /* this allows for a fast restart without re-enabling interrupts */

  if(le16_to_cpu(rfd->command) & cb_el)

  nic->ru_running = RU_SUSPENDED;

  /*正確地設(shè)置data指針,因?yàn)樽钋懊嬗幸粋€(gè)sizeof(struct rfd)大小區(qū)域,跳過它*/

  skb_reserve(skb, sizeof(struct rfd));

  /*更新skb的tail和len指針,也是就更新接收到這么多數(shù)據(jù)的長度*/

  skb_put(skb, actual_size);

  /*設(shè)置協(xié)議位*/

  skb->protocol = eth_type_trans(skb, nic->netdev);

  if(unlikely(!(rfd_status & cb_ok))) {

  /* Don't indicate if hardware indicates errors */

  nic->net_stats.rx_dropped++;

  dev_kfree_skb_any(skb);

  } else if(actual_size > nic->netdev->mtu + VLAN_ETH_HLEN) {

  /* Don't indicate oversized frames */

  nic->rx_over_length_errors++;

  nic->net_stats.rx_dropped++;

  dev_kfree_skb_any(skb);

  } else {

  /*網(wǎng)卡驅(qū)動(dòng)要做的最后一步,就是統(tǒng)計(jì)接收計(jì)數(shù)器,設(shè)置接收時(shí)間戳,然后調(diào)用netif_receive_skb,

  把數(shù)據(jù)包交給上層協(xié)議棧,自己的光榮始命也就完成了*/

  nic->net_stats.rx_packets++;

  nic->net_stats.rx_bytes += actual_size;

  nic->netdev->last_rx = jiffies;

  netif_receive_skb(skb);

  if(work_done)

  (*work_done)++;

  }

  rx->skb = NULL;

  return 0;

  }

  網(wǎng)卡驅(qū)動(dòng)執(zhí)行到這里,數(shù)據(jù)接收的工作,也就處理完成了。但是,使用這一種方法的驅(qū)動(dòng),省去了網(wǎng)絡(luò)棧中一個(gè)重要的內(nèi)容,就是

  “隊(duì)列層”,讓我們來看看,傳統(tǒng)中斷接收數(shù)據(jù)包模式下,使用netif_rx函數(shù)調(diào)用,又會(huì)發(fā)生什么。

  PS:九賤沒有去研究過所謂的“零拷貝”技術(shù),不太清楚,它同這種DMA直取方式有何不同?難道是把網(wǎng)卡中的I/O內(nèi)存直接映射到主內(nèi)存中,這樣CPU就可以像讀取主內(nèi)存一樣,讀取網(wǎng)卡的內(nèi)存,但是這要求設(shè)備要有好大的I/O內(nèi)存來做緩沖呀!!^o^,外行了……希望哪位DX提點(diǎn)!

  五、隊(duì)列層

  1、軟中斷與下半部

  當(dāng)用中斷處理的時(shí)候,為了減少中斷處理的工作量,比如,一般中斷處理時(shí),需要屏蔽其它中斷,如果中斷處理時(shí)間過長,那么其它中斷

  有可能得不到及時(shí)處理,也以,有一種機(jī)制,就是把“不必馬上處理”的工作,推遲一點(diǎn),讓它在中斷處理后的某一個(gè)時(shí)刻得到處理。這就

  是下半部。

  下半部只是一個(gè)機(jī)制,它在Linux中,有多種實(shí)現(xiàn)方式,其中一種對(duì)時(shí)間要求最嚴(yán)格的實(shí)現(xiàn)方式,叫“軟中斷”,可以使用:

  open_softirq()

  來向內(nèi)核注冊(cè)一個(gè)軟中斷,

  然后,在合適的時(shí)候,調(diào)用

  raise_softirq_irqoff()

  觸發(fā)它。

  如果采用中斷方式接收數(shù)據(jù)(這一節(jié)就是在說中斷方式接收,后面,就不用這種假設(shè)了),同樣也需要軟中斷,可以調(diào)用

  open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);

  向內核注冊一個名為NET_RX_SOFTIRQ的軟中斷,net_rx_action是軟中斷的處理函數。

  然后,在驅(qū)動(dòng)中斷處理完后的某一個(gè)時(shí)刻,調(diào)用

  raise_softirq_irqoff(NET_RX_SOFTIRQ);

  觸發(fā)它,這樣net_rx_action將得到執(zhí)行。

  2、隊(duì)列層

  什么是隊(duì)列層?通常,在網(wǎng)卡收發(fā)數(shù)據(jù)的時(shí)候,需要維護(hù)一個(gè)緩沖區(qū)隊(duì)列,來緩存可能存在的突發(fā)數(shù)據(jù),類似于前面的DMA環(huán)形緩沖區(qū)。

  隊(duì)列層中,包含了一個(gè)叫做struct softnet_data:

  CODE:

  /*
   * Receive/transmit queueing state; one instance exists per CPU (see the
   * per-CPU variable of the same name).
   */
  struct softnet_data

  {

  /* Congestion-control flag: while it is set, netif_rx() drops further incoming packets */

  int throttle;

  /* Congestion level that netif_rx() returns to its caller */

  int cng_level;

  int avg_blog;

  /* Queue of sk_buffs waiting to be passed up to the network layer;
     packets are appended to it by netif_rx() */

  struct sk_buff_head input_pkt_queue;

  struct list_head poll_list;

  struct net_device *output_queue;

  struct sk_buff *completion_queue;

  struct net_device backlog_dev; /* Sorry. 8) */

  };

  內(nèi)核使用了一個(gè)同名的變量softnet_data,它是一個(gè)Per-CPU變量,每個(gè)CPU都有一個(gè)。

  net/core/dev.c

  CODE:

  DECLARE_PER_CPU(struct softnet_data,softnet_data);

  CODE:

  /*
   * Initialises the core network-device layer: proc and sysfs entries, the
   * packet-type lists, the device name/index hash tables, every CPU's
   * softnet_data queue, and the NET_TX/NET_RX softirqs.
   *
   * Returns 0 on success, -ENOMEM if the proc or sysfs initialisation fails.
   */
  static int __init net_dev_init(void)

  {

  int i, rc = -ENOMEM;

  BUG_ON(!dev_boot_phase);

  net_random_init();

  if (dev_proc_init()) /* set up the proc filesystem entries */

  goto out;

  if (netdev_sysfs_init()) /* set up the sysfs entries */

  goto out;

  /* ptype_all and ptype_base are struct list_head variables (ptype_base
     an array of them); initialise the list heads here */

  INIT_LIST_HEAD(&ptype_all);

  for (i = 0; i < 16; i++)

  INIT_LIST_HEAD(&ptype_base[i]);

  for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)

  INIT_HLIST_HEAD(&dev_name_head[i]);

  for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)

  INIT_HLIST_HEAD(&dev_index_head[i]);

  /*
   * Initialise the packet receive queues -- the part that matters for the
   * receive path.
   */

  /* Walk every CPU slot and set up its per-CPU struct softnet_data */

  for (i = 0; i < NR_CPUS; i++) {

  struct softnet_data *queue;

  /* softnet_data of CPU i; the receive queue lives inside it */

  queue = &per_cpu(softnet_data, i);

  /* Initialise the queue head */

  skb_queue_head_init(&queue->input_pkt_queue);

  queue->throttle = 0;

  queue->cng_level = 0;

  queue->avg_blog = 10; /* arbitrary non-zero */

  queue->completion_queue = NULL;

  INIT_LIST_HEAD(&queue->poll_list);

  set_bit(__LINK_STATE_START, &queue->backlog_dev.state);

  queue->backlog_dev.weight = weight_p;

  /* backlog_dev is a pseudo network device with no physical hardware
     behind it; its poll callback is process_backlog */

  queue->backlog_dev.poll = process_backlog;

  atomic_set(&queue->backlog_dev.refcnt, 1);

  }

  #ifdef OFFLINE_SAMPLE

  samp_timer.expires = jiffies + (10 * HZ);

  add_timer(&samp_timer);

  #endif

  dev_boot_phase = 0;

  /* Register the transmit and receive softirq handlers */

  open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);

  open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);

  hotcpu_notifier(dev_cpu_callback, 0);

  dst_init();

  dev_mcast_init();

  rc = 0;

  out:

  return rc;

  }

  這樣,初始化完成后,在驅(qū)動(dòng)程序中,在中斷處理函數(shù)中,會(huì)調(diào)用netif_rx將數(shù)據(jù)交上來,這與采用輪詢技術(shù),有本質(zhì)的不同:

  CODE:

  /*
   * Queue a received packet on the current CPU's input queue for later
   * processing.
   *
   * Returns the queue's congestion level (queue->cng_level) when the packet
   * was enqueued, or NET_RX_DROP when it was dropped (taken by netpoll,
   * queue throttled, or queue over netdev_max_backlog); a packet dropped
   * for congestion is freed here.
   */
  int netif_rx(struct sk_buff *skb)

  {

  int this_cpu;

  struct softnet_data *queue;

  unsigned long flags;

  /* if netpoll wants it, pretend we never saw it */

  if (netpoll_rx(skb))

  return NET_RX_DROP;

  /* Stamp the packet if no receive timestamp has been set yet */

  if (!skb->stamp.tv_sec)

  net_timestamp(&skb->stamp);

  /*
   * The packet is about to be put on the receive queue: disable local
   * interrupts for the enqueue and restore them once it is done.
   */

  local_irq_save(flags);

  /* Get the softnet_data belonging to the current CPU */

  this_cpu = smp_processor_id();

  queue = &__get_cpu_var(softnet_data);

  /* Bump the received-packet counter */

  __get_cpu_var(netdev_rx_stat).total++;

  /* Is the queue length still within netdev_max_backlog? */

  if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {

  if (queue->input_pkt_queue.qlen) {

  if (queue->throttle) /* congestion: drop the packet */

  goto drop;

  /* Enqueue the packet (also reached via goto from the empty-queue path below) */

  enqueue:

  dev_hold(skb->dev); /* take a reference on the device */

  __skb_queue_tail(&queue->input_pkt_queue, skb); /* append the packet to the receive queue */

  #ifndef OFFLINE_SAMPLE

  get_sample_stats(this_cpu);

  #endif

  local_irq_restore(flags);

  return queue->cng_level;

  }

  /*
   * Reached only when the queue is currently empty (qlen == 0). If the
   * throttle flag is still set, clear it: backlog_dev is about to be
   * scheduled so the queued packets get handed to the upper layers, which
   * drains the queue and ends the congestion.
   */

  if (queue->throttle)

  queue->throttle = 0;

  /*
   * netif_rx_schedule() does two things:
   * 1. adds backlog_dev to this CPU's list of devices to be polled;
   * 2. per its own comment, triggers the receive softirq -- that part of
   *    __netif_rx_schedule() is cut off in the excerpt.
   */

  netif_rx_schedule(&queue->backlog_dev);

  goto enqueue;

  }

  /* The queue is over its limit: set the throttle flag if it is not set
     yet and count the throttle event */

  if (!queue->throttle) {

  queue->throttle = 1;

  __get_cpu_var(netdev_rx_stat).throttled++;

  }

  /* Congestion: count the drop and free the packet */

  drop:

  __get_cpu_var(netdev_rx_stat).dropped++;

  local_irq_restore(flags);

  kfree_skb(skb);

  return NET_RX_DROP;

  }

  從這段代碼的分析中,我們可以看到,當數據被接收后,netif_rx的工作,就是取得當前CPU的隊列,然后入隊,然后返回;然后中斷函數再調用它,它再把數據包入隊……

  當(dāng)隊(duì)列接收完成后,netif_rx就調(diào)用netif_rx_schedule進(jìn)一步處理數(shù)據(jù)包,我們注意到:

  1、前面討論過,采用輪詢技術(shù)時(shí),同樣地,也是調(diào)用netif_rx_schedule,把設(shè)備自己傳遞了過去;

  2、這里,采用中斷方式,傳遞的是隊(duì)列中的一個(gè)“偽設(shè)備”,并且,這個(gè)偽設(shè)備的poll函數(shù)指針,指向了一個(gè)叫做process_backlog的函數(shù);

  netif_rx_schedule函數(shù)完成兩件重要的工作:

  1、將backlog_dev設備加入“處理數據包的設備”的鏈表當中;

  2、觸發(fā)軟中斷函數(shù),進(jìn)行數(shù)據(jù)包接收處理;

  這樣,我們可以猜想,在軟中斷函數中,不論是偽設備backlog_dev,還是真實的設備(如前面討論過的e100),都會被軟中斷函數以:

  dev->poll()

  的形式調(diào)用,對(duì)于e100來說,poll函數(shù)的接收過程已經(jīng)分析了,而對(duì)于其它所有沒有采用輪詢技術(shù)的網(wǎng)絡(luò)設(shè)備來說,它們將統(tǒng)統(tǒng)調(diào)用

  process_backlog函數(shù)(我覺得把它改名為pseudo-poll是否更合適一些^o^)。

  OK,我想分析到這里,關(guān)于中斷處理與輪詢技術(shù)的差異,已經(jīng)基本分析開了……

  繼續(xù)來看,netif_rx_schedule進(jìn)一步調(diào)用__netif_rx_schedule:

  CODE:

  /*
   * Ask for a device to be polled; called from an interrupt handler.
   * Nothing happens unless netif_rx_schedule_prep() accepts the device.
   */
  static inline void netif_rx_schedule(struct net_device *dev)
  {
    if (!netif_rx_schedule_prep(dev))
      return;

    __netif_rx_schedule(dev);
  }

  CODE:

  /*
   * Append an interface to the tail of this CPU's rx poll list and raise
   * NET_RX_SOFTIRQ. The caller must already have called _prep and had it
   * return 1.
   */
  static inline void __netif_rx_schedule(struct net_device *dev)
  {
    unsigned long irq_state;

    local_irq_save(irq_state);

    dev_hold(dev);

    /* Pseudo device or real device alike ends up on the per-CPU poll list. */
    list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);

    /* A negative quota is topped up by one weight; otherwise it is reset. */
    if (dev->quota >= 0)
      dev->quota = dev->weight;
    else
      dev->quota += dev->weight;

    /* Raise the receive softirq while interrupts are still disabled. */
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);

    local_irq_restore(irq_state);
  }

  軟中斷被觸發(fā),注冊(cè)的net_rx_action函數(shù)將被調(diào)用:

  CODE:

  /*
   * Softirq handler for packet reception.
   *
   * Walks the current CPU's poll_list and calls each device's poll()
   * routine until the list is empty, the budget (initialised from
   * netdev_max_backlog) is used up, or more than one jiffy has elapsed.
   * In the latter two cases the time_squeeze counter is incremented and
   * NET_RX_SOFTIRQ is raised again before returning.
   */

  static void net_rx_action(struct softirq_action *h)

  {

  struct softnet_data *queue = &__get_cpu_var(softnet_data);

  unsigned long start_time = jiffies;

  int budget = netdev_max_backlog;

  local_irq_disable();

  /*

  * Walk the per-CPU device list. As described earlier, __netif_rx_schedule

  * has already appended the device to softnet_data.poll_list with

  * list_add_tail(), so backlog_dev (or a real polling device) is on it.

  */

  while (!list_empty(&queue->poll_list)) {

  struct net_device *dev;

  /* Stop when the budget is exhausted or more than one jiffy has passed. */

  if (budget <= 0 || jiffies - start_time > 1)

  goto softnet_break;

  local_irq_enable();

  /* Take the device at the head of the list. */

  dev = list_entry(queue->poll_list.next,

  struct net_device, poll_list);

  netpoll_poll_lock(dev);

  /*
   * Call the device's poll routine to process received packets. For a NIC
   * driver that uses polling this is its own poll function (e100_poll,
   * discussed in the previous section); for devices using classic
   * interrupt handling it is backlog_dev's process_backlog.
   * poll() is skipped when the device has no quota left.
   */

  if (dev->quota <= 0 || dev->poll(dev, &budget)) {

  netpoll_poll_unlock(dev);

  /*
   * The device is not finished (or had no quota): unlink it and put it
   * back at the tail of the list, then replenish its quota.
   */

  local_irq_disable();

  list_del(&dev->poll_list);

  list_add_tail(&dev->poll_list, &queue->poll_list);

  if (dev->quota < 0)

  dev->quota += dev->weight;

  else

  dev->quota = dev->weight;

  } else {

  /*
   * poll() returned 0: drop the device reference. The device is not
   * unlinked here; process_backlog removes itself from the list before
   * returning 0.
   */

  netpoll_poll_unlock(dev);

  dev_put(dev);

  local_irq_disable();

  }

  }

  /* Both exits arrive here with local interrupts disabled. */

  out:

  local_irq_enable();

  return;

  /* Out of budget or time: record it and re-raise the softirq. */

  softnet_break:

  __get_cpu_var(netdev_rx_stat).time_squeeze++;

  __raise_softirq_irqoff(NET_RX_SOFTIRQ);

  goto out;

  }

  對(duì)于dev->poll(dev, &budget)的調(diào)用,一個(gè)真實(shí)的poll函數(shù)的例子,我們已經(jīng)分析過了,現(xiàn)在來看process_backlog,

  CODE:

  /*
   * poll() routine of the per-CPU backlog pseudo device.
   *
   * Dequeues packets from this CPU's input_pkt_queue and passes each one to
   * netif_receive_skb(). The number of packets handled is subtracted from
   * both backlog_dev->quota and *budget.
   *
   * Returns 0 when the queue has been drained (the device is then removed
   * from the poll list), or -1 when it stopped early because the quota was
   * reached or more than one jiffy elapsed.
   */
  static int process_backlog(struct net_device *backlog_dev, int *budget)

  {

  int work = 0;

  /* Never handle more than the smaller of the device quota and the budget. */

  int quota = min(backlog_dev->quota, *budget);

  struct softnet_data *queue = &__get_cpu_var(softnet_data);

  unsigned long start_time = jiffies;

  backlog_dev->weight = weight_p;

  /*
   * Dequeue packets one at a time and hand them to netif_receive_skb until
   * the queue is empty, the quota is reached, or the time limit expires.
   */

  for (;;) {

  struct sk_buff *skb;

  struct net_device *dev;

  /* The dequeue runs with local interrupts disabled. */

  local_irq_disable();

  skb = __skb_dequeue(&queue->input_pkt_queue);

  /* Queue empty: leave with interrupts still disabled (see job_done). */

  if (!skb)

  goto job_done;

  local_irq_enable();

  /* Remember the device before the skb is handed off. */

  dev = skb->dev;

  netif_receive_skb(skb);

  /* Release the device reference taken by netif_rx when it queued the skb. */

  dev_put(dev);

  work++;

  if (work >= quota || jiffies - start_time > 1)

  break;

  }

  /* Stopped early: account for the work done and report "not finished". */

  backlog_dev->quota -= work;

  *budget -= work;

  return -1;

  /* Reached once every packet on the queue has been processed. */

  job_done:

  backlog_dev->quota -= work;

  *budget -= work;

  /* Still under local_irq_disable() from the loop above. */

  list_del(&backlog_dev->poll_list);

  /* NOTE(review): presumably netif_poll_enable clears a state bit, hence the barrier; confirm. */

  smp_mb__before_clear_bit();

  netif_poll_enable(backlog_dev);

  /* The queue is empty, so any congestion flag can be cleared. */

  if (queue->throttle)

  queue->throttle = 0;

  local_irq_enable();

  return 0;

  }

  這個(gè)函數(shù)重要的工作,就是出隊(duì),然后調(diào)用netif_receive_skb()將數(shù)據(jù)包交給上層,這與上一節(jié)討論的poll是一樣的。這也是為什么,

  在網(wǎng)卡驅(qū)動(dòng)的編寫中,采用中斷技術(shù),要調(diào)用netif_rx,而采用輪詢技術(shù),要調(diào)用netif_receive_skb啦!

  到了這里,就處理完數(shù)據(jù)包與設(shè)備相關(guān)的部分了,數(shù)據(jù)包將進(jìn)入上層協(xié)議棧

653624