page_alloc.c.frag.html

Additions are indicated as appropriate.


    1  /* page_alloc_fragment.c -- Contains *only* the function __alloc_pages_internal()
    2   *   from mm/page_alloc.c, together with modifications to support AllocInfo.
    3   * Copyright (C) 2009 by EQware Engineering, Inc.
    4   *
    5   *    page_alloc_fragment.c is part of AllocInfo.
    6   *
    7   *    AllocInfo is free software: you can redistribute it and/or modify
    8   *    it under the terms of version 3 of the GNU General Public License
    9   *    as published by the Free Software Foundation.
   10   *
   11   *    AllocInfo is distributed in the hope that it will be useful,
   12   *    but WITHOUT ANY WARRANTY; without even the implied warranty of
   13   *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14   *    GNU General Public License for more details.
   15   *
   16   *    You should have received a copy of the GNU General Public License
   17   *    along with AllocInfo.  If not, see http://www.gnu.org/licenses. 
   18   *********************************************************************/
   19  
   20  /*
   21   * This is the 'heart' of the zoned buddy allocator.
        *
        * Makes a series of get_page_from_freelist() attempts with different
        * ALLOC_* flags, waking kswapd, running synchronous reclaim and, as a
        * last resort, calling out_of_memory() in between.
        *
        * gfp_mask: allocation flags; individual __GFP_* bits steer the
        *           decisions below (waiting, retrying, warning, ...).
        * order:    allocation order, forwarded to every allocation attempt and
        *           compared against PAGE_ALLOC_COSTLY_ORDER for retry policy.
        * zonelist: the zones to try; if its first zoneref has no zone the
        *           function returns NULL at once.
        * nodemask: forwarded unchanged to get_page_from_freelist() and
        *           try_to_free_pages().
        *
        * Returns the page obtained from get_page_from_freelist(), or NULL when
        * every attempt failed.
        *
        * AllocInfo modification: each successful attempt increments one slot
        * of heap_alloc_count[] (slots 0..4, in the order the attempts appear
        * below) and every arrival at the nopage label increments slot 5.  The
        * two early "return NULL" exits ahead of the first attempt are not
        * counted in any slot.
        * NOTE(review): the counters are updated with a plain ++ and no locking
        * is visible in this function -- confirm that is acceptable for
        * concurrent callers.
   22   */
   23  struct page *
   24  __alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
   25                          struct zonelist *zonelist, nodemask_t *nodemask)
   26  {
   27          extern unsigned long long heap_alloc_count[6];  /* AllocInfo: per-path counters; only declared here */
   28  
   29          const gfp_t wait = gfp_mask & __GFP_WAIT;
   30          enum zone_type high_zoneidx = gfp_zone(gfp_mask);
   31          struct zoneref *z;
   32          struct zone *zone;
   33          struct page *page;
   34          struct reclaim_state reclaim_state;
   35          struct task_struct *p = current;
   36          int do_retry;
   37          int alloc_flags;
   38          unsigned long did_some_progress;
   39          unsigned long pages_reclaimed = 0;
   40  
   41          lockdep_trace_alloc(gfp_mask);
   42  
   43          might_sleep_if(wait);
   44  
   45          if (should_fail_alloc_page(gfp_mask, order))
   46                  return NULL;  /* not counted in heap_alloc_count[] */
   47  
   48  restart:
   49          z = zonelist->_zonerefs;  /* the list of zones suitable for gfp_mask */
   50  
   51          if (unlikely(!z->zone)) {
   52                  /*
   53                   * Happens if we have an empty zonelist as a result of
   54                   * GFP_THISNODE being used on a memoryless node
   55                   */
   56                  return NULL;  /* not counted in heap_alloc_count[] */
   57          }
   58  
   59          page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
   60                          zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
   61          if (page)
   62          {
   63                  heap_alloc_count[0]++;  /* AllocInfo: first attempt (ALLOC_WMARK_LOW) succeeded */
   64                  goto got_pg;
   65          }
   66  
   67          /*
   68           * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
   69           * __GFP_NOWARN set) should not cause reclaim since the subsystem
   70           * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
   71           * using a larger set of nodes after it has established that the
   72           * allowed per node queues are empty and that nodes are
   73           * over allocated.
   74           */
   75          if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
   76                  goto nopage;
   77  
   78          for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
   79                  wakeup_kswapd(zone, order);
   80  
   81          /*
   82           * OK, we're below the kswapd watermark and have kicked background
   83           * reclaim. Now things get more complex, so set up alloc_flags according
   84           * to how we want to proceed.
   85           *
   86           * The caller may dip into page reserves a bit more if the caller
   87           * cannot run direct reclaim, or if the caller has realtime scheduling
   88           * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
   89           * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
   90           */
   91          alloc_flags = ALLOC_WMARK_MIN;
   92          if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
   93                  alloc_flags |= ALLOC_HARDER;
   94          if (gfp_mask & __GFP_HIGH)
   95                  alloc_flags |= ALLOC_HIGH;
   96          if (wait)
   97                  alloc_flags |= ALLOC_CPUSET;
   98  
   99          /*
  100           * Go through the zonelist again. Let __GFP_HIGH and allocations
  101           * coming from realtime tasks go deeper into reserves.
  102           *
  103           * This is the last chance, in general, before the goto nopage.
  104           * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
  105           * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
  106           */
  107          page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
  108                                                  high_zoneidx, alloc_flags);
  109          if (page)
  110          {
  111                  heap_alloc_count[1]++;  /* AllocInfo: second attempt (ALLOC_WMARK_MIN based flags) succeeded */
  112                  goto got_pg;
  113          }
  114  
  115          /* This allocation should allow future memory freeing. */
  116  
  117  rebalance:
  118          if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
  119                          && !in_interrupt()) {
  120                  if (!(gfp_mask & __GFP_NOMEMALLOC)) {
  121  nofail_alloc:
  122                          /* go through the zonelist yet again, ignoring mins */
  123                          page = get_page_from_freelist(gfp_mask, nodemask, order,
  124                                  zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
  125                          if (page)
  126                          {
  127                                  heap_alloc_count[2]++;  /* AllocInfo: ALLOC_NO_WATERMARKS attempt succeeded */
  128                                  goto got_pg;
  129                          }
  130                          if (gfp_mask & __GFP_NOFAIL) {
  131                                  congestion_wait(WRITE, HZ/50);
  132                                  goto nofail_alloc;  /* __GFP_NOFAIL: wait and retry this attempt indefinitely */
  133                          }
  134                  }
  135                  goto nopage;
  136          }
  137  
  138          /* Atomic allocations - we can't balance anything */
  139          if (!wait)
  140                  goto nopage;
  141  
  142          cond_resched();
  143  
  144          /* We now go into synchronous reclaim */
  145          cpuset_memory_pressure_bump();
  146          /*
  147           * The task's cpuset might have expanded its set of allowable nodes
  148           */
  149          cpuset_update_task_memory_state();
  150          p->flags |= PF_MEMALLOC;  /* set only around try_to_free_pages(); cleared again below */
  151  
  152          lockdep_set_current_reclaim_state(gfp_mask);
  153          reclaim_state.reclaimed_slab = 0;
  154          p->reclaim_state = &reclaim_state;
  155  
  156          did_some_progress = try_to_free_pages(zonelist, order,
  157                                                  gfp_mask, nodemask);
  158  
  159          p->reclaim_state = NULL;
  160          lockdep_clear_current_reclaim_state();
  161          p->flags &= ~PF_MEMALLOC;
  162  
  163          cond_resched();
  164  
  165          if (order != 0)
  166                  drain_all_pages();  /* NOTE(review): presumably helps higher-order requests after reclaim -- confirm */
  167  
  168          if (likely(did_some_progress)) {
  169                  page = get_page_from_freelist(gfp_mask, nodemask, order,
  170                                          zonelist, high_zoneidx, alloc_flags);
  171                  if (page)
  172                  {
  173                          heap_alloc_count[3]++;  /* AllocInfo: attempt after try_to_free_pages() made progress succeeded */
  174                          goto got_pg;
  175                  }
  176          } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
  177                  if (!try_set_zone_oom(zonelist, gfp_mask)) {
  178                          schedule_timeout_uninterruptible(1);
  179                          goto restart;
  180                  }
  181  
  182                  /*
  183                   * Go through the zonelist yet one more time, keep
  184                   * very high watermark here, this is only to catch
  185                   * a parallel oom killing, we must fail if we're still
  186                   * under heavy pressure.
  187                   */
  188                  page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
  189                          order, zonelist, high_zoneidx,
  190                          ALLOC_WMARK_HIGH|ALLOC_CPUSET);
  191                  if (page) {
  192                          clear_zonelist_oom(zonelist, gfp_mask);
  193                          heap_alloc_count[4]++;  /* AllocInfo: ALLOC_WMARK_HIGH attempt on the OOM path succeeded */
  194                          goto got_pg;
  195                  }
  196  
  197                  /* The OOM killer will not help higher order allocs so fail */
  198                  if (order > PAGE_ALLOC_COSTLY_ORDER) {
  199                          clear_zonelist_oom(zonelist, gfp_mask);
  200                          goto nopage;
  201                  }
  202  
  203                  out_of_memory(zonelist, gfp_mask, order);
  204                  clear_zonelist_oom(zonelist, gfp_mask);
  205                  goto restart;
  206          }
  207  
  208          /*
  209           * Don't let big-order allocations loop unless the caller explicitly
  210           * requests that.  Wait for some write requests to complete then retry.
  211           *
  212           * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
  213           * means __GFP_NOFAIL, but that may not be true in other
  214           * implementations.
  215           *
  216           * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
  217           * specified, then we retry until we no longer reclaim any pages
  218           * (above), or we've reclaimed an order of pages at least as
  219           * large as the allocation's order. In both cases, if the
  220           * allocation still fails, we stop retrying.
  221           */
  222          pages_reclaimed += did_some_progress;  /* running total across rebalance passes */
  223          do_retry = 0;
  224          if (!(gfp_mask & __GFP_NORETRY)) {
  225                  if (order <= PAGE_ALLOC_COSTLY_ORDER) {
  226                          do_retry = 1;
  227                  } else {
  228                          if (gfp_mask & __GFP_REPEAT &&
  229                                  pages_reclaimed < (1 << order))
  230                                          do_retry = 1;
  231                  }
  232                  if (gfp_mask & __GFP_NOFAIL)
  233                          do_retry = 1;
  234          }
  235          if (do_retry) {
  236                  congestion_wait(WRITE, HZ/50);
  237                  goto rebalance;
  238          }
  239  
  240  nopage:
  241          heap_alloc_count[5]++;  /* AllocInfo: allocation failed; page is NULL on every path that reaches here */
  242          if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
  243                  printk(KERN_WARNING "%s: page allocation failure."
  244                          " order:%d, mode:0x%x\n",
  245                          p->comm, order, gfp_mask);
  246                  dump_stack();
  247                  show_mem();
  248          }
  249  got_pg:
  250          return page;  /* reached from nopage by fall-through as well as from the got_pg jumps */
  251  }