/* page_alloc_fragment.c -- Contains *only* the function __alloc_pages_internal()
 *   from mm/page_alloc.c, together with modifications to support AllocInfo.
 * Copyright (C) 2009 by EQware Engineering, Inc.
 *
 *    page_alloc_fragment.c is part of AllocInfo.
 *
 *    AllocInfo is free software: you can redistribute it and/or modify
 *    it under the terms of version 3 of the GNU General Public License
 *    as published by the Free Software Foundation
 *
 *    AllocInfo is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with AllocInfo.  If not, see http://www.gnu.org/licenses. 
 *********************************************************************/

/*
 * __alloc_pages_internal - the 'heart' of the zoned buddy allocator,
 * instrumented with AllocInfo counters.
 *
 * @gfp_mask: allocation flags.  Tested here for __GFP_WAIT, __GFP_HIGH,
 *            __GFP_NOMEMALLOC, __GFP_NOFAIL, __GFP_FS, __GFP_NORETRY,
 *            __GFP_REPEAT, __GFP_NOWARN and the GFP_THISNODE combination.
 * @order:    allocation order; the retry logic compares the number of
 *            reclaimed pages against 1 << order.
 * @zonelist: zone list handed to every get_page_from_freelist() attempt
 *            and to reclaim / OOM handling.
 * @nodemask: passed through unchanged to get_page_from_freelist() and
 *            try_to_free_pages().
 *
 * Returns the page delivered by the first successful
 * get_page_from_freelist() attempt, or NULL if every attempt failed.
 *
 * AllocInfo accounting: each call bumps at most one slot of
 * heap_alloc_count[] (defined elsewhere), identifying how the call ended:
 *   [0] first attempt (ALLOC_WMARK_LOW | ALLOC_CPUSET) succeeded
 *   [1] second attempt (ALLOC_WMARK_MIN plus optional HARDER/HIGH/CPUSET)
 *       succeeded
 *   [2] ALLOC_NO_WATERMARKS attempt (PF_MEMALLOC / TIF_MEMDIE caller)
 *       succeeded
 *   [3] attempt made after try_to_free_pages() reported progress succeeded
 *   [4] ALLOC_WMARK_HIGH attempt made just before invoking the OOM killer
 *       succeeded
 *   [5] the allocation failed and reached the nopage label
 * The two early "return NULL" exits (fault injection, empty zonelist) do
 * not touch any counter.
 *
 * NOTE(review): the counters are updated with a plain ++ and no locking or
 * atomic operation is visible in this function -- confirm that lost updates
 * under concurrency are acceptable for AllocInfo's purposes.
 */
struct page *
__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
                        struct zonelist *zonelist, nodemask_t *nodemask)
{
        /* AllocInfo per-outcome counters; the array is defined elsewhere. */
        extern unsigned long long heap_alloc_count[6];

        const gfp_t wait = gfp_mask & __GFP_WAIT;
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        struct zoneref *z;
        struct zone *zone;
        struct page *page;
        struct reclaim_state reclaim_state;
        struct task_struct *p = current;
        int do_retry;
        int alloc_flags;
        unsigned long did_some_progress;
        unsigned long pages_reclaimed = 0;

        lockdep_trace_alloc(gfp_mask);

        might_sleep_if(wait);

        if (should_fail_alloc_page(gfp_mask, order))
                return NULL;

restart:
        z = zonelist->_zonerefs;  /* the list of zones suitable for gfp_mask */

        if (unlikely(!z->zone)) {
                /*
                 * Happens if we have an empty zonelist as a result of
                 * GFP_THISNODE being used on a memoryless node
                 */
                return NULL;
        }

        /* First attempt: low watermark, cpuset hardwall enforced. */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                        zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
        if (page) {
                heap_alloc_count[0]++;
                goto got_pg;
        }

        /*
         * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
         * __GFP_NOWARN set) should not cause reclaim since the subsystem
         * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
         * using a larger set of nodes after it has established that the
         * allowed per node queues are empty and that nodes are
         * over allocated.
         */
        if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
                goto nopage;

        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
                wakeup_kswapd(zone, order);

        /*
         * OK, we're below the kswapd watermark and have kicked background
         * reclaim. Now things get more complex, so set up alloc_flags according
         * to how we want to proceed.
         *
         * The caller may dip into page reserves a bit more if the caller
         * cannot run direct reclaim, or if the caller has realtime scheduling
         * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
         * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
         */
        alloc_flags = ALLOC_WMARK_MIN;
        if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
                alloc_flags |= ALLOC_HARDER;
        if (gfp_mask & __GFP_HIGH)
                alloc_flags |= ALLOC_HIGH;
        if (wait)
                alloc_flags |= ALLOC_CPUSET;

        /*
         * Go through the zonelist again. Let __GFP_HIGH and allocations
         * coming from realtime tasks go deeper into reserves.
         *
         * This is the last chance, in general, before the goto nopage.
         * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
         * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
         */
        page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
                                                high_zoneidx, alloc_flags);
        if (page) {
                heap_alloc_count[1]++;
                goto got_pg;
        }

        /* This allocation should allow future memory freeing. */

rebalance:
        if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
                        && !in_interrupt()) {
                if (!(gfp_mask & __GFP_NOMEMALLOC)) {
nofail_alloc:
                        /* go through the zonelist yet again, ignoring mins */
                        page = get_page_from_freelist(gfp_mask, nodemask, order,
                                zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
                        if (page) {
                                heap_alloc_count[2]++;
                                goto got_pg;
                        }
                        /* __GFP_NOFAIL: wait briefly and retry without limit. */
                        if (gfp_mask & __GFP_NOFAIL) {
                                congestion_wait(WRITE, HZ/50);
                                goto nofail_alloc;
                        }
                }
                goto nopage;
        }

        /* Atomic allocations - we can't balance anything */
        if (!wait)
                goto nopage;

        cond_resched();

        /* We now go into synchronous reclaim */
        cpuset_memory_pressure_bump();
        /*
         * The task's cpuset might have expanded its set of allowable nodes
         */
        cpuset_update_task_memory_state();

        /*
         * PF_MEMALLOC and p->reclaim_state are set only for the duration of
         * try_to_free_pages() and restored immediately afterwards; a nested
         * entry into this function while the flag is set would take the
         * PF_MEMALLOC branch at the rebalance label above.
         */
        p->flags |= PF_MEMALLOC;

        lockdep_set_current_reclaim_state(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;

        did_some_progress = try_to_free_pages(zonelist, order,
                                                gfp_mask, nodemask);

        p->reclaim_state = NULL;
        lockdep_clear_current_reclaim_state();
        p->flags &= ~PF_MEMALLOC;

        cond_resched();

        if (order != 0)
                drain_all_pages();

        if (likely(did_some_progress)) {
                /* Reclaim reported progress: retry with the same alloc_flags. */
                page = get_page_from_freelist(gfp_mask, nodemask, order,
                                        zonelist, high_zoneidx, alloc_flags);
                if (page) {
                        heap_alloc_count[3]++;
                        goto got_pg;
                }
        } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
                /*
                 * No progress.  If the zonelist OOM marker cannot be taken,
                 * sleep one tick and start over from the top.
                 */
                if (!try_set_zone_oom(zonelist, gfp_mask)) {
                        schedule_timeout_uninterruptible(1);
                        goto restart;
                }

                /*
                 * Go through the zonelist yet one more time, keep
                 * very high watermark here, this is only to catch
                 * a parallel oom killing, we must fail if we're still
                 * under heavy pressure.
                 */
                page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
                        order, zonelist, high_zoneidx,
                        ALLOC_WMARK_HIGH|ALLOC_CPUSET);
                if (page) {
                        clear_zonelist_oom(zonelist, gfp_mask);
                        heap_alloc_count[4]++;
                        goto got_pg;
                }

                /* The OOM killer will not help higher order allocs so fail */
                if (order > PAGE_ALLOC_COSTLY_ORDER) {
                        clear_zonelist_oom(zonelist, gfp_mask);
                        goto nopage;
                }

                out_of_memory(zonelist, gfp_mask, order);
                clear_zonelist_oom(zonelist, gfp_mask);
                goto restart;
        }

        /*
         * Don't let big-order allocations loop unless the caller explicitly
         * requests that.  Wait for some write requests to complete then retry.
         *
         * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
         * means __GFP_NOFAIL, but that may not be true in other
         * implementations.
         *
         * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
         * specified, then we retry until we no longer reclaim any pages
         * (above), or we've reclaimed an order of pages at least as
         * large as the allocation's order. In both cases, if the
         * allocation still fails, we stop retrying.
         */
        pages_reclaimed += did_some_progress;
        do_retry = 0;
        if (!(gfp_mask & __GFP_NORETRY)) {
                if (order <= PAGE_ALLOC_COSTLY_ORDER) {
                        do_retry = 1;
                } else {
                        /*
                         * 1UL keeps the shift and the comparison in
                         * unsigned long, matching pages_reclaimed.
                         */
                        if ((gfp_mask & __GFP_REPEAT) &&
                                pages_reclaimed < (1UL << order))
                                        do_retry = 1;
                }
                if (gfp_mask & __GFP_NOFAIL)
                        do_retry = 1;
        }
        if (do_retry) {
                congestion_wait(WRITE, HZ/50);
                goto rebalance;
        }

nopage:
        /* Every path that jumps here arrives with page == NULL. */
        heap_alloc_count[5]++;
        if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
                /* order is unsigned int, hence %u. */
                printk(KERN_WARNING "%s: page allocation failure."
                        " order:%u, mode:0x%x\n",
                        p->comm, order, gfp_mask);
                dump_stack();
                show_mem();
        }
got_pg:
        return page;
}



