page_alloc.c.frag.html
Additions are indicated as appropriate.
1 /* page_alloc_fragment.c -- Contains *only* the function __alloc_pages_internal()
2 * from mm/page_alloc.c, together with modifications to support AllocInfo.
3 * Copyright (C) 2009 by EQware Engineering, Inc.
4 *
5 * page_alloc_fragment.c is part of AllocInfo.
6 *
7 * AllocInfo is free software: you can redistribute it and/or modify
8 * it under the terms of version 3 of the GNU General Public License
9 * as published by the Free Software Foundation.
10 *
11 * AllocInfo is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with AllocInfo. If not, see http://www.gnu.org/licenses.
18 *********************************************************************/
19
20 /*
21 * This is the 'heart' of the zoned buddy allocator.
22 */
23 struct page *
24 __alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
25 struct zonelist *zonelist, nodemask_t *nodemask)
26 {
27 extern unsigned long long heap_alloc_count[6];
28
29 const gfp_t wait = gfp_mask & __GFP_WAIT;
30 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
31 struct zoneref *z;
32 struct zone *zone;
33 struct page *page;
34 struct reclaim_state reclaim_state;
35 struct task_struct *p = current;
36 int do_retry;
37 int alloc_flags;
38 unsigned long did_some_progress;
39 unsigned long pages_reclaimed = 0;
40
41 lockdep_trace_alloc(gfp_mask);
42
43 might_sleep_if(wait);
44
45 if (should_fail_alloc_page(gfp_mask, order))
46 return NULL;
47
48 restart:
49 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */
50
51 if (unlikely(!z->zone)) {
52 /*
53 * Happens if we have an empty zonelist as a result of
54 * GFP_THISNODE being used on a memoryless node
55 */
56 return NULL;
57 }
58
59 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
60 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
61 if (page)
62 {
63 heap_alloc_count[0]++;
64 goto got_pg;
65 }
66
67 /*
68 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
69 * __GFP_NOWARN set) should not cause reclaim since the subsystem
70 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
71 * using a larger set of nodes after it has established that the
72 * allowed per node queues are empty and that nodes are
73 * over allocated.
74 */
75 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
76 goto nopage;
77
78 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
79 wakeup_kswapd(zone, order);
80
81 /*
82 * OK, we're below the kswapd watermark and have kicked background
83 * reclaim. Now things get more complex, so set up alloc_flags according
84 * to how we want to proceed.
85 *
86 * The caller may dip into page reserves a bit more if the caller
87 * cannot run direct reclaim, or if the caller has realtime scheduling
88 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
89 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
90 */
91 alloc_flags = ALLOC_WMARK_MIN;
92 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
93 alloc_flags |= ALLOC_HARDER;
94 if (gfp_mask & __GFP_HIGH)
95 alloc_flags |= ALLOC_HIGH;
96 if (wait)
97 alloc_flags |= ALLOC_CPUSET;
98
99 /*
100 * Go through the zonelist again. Let __GFP_HIGH and allocations
101 * coming from realtime tasks go deeper into reserves.
102 *
103 * This is the last chance, in general, before the goto nopage.
104 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
105 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
106 */
107 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
108 high_zoneidx, alloc_flags);
109 if (page)
110 {
111 heap_alloc_count[1]++;
112 goto got_pg;
113 }
114
115 /* This allocation should allow future memory freeing. */
116
117 rebalance:
118 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
119 && !in_interrupt()) {
120 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
121 nofail_alloc:
122 /* go through the zonelist yet again, ignoring mins */
123 page = get_page_from_freelist(gfp_mask, nodemask, order,
124 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
125 if (page)
126 {
127 heap_alloc_count[2]++;
128 goto got_pg;
129 }
130 if (gfp_mask & __GFP_NOFAIL) {
131 congestion_wait(WRITE, HZ/50);
132 goto nofail_alloc;
133 }
134 }
135 goto nopage;
136 }
137
138 /* Atomic allocations - we can't balance anything */
139 if (!wait)
140 goto nopage;
141
142 cond_resched();
143
144 /* We now go into synchronous reclaim */
145 cpuset_memory_pressure_bump();
146 /*
147 * The task's cpuset might have expanded its set of allowable nodes
148 */
149 cpuset_update_task_memory_state();
150 p->flags |= PF_MEMALLOC;
151
152 lockdep_set_current_reclaim_state(gfp_mask);
153 reclaim_state.reclaimed_slab = 0;
154 p->reclaim_state = &reclaim_state;
155
156 did_some_progress = try_to_free_pages(zonelist, order,
157 gfp_mask, nodemask);
158
159 p->reclaim_state = NULL;
160 lockdep_clear_current_reclaim_state();
161 p->flags &= ~PF_MEMALLOC;
162
163 cond_resched();
164
165 if (order != 0)
166 drain_all_pages();
167
168 if (likely(did_some_progress)) {
169 page = get_page_from_freelist(gfp_mask, nodemask, order,
170 zonelist, high_zoneidx, alloc_flags);
171 if (page)
172 {
173 heap_alloc_count[3]++;
174 goto got_pg;
175 }
176 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
177 if (!try_set_zone_oom(zonelist, gfp_mask)) {
178 schedule_timeout_uninterruptible(1);
179 goto restart;
180 }
181
182 /*
183 * Go through the zonelist yet one more time, keep
184 * very high watermark here, this is only to catch
185 * a parallel oom killing, we must fail if we're still
186 * under heavy pressure.
187 */
188 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
189 order, zonelist, high_zoneidx,
190 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
191 if (page) {
192 clear_zonelist_oom(zonelist, gfp_mask);
193 heap_alloc_count[4]++;
194 goto got_pg;
195 }
196
197 /* The OOM killer will not help higher order allocs so fail */
198 if (order > PAGE_ALLOC_COSTLY_ORDER) {
199 clear_zonelist_oom(zonelist, gfp_mask);
200 goto nopage;
201 }
202
203 out_of_memory(zonelist, gfp_mask, order);
204 clear_zonelist_oom(zonelist, gfp_mask);
205 goto restart;
206 }
207
208 /*
209 * Don't let big-order allocations loop unless the caller explicitly
210 * requests that. Wait for some write requests to complete then retry.
211 *
212 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
213 * means __GFP_NOFAIL, but that may not be true in other
214 * implementations.
215 *
216 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
217 * specified, then we retry until we no longer reclaim any pages
218 * (above), or we've reclaimed an order of pages at least as
219 * large as the allocation's order. In both cases, if the
220 * allocation still fails, we stop retrying.
221 */
222 pages_reclaimed += did_some_progress;
223 do_retry = 0;
224 if (!(gfp_mask & __GFP_NORETRY)) {
225 if (order <= PAGE_ALLOC_COSTLY_ORDER) {
226 do_retry = 1;
227 } else {
228 if (gfp_mask & __GFP_REPEAT &&
229 pages_reclaimed < (1 << order))
230 do_retry = 1;
231 }
232 if (gfp_mask & __GFP_NOFAIL)
233 do_retry = 1;
234 }
235 if (do_retry) {
236 congestion_wait(WRITE, HZ/50);
237 goto rebalance;
238 }
239
240 nopage:
241 heap_alloc_count[5]++;
242 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
243 printk(KERN_WARNING "%s: page allocation failure."
244 " order:%d, mode:0x%x\n",
245 p->comm, order, gfp_mask);
246 dump_stack();
247 show_mem();
248 }
249 got_pg:
250 return page;
251 }