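/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 */
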
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <asm/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
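
/*
 * Tracks how many cpusets are currently defined in the system.
 * When there is only one cpuset (the top cpuset) we can short
 * circuit some hooks.
 */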
int number_of_cpusets __read_mostly;

/* Forward declare cgroup structures */
struct cgroup_subsys cpuset_subsys;
struct cpuset;

/* See "Frequency meter" comments, below. */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */
	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */

	struct cpuset *parent;		/* my parent */

	/*
	 * Copy of global cpuset_mems_generation as of the most
	 * recent time this cpuset changed its mems_allowed.
	 */
	int mems_generation;

	struct fmeter fmeter;		/* memory_pressure filter */

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* used for walking a cpuset hierarchy */
	struct list_head stack_list;
};

/* Retrieve the cpuset for a cgroup */
static inline struct cpuset *cgroup_cs(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
			    struct cpuset, css);
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return container_of(task_subsys_state(task, cpuset_subsys_id),
			    struct cpuset, css);
}

struct cpuset_hotplug_scanner {
	struct cgroup_scanner scan;
	struct cgroup *to;
};

/* bits in struct cpuset flags field */
typedef enum {
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
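
/*
 * Increment this integer everytime any cpuset changes its
 * mems_allowed value.  Users of cpusets can track this generation
 * number, and avoid having to lock and reload mems_allowed unless
 * the cpuset they're using changes generation number, so that on
 * the next read of its mems_allowed they know to reload it.
 *
 * Since writes to cpuset_mems_generation are guarded by the cgroup
 * lock there is no need to mark it atomic.
 */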
static int cpuset_mems_generation;

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
	.cpus_allowed = CPU_MASK_ALL,
	.mems_allowed = NODE_MASK_ALL,
};
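
/*
 * There are two global mutexes guarding cpuset structures.  The first
 * is the main control groups cgroup_mutex, accessed via
 * cgroup_lock()/cgroup_unlock().  The second is the cpuset-specific
 * callback_mutex, below.  They can nest.
 *
 * A task must hold both mutexes to modify cpusets.  Holding cgroup_mutex
 * blocks other would-be modifiers, so a modifier can perform checks on
 * the cpuset structure first, knowing nothing will change, and can
 * allocate memory while holding only cgroup_mutex.  Meanwhile, callback
 * routines can briefly acquire callback_mutex to query cpusets.  Once a
 * modifier is ready to make the changes, it takes callback_mutex,
 * blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_mutex, as that would risk double tripping on callback_mutex
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 */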
static DEFINE_MUTEX(callback_mutex);

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
static int cpuset_get_sb(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data, struct vfsmount *mnt)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	int ret = -ENODEV;

	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->get_sb(cgroup_fs, flags,
					unused_dev_name, mountopts, mnt);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.get_sb = cpuset_get_sb,
};
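
/*
 * Return in *pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.  If we get
 * all the way to the top and still haven't found any online cpus,
 * return cpu_online_map.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_map.
 *
 * Call with callback_mutex held.
 */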
static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
{
	while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
		cs = cs->parent;
	if (cs)
		cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
	else
		*pmask = cpu_online_map;
	BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
}
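
/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  If we get all the way to the top and still haven't
 * found any online mems, return node_states[N_HIGH_MEMORY].
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_HIGH_MEMORY].
 *
 * Call with callback_mutex held.
 */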
static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
	while (cs && !nodes_intersects(cs->mems_allowed,
				       node_states[N_HIGH_MEMORY]))
		cs = cs->parent;
	if (cs)
		nodes_and(*pmask, cs->mems_allowed,
			  node_states[N_HIGH_MEMORY]);
	else
		*pmask = node_states[N_HIGH_MEMORY];
	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}
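
/**
 * cpuset_update_task_memory_state - update task memory placement
 *
 * If the current task's memory placement, or its cpuset's
 * mems_allowed, has changed, update the task's mems_allowed and
 * memory spread flags, and rebind its mempolicy to the new value.
 *
 * Reading current's cpuset->mems_generation doesn't need task_lock
 * to guard the current task's cpuset pointer: an RCU read section
 * suffices, since attaching a task to a different cpuset waits for
 * an RCU grace period before the old cpuset can be freed.  The
 * top_cpuset special case avoids even that, as it is never freed.
 */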
void cpuset_update_task_memory_state(void)
{
	int my_cpusets_mem_gen;
	struct task_struct *tsk = current;
	struct cpuset *cs;

	if (task_cs(tsk) == &top_cpuset) {
		/* Don't need rcu for top_cpuset.  It's never freed. */
		my_cpusets_mem_gen = top_cpuset.mems_generation;
	} else {
		rcu_read_lock();
		my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
		rcu_read_unlock();
	}

	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
		mutex_lock(&callback_mutex);
		task_lock(tsk);
		cs = task_cs(tsk);	/* Maybe changed when task not locked */
		guarantee_online_mems(cs, &tsk->mems_allowed);
		tsk->cpuset_mems_generation = cs->mems_generation;
		if (is_spread_page(cs))
			tsk->flags |= PF_SPREAD_PAGE;
		else
			tsk->flags &= ~PF_SPREAD_PAGE;
		if (is_spread_slab(cs))
			tsk->flags |= PF_SPREAD_SLAB;
		else
			tsk->flags &= ~PF_SPREAD_SLAB;
		task_unlock(tsk);
		mutex_unlock(&callback_mutex);
		mpol_rebind_task(tsk, &tsk->mems_allowed);
	}
}
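
/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cgroup_mutex.
 */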
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
	       nodes_subset(p->mems_allowed, q->mems_allowed) &&
	       is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
	       is_mem_exclusive(p) <= is_mem_exclusive(q);
}
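
/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cgroup_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */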
static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
	struct cgroup *cont;
	struct cpuset *c, *par;

	/* Each of our child cpusets must be a subset of us */
	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
		if (!is_cpuset_subset(cgroup_cs(cont), trial))
			return -EBUSY;
	}

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		return 0;

	par = cur->parent;

	/* We must be a subset of our parent cpuset */
	if (!is_cpuset_subset(trial, par))
		return -EACCES;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
		c = cgroup_cs(cont);
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
			return -EINVAL;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			return -EINVAL;
	}

	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
	if (cgroup_task_count(cur->css.cgroup)) {
		if (cpus_empty(trial->cpus_allowed) ||
		    nodes_empty(trial->mems_allowed)) {
			return -ENOSPC;
		}
	}

	return 0;
}
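
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */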
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
}

static void
update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
{
	LIST_HEAD(q);

	list_add(&c->stack_list, &q);
	while (!list_empty(&q)) {
		struct cpuset *cp;
		struct cgroup *cont;
		struct cpuset *child;

		cp = list_first_entry(&q, struct cpuset, stack_list);
		list_del(q.next);

		if (cpus_empty(cp->cpus_allowed))
			continue;

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);

		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
	}
}
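
/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the systems CPUs (the
 * set of non-overlapping cpumask_t's in the array doms), one sched
 * domain for each cpuset that has its sched_load_balance flag set.
 * Load-balanced cpusets whose cpus_allowed overlap are merged into
 * a single domain; the 'pn' field of each cpuset holds its partition
 * number while the merge runs.
 *
 * Must be called with cgroup_lock held.
 *
 * The output of this function needs to be passed to the
 * partition_sched_domains() routine in kernel/sched.c, which will
 * rebuild the scheduler's load balancing domains (sched domains)
 * as specified by this call.
 */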
static int generate_sched_domains(cpumask_t **domains,
			struct sched_domain_attr **attributes)
{
	LIST_HEAD(q);		/* queue of cpusets to be scanned */
	struct cpuset *cp;	/* scans q */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] cpumask_t slot */

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (is_sched_load_balance(&top_cpuset)) {
		doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		*doms = top_cpuset.cpus_allowed;

		ndoms = 1;
		goto done;
	}

	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	list_add(&top_cpuset.stack_list, &q);
	while (!list_empty(&q)) {
		struct cgroup *cont;
		struct cpuset *child;	/* scans child cpusets of cp */

		cp = list_first_entry(&q, struct cpuset, stack_list);
		list_del(q.next);

		if (cpus_empty(cp->cpus_allowed))
			continue;

		/*
		 * All child cpusets contain a subset of the parent's cpus,
		 * so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 */
		if (is_sched_load_balance(cp)) {
			csa[csn++] = cp;
			continue;
		}

		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
	}

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		cpumask_t *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms + nslot;

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				printk(KERN_WARNING
				 "rebuild_sched_domains confused:"
				  " nslot %d, ndoms %d, csn %d, i %d,"
				  " apn %d\n",
				  nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpus_clear(*dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpus_or(*dp, *dp, b->cpus_allowed);
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}
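
/*
 * Rebuild scheduler domains.
 *
 * Call with neither cgroup_mutex held nor within get_online_cpus().
 * Takes both and releases them before returning: get_online_cpus()
 * first, then cgroup_lock() to generate the domain masks, and
 * finally partition_sched_domains() outside the cgroup lock.
 */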
static void do_rebuild_sched_domains(struct work_struct *unused)
{
	struct sched_domain_attr *attr;
	cpumask_t *doms;
	int ndoms;

	get_online_cpus();

	/* Generate domain masks and attrs */
	cgroup_lock();
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);

	put_online_cpus();
}

static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
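
/*
 * Rebuild scheduler domains, asynchronously via workqueue.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * The rebuild must nest cgroup_lock() inside get_online_cpus(),
 * but cpuset changes such as these arrive holding cgroup_lock()
 * already, i.e. with the locks in the opposite order.  So we defer
 * the actual rebuild to a workqueue, which can take the locks in
 * the required order.
 */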
static void async_rebuild_sched_domains(void)
{
	schedule_work(&rebuild_sched_domains_work);
}
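
/*
 * Synchronous variant of async_rebuild_sched_domains().
 *
 * Must be called from a context holding neither cgroup_mutex nor
 * get_online_cpus(), since do_rebuild_sched_domains() takes both.
 */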
void rebuild_sched_domains(void)
{
	do_rebuild_sched_domains(NULL);
}
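
/**
 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
 * @tsk: task to test
 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup whose
 * cpus_allowed mask needs to be changed.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 */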
static int cpuset_test_cpumask(struct task_struct *tsk,
			       struct cgroup_scanner *scan)
{
	return !cpus_equal(tsk->cpus_allowed,
			   (cgroup_cs(scan->cg))->cpus_allowed);
}
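
/**
 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
 * @tsk: task to change
 * @scan: struct cgroup_scanner containing the cgroup of the task
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup whose
 * cpus_allowed mask needs to be changed.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 */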
static void cpuset_change_cpumask(struct task_struct *tsk,
				  struct cgroup_scanner *scan)
{
	set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
}
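
/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 *
 * No return value.  It's guaranteed that cgroup_scan_tasks() always
 * returns 0 if @heap != NULL.
 */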
static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	scan.cg = cs->css.cgroup;
	scan.test_task = cpuset_test_cpumask;
	scan.process_task = cpuset_change_cpumask;
	scan.heap = heap;
	cgroup_scan_tasks(&scan);
}
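
/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @buf: buffer of cpu numbers written to this cpuset
 */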
static int update_cpumask(struct cpuset *cs, const char *buf)
{
	struct ptr_heap heap;
	struct cpuset trialcs;
	int retval;
	int is_load_balanced;

	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	trialcs = *cs;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpus_clear(trialcs.cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs.cpus_allowed);
		if (retval < 0)
			return retval;

		if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map))
			return -EINVAL;
	}
	retval = validate_change(cs, &trialcs);
	if (retval < 0)
		return retval;

	/* Nothing to do if the cpus didn't change */
	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
		return 0;

	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval)
		return retval;

	is_load_balanced = is_sched_load_balance(&trialcs);

	mutex_lock(&callback_mutex);
	cs->cpus_allowed = trialcs.cpus_allowed;
	mutex_unlock(&callback_mutex);

	/*
	 * Scan tasks in the cpuset, and update the cpumasks of any
	 * that need an update.
	 */
	update_tasks_cpumask(cs, &heap);

	heap_free(&heap);

	if (is_load_balanced)
		async_rebuild_sched_domains();
	return 0;
}
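
/*
 * cpuset_migrate_mm
 *
 *    Migrate memory region from one set of nodes to another.
 *
 *    Temporarily set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *
 *    Call holding cgroup_mutex, so current's cpuset won't change
 *    during this call, as cgroup_mutex holds off any cpuset_attach()
 *    calls.  Therefore we don't need to take task_lock around the
 *    call to guarantee_online_mems(), as we know no one is changing
 *    our task's cpuset.
 *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
 *    migrating memory region.
 */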
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
			      const nodemask_t *to)
{
	struct task_struct *tsk = current;

	cpuset_update_task_memory_state();

	mutex_lock(&callback_mutex);
	tsk->mems_allowed = *to;
	mutex_unlock(&callback_mutex);

	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	mutex_lock(&callback_mutex);
	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
	mutex_unlock(&callback_mutex);
}

static void *cpuset_being_rebound;
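
/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 * @oldmem: old mems_allowed of cpuset cs
 *
 * Called with cgroup_mutex held
 * Return 0 if successful, -errno if not.
 */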
static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
{
	struct task_struct *p;
	struct mm_struct **mmarray;
	int i, n, ntasks;
	int migrate;
	int fudge;
	struct cgroup_iter it;
	int retval;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	fudge = 10;				/* spare mmarray[] slots */
	fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
	retval = -ENOMEM;

	/*
	 * Allocate mmarray[] to hold mm reference for each task
	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
	 * few more lines of code, we can retry until we get a big
	 * enough mmarray[] w/o using GFP_ATOMIC.
	 */
	while (1) {
		ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
		ntasks += fudge;
		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
		if (!mmarray)
			goto done;
		read_lock(&tasklist_lock);		/* block fork */
		if (cgroup_task_count(cs->css.cgroup) <= ntasks)
			break;				/* got enough */
		read_unlock(&tasklist_lock);		/* try again */
		kfree(mmarray);
	}

	n = 0;

	/* Load up mmarray[] with mm reference for each task in cpuset. */
	cgroup_iter_start(cs->css.cgroup, &it);
	while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
		struct mm_struct *mm;

		if (n >= ntasks) {
			printk(KERN_WARNING
				"Cpuset mempolicy rebind incomplete.\n");
			break;
		}
		mm = get_task_mm(p);
		if (!mm)
			continue;
		mmarray[n++] = mm;
	}
	cgroup_iter_end(cs->css.cgroup, &it);
	read_unlock(&tasklist_lock);

	/*
	 * Now that we've dropped the tasklist spinlock, we can
	 * rebind the vma mempolicies of each mm in mmarray[] to their
	 * new cpuset, and release that mm.  The mpol_rebind_mm()
	 * call takes mmap_sem, which we couldn't take while holding
	 * tasklist_lock.  Forks can happen again now - the mpol_dup()
	 * cpuset_being_rebound check will catch such forks, and rebind
	 * their vma mempolicies too.  Because we still hold the global
	 * cgroup_mutex, we know that no other rebind effort will
	 * be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
	migrate = is_memory_migrate(cs);
	for (i = 0; i < n; i++) {
		struct mm_struct *mm = mmarray[i];

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
		mmput(mm);
	}

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	kfree(mmarray);
	cpuset_being_rebound = NULL;
	retval = 0;
done:
	return retval;
}
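
/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpusets mems_allowed and mems_generation, and for each
 * task in the cpuset, rebind any vma mempolicies and if
 * the cpuset is marked 'memory_migrate', migrate the tasks
 * pages to the new memory.
 *
 * Call with cgroup_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 */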
static int update_nodemask(struct cpuset *cs, const char *buf)
{
	struct cpuset trialcs;
	nodemask_t oldmem;
	int retval;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset)
		return -EACCES;

	trialcs = *cs;

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs.mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs.mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs.mems_allowed,
				  node_states[N_HIGH_MEMORY]))
			return -EINVAL;
	}
	oldmem = cs->mems_allowed;
	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, &trialcs);
	if (retval < 0)
		goto done;

	mutex_lock(&callback_mutex);
	cs->mems_allowed = trialcs.mems_allowed;
	cs->mems_generation = cpuset_mems_generation++;
	mutex_unlock(&callback_mutex);

	retval = update_tasks_nodemask(cs, &oldmem);
done:
	return retval;
}

int current_cpuset_is_being_rebound(void)
{
	return task_cs(current) == cpuset_being_rebound;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
	if (val < -1 || val >= SD_LV_MAX)
		return -EINVAL;

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
			async_rebuild_sched_domains();
	}

	return 0;
}
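
/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cgroup_mutex held.
 */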
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset trialcs;
	int err;
	int balance_flag_changed;

	trialcs = *cs;
	if (turning_on)
		set_bit(bit, &trialcs.flags);
	else
		clear_bit(bit, &trialcs.flags);

	err = validate_change(cs, &trialcs);
	if (err < 0)
		return err;

	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(&trialcs));

	mutex_lock(&callback_mutex);
	cs->flags = trialcs.flags;
	mutex_unlock(&callback_mutex);

	if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed)
		async_rebuild_sched_domains();

	return 0;
}
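
/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There is one of these in each cpuset,
 * counting memory pressure events: calls into __alloc_pages() that
 * had to enter direct reclaim (see __cpuset_memory_pressure_bump(),
 * below).
 *
 * fmeter_markevent() counts events; fmeter_getrate() returns the
 * filtered rate.  On each update the stored value first decays once
 * per elapsed second, then folds in the unprocessed event count:
 *
 *	val  = val * FM_COEF / FM_SCALE;		(per second)
 *	val += cnt * (FM_SCALE - FM_COEF) / FM_SCALE;
 *
 * With FM_COEF of 933 and a time base of one second, the filter has
 * a half-life of 10 seconds (0.933^10 is about 0.5).  FM_MAXTICKS
 * bounds the decay loop after a long quiet period, and FM_MAXCNT
 * caps the unprocessed count to avoid overflow.
 */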
#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time_t now = get_seconds();
	time_t ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss,
			     struct cgroup *cont, struct task_struct *tsk)
{
	struct cpuset *cs = cgroup_cs(cont);

	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		return -ENOSPC;
	if (tsk->flags & PF_THREAD_BOUND) {
		cpumask_t mask;

		mutex_lock(&callback_mutex);
		mask = cs->cpus_allowed;
		mutex_unlock(&callback_mutex);
		if (!cpus_equal(tsk->cpus_allowed, mask))
			return -EINVAL;
	}

	return security_task_setscheduler(tsk, 0, NULL);
}

static void cpuset_attach(struct cgroup_subsys *ss,
			  struct cgroup *cont, struct cgroup *oldcont,
			  struct task_struct *tsk)
{
	cpumask_t cpus;
	nodemask_t from, to;
	struct mm_struct *mm;
	struct cpuset *cs = cgroup_cs(cont);
	struct cpuset *oldcs = cgroup_cs(oldcont);
	int err;

	mutex_lock(&callback_mutex);
	guarantee_online_cpus(cs, &cpus);
	err = set_cpus_allowed_ptr(tsk, &cpus);
	mutex_unlock(&callback_mutex);
	if (err)
		return;

	from = oldcs->mems_allowed;
	to = cs->mems_allowed;
	mm = get_task_mm(tsk);
	if (mm) {
		mpol_rebind_mm(mm, &to);
		if (is_memory_migrate(cs))
			cpuset_migrate_mm(mm, &from, &to);
		mmput(mm);
	}
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_MEMORY_PRESSURE:
		retval = -EACCES;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		cs->mems_generation = cpuset_mems_generation++;
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		cs->mems_generation = cpuset_mems_generation++;
		break;
	default:
		retval = -EINVAL;
		break;
	}
	cgroup_unlock();
	return retval;
}

static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
	cgroup_unlock();
	return retval;
}
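
/*
 * Common handling for a write to a "cpus" or "mems" file.
 */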
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
				const char *buf)
{
	int retval = 0;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	switch (cft->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cgroup_cs(cgrp), buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cgroup_cs(cgrp), buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}
	cgroup_unlock();
	return retval;
}
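
/*
 * These ascii lists should be read in a single system call, to
 * avoid a race between reading and truncating them into separate
 * fragments.  The masks are copied under callback_mutex so a reader
 * sees a consistent snapshot; output longer than PAGE_SIZE is
 * truncated.
 */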
static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
	cpumask_t mask;

	mutex_lock(&callback_mutex);
	mask = cs->cpus_allowed;
	mutex_unlock(&callback_mutex);

	return cpulist_scnprintf(page, PAGE_SIZE, mask);
}

static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
	nodemask_t mask;

	mutex_lock(&callback_mutex);
	mask = cs->mems_allowed;
	mutex_unlock(&callback_mutex);

	return nodelist_scnprintf(page, PAGE_SIZE, mask);
}

static ssize_t cpuset_common_file_read(struct cgroup *cont,
				       struct cftype *cft,
				       struct file *file,
				       char __user *buf,
				       size_t nbytes, loff_t *ppos)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	char *page;
	ssize_t retval = 0;
	char *s;

	if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
		return -ENOMEM;

	s = page;

	switch (type) {
	case FILE_CPULIST:
		s += cpuset_sprintf_cpulist(s, cs);
		break;
	case FILE_MEMLIST:
		s += cpuset_sprintf_memlist(s, cs);
		break;
	default:
		retval = -EINVAL;
		goto out;
	}
	*s++ = '\n';

	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
out:
	free_page((unsigned long)page);
	return retval;
}

static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype files[] = {
	{
		.name = "cpus",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},
};

static struct cftype cft_memory_pressure_enabled = {
	.name = "memory_pressure_enabled",
	.read_u64 = cpuset_read_u64,
	.write_u64 = cpuset_write_u64,
	.private = FILE_MEMORY_PRESSURE_ENABLED,
};

static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
	int err;

	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
	if (err)
		return err;
	/* memory_pressure_enabled is in root cpuset only */
	if (!cont->parent)
		err = cgroup_add_file(cont, ss,
				      &cft_memory_pressure_enabled);
	return err;
}
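
/*
 * post_clone() is called at the end of cgroup_clone().
 * 'cgroup' was just created automatically as a result of
 * a cgroup_clone(), and the current task is about to
 * be moved into 'cgroup'.
 *
 * Currently we refuse to set up the cgroup - thereby
 * refusing the task to be entered, and as a result refusing
 * the sys_unshare() or clone() which initiated it - if any
 * sibling cpusets have exclusive cpus or mem.
 *
 * If this becomes a problem for some users who wish to
 * allow that scenario, then cpuset_post_clone() could be
 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
 * (and likewise for mems) to the new cgroup.
 */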
static void cpuset_post_clone(struct cgroup_subsys *ss,
			      struct cgroup *cgroup)
{
	struct cgroup *parent, *child;
	struct cpuset *cs, *parent_cs;

	parent = cgroup->parent;
	list_for_each_entry(child, &parent->children, sibling) {
		cs = cgroup_cs(child);
		if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
			return;
	}
	cs = cgroup_cs(cgroup);
	parent_cs = cgroup_cs(parent);

	cs->mems_allowed = parent_cs->mems_allowed;
	cs->cpus_allowed = parent_cs->cpus_allowed;
}
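
/*
 *	cpuset_create - create a cpuset
 *	ss:	cpuset cgroup subsystem
 *	cont:	control group that the new cpuset will be part of
 */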
static struct cgroup_subsys_state *cpuset_create(
	struct cgroup_subsys *ss,
	struct cgroup *cont)
{
	struct cpuset *cs;
	struct cpuset *parent;

	if (!cont->parent) {
		/* This is early initialization for the top cgroup */
		top_cpuset.mems_generation = cpuset_mems_generation++;
		return &top_cpuset.css;
	}
	parent = cgroup_cs(cont->parent);
	cs = kmalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);

	cpuset_update_task_memory_state();
	cs->flags = 0;
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);
	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	cpus_clear(cs->cpus_allowed);
	nodes_clear(cs->mems_allowed);
	cs->mems_generation = cpuset_mems_generation++;
	fmeter_init(&cs->fmeter);
	cs->relax_domain_level = -1;

	cs->parent = parent;
	number_of_cpusets++;
	return &cs->css;
}
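
/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call async_rebuild_sched_domains().
 */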
static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct cpuset *cs = cgroup_cs(cont);

	cpuset_update_task_memory_state();

	if (is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	number_of_cpusets--;
	kfree(cs);
}

struct cgroup_subsys cpuset_subsys = {
	.name = "cpuset",
	.create = cpuset_create,
	.destroy = cpuset_destroy,
	.can_attach = cpuset_can_attach,
	.attach = cpuset_attach,
	.populate = cpuset_populate,
	.post_clone = cpuset_post_clone,
	.subsys_id = cpuset_subsys_id,
	.early_init = 1,
};
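
/*
 * cpuset_init_early - just enough so that the calls to
 * cpuset_update_task_memory_state() in early init code don't crash.
 */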
int __init cpuset_init_early(void)
{
	top_cpuset.mems_generation = cpuset_mems_generation++;
	return 0;
}
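
/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system
 **/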
int __init cpuset_init(void)
{
	int err = 0;

	cpus_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);

	fmeter_init(&top_cpuset.fmeter);
	top_cpuset.mems_generation = cpuset_mems_generation++;
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	err = register_filesystem(&cpuset_fs_type);
	if (err < 0)
		return err;

	number_of_cpusets = 1;
	return 0;
}
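
/**
 * cpuset_do_move_task - move a given task to another cpuset
 * @tsk: pointer to task_struct the task to move
 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 */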
static void cpuset_do_move_task(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	struct cpuset_hotplug_scanner *chsp;

	chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
	cgroup_attach_task(chsp->to, tsk);
}
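
/**
 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
 * @from: cpuset in which the tasks currently reside
 * @to: cpuset to which the tasks will be moved
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 */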
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
	struct cpuset_hotplug_scanner scan;

	scan.scan.cg = from->css.cgroup;
	scan.scan.test_task = NULL;
	scan.scan.process_task = cpuset_do_move_task;
	scan.scan.heap = NULL;
	scan.to = to->css.cgroup;

	if (cgroup_scan_tasks(&scan.scan))
		printk(KERN_ERR "move_member_tasks_to_cpuset: "
				"cgroup_scan_tasks failed\n");
}
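
/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 */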
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * The cgroup's css_sets list is in use if there are tasks
	 * in the cpuset; the list is empty if there are none.
	 */
	if (list_empty(&cs->css.cgroup->css_sets))
		return;

	/*
	 * Find its next-highest non-empty parent, (top cpuset
	 * has online cpus, so can't be empty).
	 */
	parent = cs->parent;
	while (cpus_empty(parent->cpus_allowed) ||
	       nodes_empty(parent->mems_allowed))
		parent = parent->parent;

	move_member_tasks_to_cpuset(cs, parent);
}
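
/*
 * Walk the specified cpuset subtree and look for empty cpusets.
 * The tasks of such cpuset must be moved to a parent cpuset.
 *
 * Called with cgroup_mutex held.  We take callback_mutex to modify
 * cpus_allowed and mems_allowed.
 *
 * This walk processes the tree from top to bottom, completing one layer
 * before dropping down to the next.  It always processes a node before
 * any of its children.
 */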
static void scan_for_empty_cpusets(struct cpuset *root)
{
	LIST_HEAD(queue);
	struct cpuset *cp;	/* scans cpusets being updated */
	struct cpuset *child;	/* scans child cpusets of cp */
	struct cgroup *cont;
	nodemask_t oldmems;

	list_add_tail((struct list_head *)&root->stack_list, &queue);

	while (!list_empty(&queue)) {
		cp = list_first_entry(&queue, struct cpuset, stack_list);
		list_del(queue.next);
		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &queue);
		}

		/* Continue past cpusets with all cpus, mems online */
		if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
			continue;

		oldmems = cp->mems_allowed;

		/* Remove offline cpus and mems from this cpuset. */
		mutex_lock(&callback_mutex);
		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
		nodes_and(cp->mems_allowed, cp->mems_allowed,
			  node_states[N_HIGH_MEMORY]);
		mutex_unlock(&callback_mutex);

		/* Move tasks from the empty cpuset to a parent */
		if (cpus_empty(cp->cpus_allowed) ||
		    nodes_empty(cp->mems_allowed))
			remove_tasks_in_empty_cpuset(cp);
		else {
			update_tasks_cpumask(cp, NULL);
			update_tasks_nodemask(cp, &oldmems);
		}
	}
}
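
/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period.  This is necessary in order to make cpusets transparent
 * (of no affect) on systems that are actively using CPU hotplug
 * but making no active use of cpusets.
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_online_map on each CPU hotplug (cpuhp) event.
 */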
static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
				    unsigned long phase, void *unused_cpu)
{
	struct sched_domain_attr *attr;
	cpumask_t *doms;
	int ndoms;

	switch (phase) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		break;

	default:
		return NOTIFY_DONE;
	}

	cgroup_lock();
	top_cpuset.cpus_allowed = cpu_online_map;
	scan_for_empty_cpusets(&top_cpuset);
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);

	return NOTIFY_OK;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
 * See also the previous routine cpuset_track_online_cpus().
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				     unsigned long action, void *arg)
{
	cgroup_lock();
	switch (action) {
	case MEM_ONLINE:
		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
		break;
	case MEM_OFFLINE:
		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
		scan_for_empty_cpusets(&top_cpuset);
		break;
	default:
		break;
	}
	cgroup_unlock();
	return NOTIFY_OK;
}
#endif
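
/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 **/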
void __init cpuset_init_smp(void)
{
	top_cpuset.cpus_allowed = cpu_online_map;
	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];

	hotcpu_notifier(cpuset_track_online_cpus, 0);
	hotplug_memory_notifier(cpuset_track_online_nodes, 10);
}
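
/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_map, even if this means going outside the
 * tasks cpuset.
 **/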
void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
{
	mutex_lock(&callback_mutex);
	cpuset_cpus_allowed_locked(tsk, pmask);
	mutex_unlock(&callback_mutex);
}

/**
 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
 * Must be called with callback_mutex held.
 **/
void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask)
{
	task_lock(tsk);
	guarantee_online_cpus(task_cs(tsk), pmask);
	task_unlock(tsk);
}

void cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}
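
/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_HIGH_MEMORY], even if this means going outside
 * the tasks cpuset.
 **/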
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;

	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_mems(task_cs(tsk), &mask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);

	return mask;
}
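
/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */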
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}
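
/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */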
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
		cs = cs->parent;
	return cs;
}
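
/**
 * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
 * @z: is this zone on an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If
 * __GFP_THISNODE is set, yes, we can always allocate.  If zone
 * z's node is in our tasks mems_allowed, yes.  If it's not a
 * __GFP_HARDWALL request and this zone's nodes is in the nearest
 * hardwalled cpuset ancestor to this tasks cpuset, yes.  If the
 * task has been OOM killed and has access to memory reserves as
 * specified by the TIF_MEMDIE flag, yes.  Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed.  GFP_KERNEL allocations
 * are not so marked, so can escape to the nearest enclosing
 * hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_mutex, so this
 * variant might sleep on a GFP_KERNEL allocation; the
 * cpuset_zone_allowed_hardwall() variant, below, never sleeps and
 * never scans up the hierarchy.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, so by
 * the time any such calls get to this routine, we should just shut
 * up and say 'yes'.
 */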
int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
{
	int node;			/* node that zone z is on */
	const struct cpuset *cs;	/* current cpuset ancestors */
	int allowed;			/* is allocation in zone z allowed? */

	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	node = zone_to_nid(z);
	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return 0;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return 1;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	mutex_lock(&callback_mutex);

	task_lock(current);
	cs = nearest_hardwall_ancestor(task_cs(current));
	task_unlock(current);

	allowed = node_isset(node, cs->mems_allowed);
	mutex_unlock(&callback_mutex);
	return allowed;
}
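
/**
 * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
 * @z: is this zone on an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If
 * __GFP_THISNODE is set, yes, we can always allocate.  If zone
 * z's node is in our tasks mems_allowed, yes.  If the task has been
 * OOM killed and has access to memory reserves as specified by the
 * TIF_MEMDIE flag, yes.  Otherwise, no.
 *
 * Unlike the cpuset_zone_allowed_softwall() variant, above,
 * this variant requires that the zone be in the current tasks
 * mems_allowed or that we're in interrupt.  It does not scan up the
 * cpuset hierarchy for the nearest enclosing hardwalled cpuset.
 * It never sleeps.
 */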
int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
{
	int node;			/* node that zone z is on */

	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	node = zone_to_nid(z);
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	return 0;
}
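
/**
 * cpuset_lock - lock out any changes to cpuset structures
 *
 * The out of memory (oom) code needs to mutex_lock cpusets
 * from being changed while it scans the tasklist looking for a
 * task in an overlapping cpuset.  Expose callback_mutex via this
 * cpuset_lock() routine, so the oom code can lock it, before
 * reading task->cpuset pointers.
 */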
void cpuset_lock(void)
{
	mutex_lock(&callback_mutex);
}
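
/**
 * cpuset_unlock - release lock on cpuset changes
 *
 * Undo the lock taken in a previous cpuset_lock() call.
 */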
void cpuset_unlock(void)
{
	mutex_unlock(&callback_mutex);
}
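
/**
 * cpuset_mem_spread_node() - On which node to begin search for a page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * The rotor (cpuset_mem_spread_rotor) is per-task, so this routine
 * needs no locking.  It only chooses where the search should start;
 * the allocation is not required to land on the returned node.
 */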
int cpuset_mem_spread_node(void)
{
	int node;

	node = next_node(current->cpuset_mem_spread_rotor,
			 current->mems_allowed);
	if (node == MAX_NUMNODES)
		node = first_node(current->mems_allowed);
	current->cpuset_mem_spread_rotor = node;
	return node;
}
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
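
/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/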
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;
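
/**
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernels page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 **/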
void __cpuset_memory_pressure_bump(void)
{
	task_lock(current);
	fmeter_markevent(&task_cs(current)->fmeter);
	task_unlock(current);
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cgroup_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 */
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	retval = -EINVAL;
	cgroup_lock();
	css = task_subsys_state(tsk, cpuset_subsys_id);
	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
	if (retval < 0)
		goto out_unlock;
	seq_puts(m, buf);
	seq_putc(m, '\n');
out_unlock:
	cgroup_unlock();
	put_task_struct(tsk);
out_free:
	kfree(buf);
out:
	return retval;
}

static int cpuset_open(struct inode *inode, struct file *file)
{
	struct pid *pid = PROC_I(inode)->pid;
	return single_open(file, proc_cpuset_show, pid);
}

const struct file_operations proc_cpuset_operations = {
	.open		= cpuset_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif /* CONFIG_PROC_PID_CPUSET */

/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Cpus_allowed:\t");
	seq_cpumask(m, &task->cpus_allowed);
	seq_printf(m, "\n");
	seq_printf(m, "Cpus_allowed_list:\t");
	seq_cpumask_list(m, &task->cpus_allowed);
	seq_printf(m, "\n");
	seq_printf(m, "Mems_allowed:\t");
	seq_nodemask(m, &task->mems_allowed);
	seq_printf(m, "\n");
	seq_printf(m, "Mems_allowed_list:\t");
	seq_nodemask_list(m, &task->mems_allowed);
	seq_printf(m, "\n");
}