29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/kthread.h>
58#include <linux/proc_fs.h>
59#include <linux/seq_file.h>
60#include <linux/sysctl.h>
61#include <linux/syscalls.h>
62#include <linux/times.h>
63#include <linux/tsacct_kern.h>
64#include <linux/kprobes.h>
65#include <linux/delayacct.h>
66#include <linux/reciprocal_div.h>
67#include <linux/unistd.h>
68#include <linux/pagemap.h>
69#include <linux/hrtimer.h>
70#include <linux/tick.h>
71#include <linux/bootmem.h>
72#include <linux/debugfs.h>
73#include <linux/ctype.h>
74#include <linux/ftrace.h>
75#include <trace/sched.h>
76
77#include <asm/tlb.h>
78#include <asm/irq_regs.h>
79
80#include "sched_cpupri.h"
81
82
83
84
85
86
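/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */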
87#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
88#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
89#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
90
91
92
93
94
95
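/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */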
96#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
97#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
98#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
99
100
101
102
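/*
 * Helper for converting nanosecond timing to jiffy resolution
 */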
103#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
104
105#define NICE_0_LOAD SCHED_LOAD_SCALE
106#define NICE_0_SHIFT SCHED_LOAD_SHIFT
107
108
109
110
111
112
113
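/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */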
114#define DEF_TIMESLICE (100 * HZ / 1000)
115
116
117
118
119#define RUNTIME_INF ((u64)~0ULL)
120
121#ifdef CONFIG_SMP
122
123
124
125
126static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
127{
128 return reciprocal_divide(load, sg->reciprocal_cpu_power);
129}
130
131
132
133
134
135static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
136{
137 sg->__cpu_power += val;
138 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
139}
140#endif
141
142static inline int rt_policy(int policy)
143{
144 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
145 return 1;
146 return 0;
147}
148
149static inline int task_has_rt_policy(struct task_struct *p)
150{
151 return rt_policy(p->policy);
152}
153
154
155
156
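/*
 * This is the priority-queue data structure of the RT scheduling class:
 */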
157struct rt_prio_array {
158 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
159 struct list_head queue[MAX_RT_PRIO];
160};
161
162struct rt_bandwidth {
163
164 spinlock_t rt_runtime_lock;
165 ktime_t rt_period;
166 u64 rt_runtime;
167 struct hrtimer rt_period_timer;
168};
169
170static struct rt_bandwidth def_rt_bandwidth;
171
172static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
173
174static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
175{
176 struct rt_bandwidth *rt_b =
177 container_of(timer, struct rt_bandwidth, rt_period_timer);
178 ktime_t now;
179 int overrun;
180 int idle = 0;
181
182 for (;;) {
183 now = hrtimer_cb_get_time(timer);
184 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
185
186 if (!overrun)
187 break;
188
189 idle = do_sched_rt_period_timer(rt_b, overrun);
190 }
191
192 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
193}
194
195static
196void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
197{
198 rt_b->rt_period = ns_to_ktime(period);
199 rt_b->rt_runtime = runtime;
200
201 spin_lock_init(&rt_b->rt_runtime_lock);
202
203 hrtimer_init(&rt_b->rt_period_timer,
204 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 rt_b->rt_period_timer.function = sched_rt_period_timer;
206 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
207}
208
209static inline int rt_bandwidth_enabled(void)
210{
211 return sysctl_sched_rt_runtime >= 0;
212}
213
214static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
215{
216 ktime_t now;
217
218 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
219 return;
220
221 if (hrtimer_active(&rt_b->rt_period_timer))
222 return;
223
224 spin_lock(&rt_b->rt_runtime_lock);
225 for (;;) {
226 if (hrtimer_active(&rt_b->rt_period_timer))
227 break;
228
229 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
230 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
231 hrtimer_start_expires(&rt_b->rt_period_timer,
232 HRTIMER_MODE_ABS);
233 }
234 spin_unlock(&rt_b->rt_runtime_lock);
235}
236
237#ifdef CONFIG_RT_GROUP_SCHED
238static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
239{
240 hrtimer_cancel(&rt_b->rt_period_timer);
241}
242#endif
243
244
245
246
247
248static DEFINE_MUTEX(sched_domains_mutex);
249
250#ifdef CONFIG_GROUP_SCHED
251
252#include <linux/cgroup.h>
253
254struct cfs_rq;
255
256static LIST_HEAD(task_groups);
257
258
259struct task_group {
260#ifdef CONFIG_CGROUP_SCHED
261 struct cgroup_subsys_state css;
262#endif
263
264#ifdef CONFIG_FAIR_GROUP_SCHED
265
266 struct sched_entity **se;
267
268 struct cfs_rq **cfs_rq;
269 unsigned long shares;
270#endif
271
272#ifdef CONFIG_RT_GROUP_SCHED
273 struct sched_rt_entity **rt_se;
274 struct rt_rq **rt_rq;
275
276 struct rt_bandwidth rt_bandwidth;
277#endif
278
279 struct rcu_head rcu;
280 struct list_head list;
281
282 struct task_group *parent;
283 struct list_head siblings;
284 struct list_head children;
285};
286
287#ifdef CONFIG_USER_SCHED
288
289
290
291
292
293
294struct task_group root_task_group;
295
296#ifdef CONFIG_FAIR_GROUP_SCHED
297
298static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
299
300static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
301#endif
302
303#ifdef CONFIG_RT_GROUP_SCHED
304static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
305static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
306#endif
307#else
308#define root_task_group init_task_group
309#endif
310
311
312
313
314static DEFINE_SPINLOCK(task_group_lock);
315
316#ifdef CONFIG_FAIR_GROUP_SCHED
317#ifdef CONFIG_USER_SCHED
318# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
319#else
320# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
321#endif
322
323
324
325
326
327
328
329
330
331#define MIN_SHARES 2
332#define MAX_SHARES (1UL << 18)
333
334static int init_task_group_load = INIT_TASK_GROUP_LOAD;
335#endif
336
337
338
339
340struct task_group init_task_group;
341
342
343static inline struct task_group *task_group(struct task_struct *p)
344{
345 struct task_group *tg;
346
347#ifdef CONFIG_USER_SCHED
348 tg = p->user->tg;
349#elif defined(CONFIG_CGROUP_SCHED)
350 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
351 struct task_group, css);
352#else
353 tg = &init_task_group;
354#endif
355 return tg;
356}
357
358
359static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
360{
361#ifdef CONFIG_FAIR_GROUP_SCHED
362 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
363 p->se.parent = task_group(p)->se[cpu];
364#endif
365
366#ifdef CONFIG_RT_GROUP_SCHED
367 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
368 p->rt.parent = task_group(p)->rt_se[cpu];
369#endif
370}
371
372#else
373
374static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
375static inline struct task_group *task_group(struct task_struct *p)
376{
377 return NULL;
378}
379
380#endif
381
382
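/* CFS-related fields in a runqueue */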
383struct cfs_rq {
384 struct load_weight load;
385 unsigned long nr_running;
386
387 u64 exec_clock;
388 u64 min_vruntime;
389
390 struct rb_root tasks_timeline;
391 struct rb_node *rb_leftmost;
392
393 struct list_head tasks;
394 struct list_head *balance_iterator;
395
396
397
398
399
400 struct sched_entity *curr, *next, *last;
401
402 unsigned int nr_spread_over;
403
404#ifdef CONFIG_FAIR_GROUP_SCHED
405 struct rq *rq;
406
407
408
409
410
411
412
413
414
415 struct list_head leaf_cfs_rq_list;
416 struct task_group *tg;
417
418#ifdef CONFIG_SMP
419
420
421
422 unsigned long task_weight;
423
424
425
426
427
428
429
430 unsigned long h_load;
431
432
433
434
435 unsigned long shares;
436
437
438
439
440 unsigned long rq_weight;
441#endif
442#endif
443};
444
445
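/* Real-Time classes' related fields in a runqueue: */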
446struct rt_rq {
447 struct rt_prio_array active;
448 unsigned long rt_nr_running;
449#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
450 int highest_prio;
451#endif
452#ifdef CONFIG_SMP
453 unsigned long rt_nr_migratory;
454 int overloaded;
455#endif
456 int rt_throttled;
457 u64 rt_time;
458 u64 rt_runtime;
459
460 spinlock_t rt_runtime_lock;
461
462#ifdef CONFIG_RT_GROUP_SCHED
463 unsigned long rt_nr_boosted;
464
465 struct rq *rq;
466 struct list_head leaf_rt_rq_list;
467 struct task_group *tg;
468 struct sched_rt_entity *rt_se;
469#endif
470};
471
472#ifdef CONFIG_SMP
473
474
475
476
477
478
479
480
481
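/*
 * We add the notion of a root-domain which will be used to define
 * per-domain variables. Each exclusive cpuset essentially defines
 * an island domain by fully partitioning the member cpus from any
 * other cpuset. Whenever a new exclusive cpuset is created, we also
 * create and attach a new root-domain object.
 */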
482struct root_domain {
483 atomic_t refcount;
484 cpumask_t span;
485 cpumask_t online;
486
487
488
489
490
491 cpumask_t rto_mask;
492 atomic_t rto_count;
493#ifdef CONFIG_SMP
494 struct cpupri cpupri;
495#endif
496};
497
498
499
500
501
502static struct root_domain def_root_domain;
503
504#endif
505
506
507
508
509
510
511
512
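/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */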
513struct rq {
514
515 spinlock_t lock;
516
517
518
519
520
521 unsigned long nr_running;
522 #define CPU_LOAD_IDX_MAX 5
523 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
524 unsigned char idle_at_tick;
525#ifdef CONFIG_NO_HZ
526 unsigned long last_tick_seen;
527 unsigned char in_nohz_recently;
528#endif
529
530 struct load_weight load;
531 unsigned long nr_load_updates;
532 u64 nr_switches;
533
534 struct cfs_rq cfs;
535 struct rt_rq rt;
536
537#ifdef CONFIG_FAIR_GROUP_SCHED
538
539 struct list_head leaf_cfs_rq_list;
540#endif
541#ifdef CONFIG_RT_GROUP_SCHED
542 struct list_head leaf_rt_rq_list;
543#endif
544
545
546
547
548
549
550
551 unsigned long nr_uninterruptible;
552
553 struct task_struct *curr, *idle;
554 unsigned long next_balance;
555 struct mm_struct *prev_mm;
556
557 u64 clock;
558
559 atomic_t nr_iowait;
560
561#ifdef CONFIG_SMP
562 struct root_domain *rd;
563 struct sched_domain *sd;
564
565
566 int active_balance;
567 int push_cpu;
568
569 int cpu;
570 int online;
571
572 unsigned long avg_load_per_task;
573
574 struct task_struct *migration_thread;
575 struct list_head migration_queue;
576#endif
577
578#ifdef CONFIG_SCHED_HRTICK
579#ifdef CONFIG_SMP
580 int hrtick_csd_pending;
581 struct call_single_data hrtick_csd;
582#endif
583 struct hrtimer hrtick_timer;
584#endif
585
586#ifdef CONFIG_SCHEDSTATS
587
588 struct sched_info rq_sched_info;
589
590
591 unsigned int yld_exp_empty;
592 unsigned int yld_act_empty;
593 unsigned int yld_both_empty;
594 unsigned int yld_count;
595
596
597 unsigned int sched_switch;
598 unsigned int sched_count;
599 unsigned int sched_goidle;
600
601
602 unsigned int ttwu_count;
603 unsigned int ttwu_local;
604
605
606 unsigned int bkl_count;
607#endif
608};
609
610static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
611
612static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
613{
614 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
615}
616
617static inline int cpu_of(struct rq *rq)
618{
619#ifdef CONFIG_SMP
620 return rq->cpu;
621#else
622 return 0;
623#endif
624}
625
626
627
628
629
630
631
632
633#define for_each_domain(cpu, __sd) \
634 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
635
636#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
637#define this_rq() (&__get_cpu_var(runqueues))
638#define task_rq(p) cpu_rq(task_cpu(p))
639#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
640
641static inline void update_rq_clock(struct rq *rq)
642{
643 rq->clock = sched_clock_cpu(cpu_of(rq));
644}
645
646
647
648
649#ifdef CONFIG_SCHED_DEBUG
650# define const_debug __read_mostly
651#else
652# define const_debug static const
653#endif
654
655
656
657
658
659
660
661
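/**
 * runqueue_is_locked
 *
 * Returns true if the current cpu runqueue is locked.
 * This interface allows printk to be called with the runqueue lock
 * held and know whether or not it is OK to wake up the klogd.
 */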
662int runqueue_is_locked(void)
663{
664 int cpu = get_cpu();
665 struct rq *rq = cpu_rq(cpu);
666 int ret;
667
668 ret = spin_is_locked(&rq->lock);
669 put_cpu();
670 return ret;
671}
672
673
674
675
676
677#define SCHED_FEAT(name, enabled) \
678 __SCHED_FEAT_##name ,
679
680enum {
681#include "sched_features.h"
682};
683
684#undef SCHED_FEAT
685
686#define SCHED_FEAT(name, enabled) \
687 (1UL << __SCHED_FEAT_##name) * enabled |
688
689const_debug unsigned int sysctl_sched_features =
690#include "sched_features.h"
691 0;
692
693#undef SCHED_FEAT
694
695#ifdef CONFIG_SCHED_DEBUG
696#define SCHED_FEAT(name, enabled) \
697 #name ,
698
699static __read_mostly char *sched_feat_names[] = {
700#include "sched_features.h"
701 NULL
702};
703
704#undef SCHED_FEAT
705
706static int sched_feat_open(struct inode *inode, struct file *filp)
707{
708 filp->private_data = inode->i_private;
709 return 0;
710}
711
712static ssize_t
713sched_feat_read(struct file *filp, char __user *ubuf,
714 size_t cnt, loff_t *ppos)
715{
716 char *buf;
717 int r = 0;
718 int len = 0;
719 int i;
720
721 for (i = 0; sched_feat_names[i]; i++) {
722 len += strlen(sched_feat_names[i]);
723 len += 4;
724 }
725
726 buf = kmalloc(len + 2, GFP_KERNEL);
727 if (!buf)
728 return -ENOMEM;
729
730 for (i = 0; sched_feat_names[i]; i++) {
731 if (sysctl_sched_features & (1UL << i))
732 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
733 else
734 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
735 }
736
737 r += sprintf(buf + r, "\n");
738 WARN_ON(r >= len + 2);
739
740 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
741
742 kfree(buf);
743
744 return r;
745}
746
747static ssize_t
748sched_feat_write(struct file *filp, const char __user *ubuf,
749 size_t cnt, loff_t *ppos)
750{
751 char buf[64];
752 char *cmp = buf;
753 int neg = 0;
754 int i;
755
756 if (cnt > 63)
757 cnt = 63;
758
759 if (copy_from_user(&buf, ubuf, cnt))
760 return -EFAULT;
761
762 buf[cnt] = 0;
763
764 if (strncmp(buf, "NO_", 3) == 0) {
765 neg = 1;
766 cmp += 3;
767 }
768
769 for (i = 0; sched_feat_names[i]; i++) {
770 int len = strlen(sched_feat_names[i]);
771
772 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
773 if (neg)
774 sysctl_sched_features &= ~(1UL << i);
775 else
776 sysctl_sched_features |= (1UL << i);
777 break;
778 }
779 }
780
781 if (!sched_feat_names[i])
782 return -EINVAL;
783
784 filp->f_pos += cnt;
785
786 return cnt;
787}
788
789static struct file_operations sched_feat_fops = {
790 .open = sched_feat_open,
791 .read = sched_feat_read,
792 .write = sched_feat_write,
793};
794
795static __init int sched_init_debug(void)
796{
797 debugfs_create_file("sched_features", 0644, NULL, NULL,
798 &sched_feat_fops);
799
800 return 0;
801}
802late_initcall(sched_init_debug);
803
804#endif
805
806#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
807
808
809
810
811
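/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */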
812const_debug unsigned int sysctl_sched_nr_migrate = 32;
813
814
815
816
817
818unsigned int sysctl_sched_shares_ratelimit = 250000;
819
820
821
822
823
824
825unsigned int sysctl_sched_shares_thresh = 4;
826
827
828
829
830
831unsigned int sysctl_sched_rt_period = 1000000;
832
833static __read_mostly int scheduler_running;
834
835
836
837
838
839int sysctl_sched_rt_runtime = 950000;
840
841static inline u64 global_rt_period(void)
842{
843 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
844}
845
846static inline u64 global_rt_runtime(void)
847{
848 if (sysctl_sched_rt_runtime < 0)
849 return RUNTIME_INF;
850
851 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
852}
853
854#ifndef prepare_arch_switch
855# define prepare_arch_switch(next) do { } while (0)
856#endif
857#ifndef finish_arch_switch
858# define finish_arch_switch(prev) do { } while (0)
859#endif
860
861static inline int task_current(struct rq *rq, struct task_struct *p)
862{
863 return rq->curr == p;
864}
865
866#ifndef __ARCH_WANT_UNLOCKED_CTXSW
867static inline int task_running(struct rq *rq, struct task_struct *p)
868{
869 return task_current(rq, p);
870}
871
872static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
873{
874}
875
876static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
877{
878#ifdef CONFIG_DEBUG_SPINLOCK
879
880 rq->lock.owner = current;
881#endif
882
883
884
885
886
887 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
888
889 spin_unlock_irq(&rq->lock);
890}
891
892#else
893static inline int task_running(struct rq *rq, struct task_struct *p)
894{
895#ifdef CONFIG_SMP
896 return p->oncpu;
897#else
898 return task_current(rq, p);
899#endif
900}
901
902static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
903{
904#ifdef CONFIG_SMP
905
906
907
908
909
910 next->oncpu = 1;
911#endif
912#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
913 spin_unlock_irq(&rq->lock);
914#else
915 spin_unlock(&rq->lock);
916#endif
917}
918
919static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
920{
921#ifdef CONFIG_SMP
922
923
924
925
926
927 smp_wmb();
928 prev->oncpu = 0;
929#endif
930#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
931 local_irq_enable();
932#endif
933}
934#endif
935
936
937
938
939
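/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */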
940static inline struct rq *__task_rq_lock(struct task_struct *p)
941 __acquires(rq->lock)
942{
943 for (;;) {
944 struct rq *rq = task_rq(p);
945 spin_lock(&rq->lock);
946 if (likely(rq == task_rq(p)))
947 return rq;
948 spin_unlock(&rq->lock);
949 }
950}
951
952
953
954
955
956
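/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */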
957static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
958 __acquires(rq->lock)
959{
960 struct rq *rq;
961
962 for (;;) {
963 local_irq_save(*flags);
964 rq = task_rq(p);
965 spin_lock(&rq->lock);
966 if (likely(rq == task_rq(p)))
967 return rq;
968 spin_unlock_irqrestore(&rq->lock, *flags);
969 }
970}
971
972void task_rq_unlock_wait(struct task_struct *p)
973{
974 struct rq *rq = task_rq(p);
975
976 smp_mb();
977 spin_unlock_wait(&rq->lock);
978}
979
980static void __task_rq_unlock(struct rq *rq)
981 __releases(rq->lock)
982{
983 spin_unlock(&rq->lock);
984}
985
986static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
987 __releases(rq->lock)
988{
989 spin_unlock_irqrestore(&rq->lock, *flags);
990}
991
992
993
994
995static struct rq *this_rq_lock(void)
996 __acquires(rq->lock)
997{
998 struct rq *rq;
999
1000 local_irq_disable();
1001 rq = this_rq();
1002 spin_lock(&rq->lock);
1003
1004 return rq;
1005}
1006
1007#ifdef CONFIG_SCHED_HRTICK
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024static inline int hrtick_enabled(struct rq *rq)
1025{
1026 if (!sched_feat(HRTICK))
1027 return 0;
1028 if (!cpu_active(cpu_of(rq)))
1029 return 0;
1030 return hrtimer_is_hres_active(&rq->hrtick_timer);
1031}
1032
1033static void hrtick_clear(struct rq *rq)
1034{
1035 if (hrtimer_active(&rq->hrtick_timer))
1036 hrtimer_cancel(&rq->hrtick_timer);
1037}
1038
1039
1040
1041
1042
1043static enum hrtimer_restart hrtick(struct hrtimer *timer)
1044{
1045 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1046
1047 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1048
1049 spin_lock(&rq->lock);
1050 update_rq_clock(rq);
1051 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1052 spin_unlock(&rq->lock);
1053
1054 return HRTIMER_NORESTART;
1055}
1056
1057#ifdef CONFIG_SMP
1058
1059
1060
1061static void __hrtick_start(void *arg)
1062{
1063 struct rq *rq = arg;
1064
1065 spin_lock(&rq->lock);
1066 hrtimer_restart(&rq->hrtick_timer);
1067 rq->hrtick_csd_pending = 0;
1068 spin_unlock(&rq->lock);
1069}
1070
1071
1072
1073
1074
1075
1076static void hrtick_start(struct rq *rq, u64 delay)
1077{
1078 struct hrtimer *timer = &rq->hrtick_timer;
1079 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1080
1081 hrtimer_set_expires(timer, time);
1082
1083 if (rq == this_rq()) {
1084 hrtimer_restart(timer);
1085 } else if (!rq->hrtick_csd_pending) {
1086 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
1087 rq->hrtick_csd_pending = 1;
1088 }
1089}
1090
1091static int
1092hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1093{
1094 int cpu = (int)(long)hcpu;
1095
1096 switch (action) {
1097 case CPU_UP_CANCELED:
1098 case CPU_UP_CANCELED_FROZEN:
1099 case CPU_DOWN_PREPARE:
1100 case CPU_DOWN_PREPARE_FROZEN:
1101 case CPU_DEAD:
1102 case CPU_DEAD_FROZEN:
1103 hrtick_clear(cpu_rq(cpu));
1104 return NOTIFY_OK;
1105 }
1106
1107 return NOTIFY_DONE;
1108}
1109
1110static __init void init_hrtick(void)
1111{
1112 hotcpu_notifier(hotplug_hrtick, 0);
1113}
1114#else
1115
1116
1117
1118
1119
1120static void hrtick_start(struct rq *rq, u64 delay)
1121{
1122 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1123}
1124
1125static inline void init_hrtick(void)
1126{
1127}
1128#endif
1129
1130static void init_rq_hrtick(struct rq *rq)
1131{
1132#ifdef CONFIG_SMP
1133 rq->hrtick_csd_pending = 0;
1134
1135 rq->hrtick_csd.flags = 0;
1136 rq->hrtick_csd.func = __hrtick_start;
1137 rq->hrtick_csd.info = rq;
1138#endif
1139
1140 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1141 rq->hrtick_timer.function = hrtick;
1142 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1143}
1144#else
1145static inline void hrtick_clear(struct rq *rq)
1146{
1147}
1148
1149static inline void init_rq_hrtick(struct rq *rq)
1150{
1151}
1152
1153static inline void init_hrtick(void)
1154{
1155}
1156#endif
1157
1158
1159
1160
1161
1162
1163
1164
1165#ifdef CONFIG_SMP
1166
1167#ifndef tsk_is_polling
1168#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1169#endif
1170
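/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */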
1171static void resched_task(struct task_struct *p)
1172{
1173 int cpu;
1174
1175 assert_spin_locked(&task_rq(p)->lock);
1176
1177 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1178 return;
1179
1180 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1181
1182 cpu = task_cpu(p);
1183 if (cpu == smp_processor_id())
1184 return;
1185
1186
1187 smp_mb();
1188 if (!tsk_is_polling(p))
1189 smp_send_reschedule(cpu);
1190}
1191
1192static void resched_cpu(int cpu)
1193{
1194 struct rq *rq = cpu_rq(cpu);
1195 unsigned long flags;
1196
1197 if (!spin_trylock_irqsave(&rq->lock, flags))
1198 return;
1199 resched_task(cpu_curr(cpu));
1200 spin_unlock_irqrestore(&rq->lock, flags);
1201}
1202
1203#ifdef CONFIG_NO_HZ
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214void wake_up_idle_cpu(int cpu)
1215{
1216 struct rq *rq = cpu_rq(cpu);
1217
1218 if (cpu == smp_processor_id())
1219 return;
1220
1221
1222
1223
1224
1225
1226
1227
1228 if (rq->curr != rq->idle)
1229 return;
1230
1231
1232
1233
1234
1235
1236 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
1237
1238
1239 smp_mb();
1240 if (!tsk_is_polling(rq->idle))
1241 smp_send_reschedule(cpu);
1242}
1243#endif
1244
1245#else
1246static void resched_task(struct task_struct *p)
1247{
1248 assert_spin_locked(&task_rq(p)->lock);
1249 set_tsk_need_resched(p);
1250}
1251#endif
1252
1253#if BITS_PER_LONG == 32
1254# define WMULT_CONST (~0UL)
1255#else
1256# define WMULT_CONST (1UL << 32)
1257#endif
1258
1259#define WMULT_SHIFT 32
1260
1261
1262
1263
1264#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1265
1266
1267
1268
1269static unsigned long
1270calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1271 struct load_weight *lw)
1272{
1273 u64 tmp;
1274
1275 if (!lw->inv_weight) {
1276 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1277 lw->inv_weight = 1;
1278 else
1279 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1280 / (lw->weight+1);
1281 }
1282
1283 tmp = (u64)delta_exec * weight;
1284
1285
1286
1287 if (unlikely(tmp > WMULT_CONST))
1288 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1289 WMULT_SHIFT/2);
1290 else
1291 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1292
1293 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1294}
1295
1296static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1297{
1298 lw->weight += inc;
1299 lw->inv_weight = 0;
1300}
1301
1302static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1303{
1304 lw->weight -= dec;
1305 lw->inv_weight = 0;
1306}
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317#define WEIGHT_IDLEPRIO 2
1318#define WMULT_IDLEPRIO (1 << 31)
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332static const int prio_to_weight[40] = {
1333 88761, 71755, 56483, 46273, 36291,
1334 29154, 23254, 18705, 14949, 11916,
1335 9548, 7620, 6100, 4904, 3906,
1336 3121, 2501, 1991, 1586, 1277,
1337 1024, 820, 655, 526, 423,
1338 335, 272, 215, 172, 137,
1339 110, 87, 70, 56, 45,
1340 36, 29, 23, 18, 15,
1341};
1342
1343
1344
1345
1346
1347
1348
1349
1350static const u32 prio_to_wmult[40] = {
1351 48388, 59856, 76040, 92818, 118348,
1352 147320, 184698, 229616, 287308, 360437,
1353 449829, 563644, 704093, 875809, 1099582,
1354 1376151, 1717300, 2157191, 2708050, 3363326,
1355 4194304, 5237765, 6557202, 8165337, 10153587,
1356 12820798, 15790321, 19976592, 24970740, 31350126,
1357 39045157, 49367440, 61356676, 76695844, 95443717,
1358 119304647, 148102320, 186737708, 238609294, 286331153,
1359};
1360
1361static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1362
1363
1364
1365
1366
1367
1368struct rq_iterator {
1369 void *arg;
1370 struct task_struct *(*start)(void *);
1371 struct task_struct *(*next)(void *);
1372};
1373
1374#ifdef CONFIG_SMP
1375static unsigned long
1376balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1377 unsigned long max_load_move, struct sched_domain *sd,
1378 enum cpu_idle_type idle, int *all_pinned,
1379 int *this_best_prio, struct rq_iterator *iterator);
1380
1381static int
1382iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1383 struct sched_domain *sd, enum cpu_idle_type idle,
1384 struct rq_iterator *iterator);
1385#endif
1386
1387#ifdef CONFIG_CGROUP_CPUACCT
1388static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1389#else
1390static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1391#endif
1392
1393static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1394{
1395 update_load_add(&rq->load, load);
1396}
1397
1398static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1399{
1400 update_load_sub(&rq->load, load);
1401}
1402
1403#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1404typedef int (*tg_visitor)(struct task_group *, void *);
1405
1406
1407
1408
1409
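/*
 * Iterate the full task-group tree, calling @down when first entering a
 * node and @up when leaving it for the final time.
 */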
1410static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1411{
1412 struct task_group *parent, *child;
1413 int ret;
1414
1415 rcu_read_lock();
1416 parent = &root_task_group;
1417down:
1418 ret = (*down)(parent, data);
1419 if (ret)
1420 goto out_unlock;
1421 list_for_each_entry_rcu(child, &parent->children, siblings) {
1422 parent = child;
1423 goto down;
1424
1425up:
1426 continue;
1427 }
1428 ret = (*up)(parent, data);
1429 if (ret)
1430 goto out_unlock;
1431
1432 child = parent;
1433 parent = parent->parent;
1434 if (parent)
1435 goto up;
1436out_unlock:
1437 rcu_read_unlock();
1438
1439 return ret;
1440}
1441
1442static int tg_nop(struct task_group *tg, void *data)
1443{
1444 return 0;
1445}
1446#endif
1447
1448#ifdef CONFIG_SMP
1449static unsigned long source_load(int cpu, int type);
1450static unsigned long target_load(int cpu, int type);
1451static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1452
1453static unsigned long cpu_avg_load_per_task(int cpu)
1454{
1455 struct rq *rq = cpu_rq(cpu);
1456 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1457
1458 if (nr_running)
1459 rq->avg_load_per_task = rq->load.weight / nr_running;
1460 else
1461 rq->avg_load_per_task = 0;
1462
1463 return rq->avg_load_per_task;
1464}
1465
1466#ifdef CONFIG_FAIR_GROUP_SCHED
1467
1468static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1469
1470
1471
1472
1473static void
1474update_group_shares_cpu(struct task_group *tg, int cpu,
1475 unsigned long sd_shares, unsigned long sd_rq_weight)
1476{
1477 int boost = 0;
1478 unsigned long shares;
1479 unsigned long rq_weight;
1480
1481 if (!tg->se[cpu])
1482 return;
1483
1484 rq_weight = tg->cfs_rq[cpu]->load.weight;
1485
1486
1487
1488
1489
1490
1491 if (!rq_weight) {
1492 boost = 1;
1493 rq_weight = NICE_0_LOAD;
1494 }
1495
1496 if (unlikely(rq_weight > sd_rq_weight))
1497 rq_weight = sd_rq_weight;
1498
1499
1500
1501
1502
1503
1504
1505 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1506 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1507
1508 if (abs(shares - tg->se[cpu]->load.weight) >
1509 sysctl_sched_shares_thresh) {
1510 struct rq *rq = cpu_rq(cpu);
1511 unsigned long flags;
1512
1513 spin_lock_irqsave(&rq->lock, flags);
1514
1515
1516
1517 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1518 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1519
1520 __set_se_shares(tg->se[cpu], shares);
1521 spin_unlock_irqrestore(&rq->lock, flags);
1522 }
1523}
1524
1525
1526
1527
1528
1529
1530static int tg_shares_up(struct task_group *tg, void *data)
1531{
1532 unsigned long rq_weight = 0;
1533 unsigned long shares = 0;
1534 struct sched_domain *sd = data;
1535 int i;
1536
1537 for_each_cpu_mask(i, sd->span) {
1538 rq_weight += tg->cfs_rq[i]->load.weight;
1539 shares += tg->cfs_rq[i]->shares;
1540 }
1541
1542 if ((!shares && rq_weight) || shares > tg->shares)
1543 shares = tg->shares;
1544
1545 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1546 shares = tg->shares;
1547
1548 if (!rq_weight)
1549 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1550
1551 for_each_cpu_mask(i, sd->span)
1552 update_group_shares_cpu(tg, i, shares, rq_weight);
1553
1554 return 0;
1555}
1556
1557
1558
1559
1560
1561
1562static int tg_load_down(struct task_group *tg, void *data)
1563{
1564 unsigned long load;
1565 long cpu = (long)data;
1566
1567 if (!tg->parent) {
1568 load = cpu_rq(cpu)->load.weight;
1569 } else {
1570 load = tg->parent->cfs_rq[cpu]->h_load;
1571 load *= tg->cfs_rq[cpu]->shares;
1572 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1573 }
1574
1575 tg->cfs_rq[cpu]->h_load = load;
1576
1577 return 0;
1578}
1579
1580static void update_shares(struct sched_domain *sd)
1581{
1582 u64 now = cpu_clock(raw_smp_processor_id());
1583 s64 elapsed = now - sd->last_update;
1584
1585 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1586 sd->last_update = now;
1587 walk_tg_tree(tg_nop, tg_shares_up, sd);
1588 }
1589}
1590
1591static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1592{
1593 spin_unlock(&rq->lock);
1594 update_shares(sd);
1595 spin_lock(&rq->lock);
1596}
1597
1598static void update_h_load(long cpu)
1599{
1600 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1601}
1602
1603#else
1604
1605static inline void update_shares(struct sched_domain *sd)
1606{
1607}
1608
1609static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1610{
1611}
1612
1613#endif
1614
1615#endif
1616
1617#ifdef CONFIG_FAIR_GROUP_SCHED
1618static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1619{
1620#ifdef CONFIG_SMP
1621 cfs_rq->shares = shares;
1622#endif
1623}
1624#endif
1625
1626#include "sched_stats.h"
1627#include "sched_idletask.c"
1628#include "sched_fair.c"
1629#include "sched_rt.c"
1630#ifdef CONFIG_SCHED_DEBUG
1631# include "sched_debug.c"
1632#endif
1633
1634#define sched_class_highest (&rt_sched_class)
1635#define for_each_class(class) \
1636 for (class = sched_class_highest; class; class = class->next)
1637
1638static void inc_nr_running(struct rq *rq)
1639{
1640 rq->nr_running++;
1641}
1642
1643static void dec_nr_running(struct rq *rq)
1644{
1645 rq->nr_running--;
1646}
1647
1648static void set_load_weight(struct task_struct *p)
1649{
1650 if (task_has_rt_policy(p)) {
1651 p->se.load.weight = prio_to_weight[0] * 2;
1652 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1653 return;
1654 }
1655
1656
1657
1658
1659 if (p->policy == SCHED_IDLE) {
1660 p->se.load.weight = WEIGHT_IDLEPRIO;
1661 p->se.load.inv_weight = WMULT_IDLEPRIO;
1662 return;
1663 }
1664
1665 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1666 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1667}
1668
1669static void update_avg(u64 *avg, u64 sample)
1670{
1671 s64 diff = sample - *avg;
1672 *avg += diff >> 3;
1673}
1674
1675static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1676{
1677 sched_info_queued(p);
1678 p->sched_class->enqueue_task(rq, p, wakeup);
1679 p->se.on_rq = 1;
1680}
1681
1682static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1683{
1684 if (sleep && p->se.last_wakeup) {
1685 update_avg(&p->se.avg_overlap,
1686 p->se.sum_exec_runtime - p->se.last_wakeup);
1687 p->se.last_wakeup = 0;
1688 }
1689
1690 sched_info_dequeued(p);
1691 p->sched_class->dequeue_task(rq, p, sleep);
1692 p->se.on_rq = 0;
1693}
1694
1695
1696
1697
1698static inline int __normal_prio(struct task_struct *p)
1699{
1700 return p->static_prio;
1701}
1702
1703
1704
1705
1706
1707
1708
1709
1710static inline int normal_prio(struct task_struct *p)
1711{
1712 int prio;
1713
1714 if (task_has_rt_policy(p))
1715 prio = MAX_RT_PRIO-1 - p->rt_priority;
1716 else
1717 prio = __normal_prio(p);
1718 return prio;
1719}
1720
1721
1722
1723
1724
1725
1726
1727
1728static int effective_prio(struct task_struct *p)
1729{
1730 p->normal_prio = normal_prio(p);
1731
1732
1733
1734
1735
1736 if (!rt_prio(p->prio))
1737 return p->normal_prio;
1738 return p->prio;
1739}
1740
1741
1742
1743
1744static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1745{
1746 if (task_contributes_to_load(p))
1747 rq->nr_uninterruptible--;
1748
1749 enqueue_task(rq, p, wakeup);
1750 inc_nr_running(rq);
1751}
1752
1753
1754
1755
1756static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1757{
1758 if (task_contributes_to_load(p))
1759 rq->nr_uninterruptible++;
1760
1761 dequeue_task(rq, p, sleep);
1762 dec_nr_running(rq);
1763}
1764
1765
1766
1767
1768
1769inline int task_curr(const struct task_struct *p)
1770{
1771 return cpu_curr(task_cpu(p)) == p;
1772}
1773
1774static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1775{
1776 set_task_rq(p, cpu);
1777#ifdef CONFIG_SMP
1778
1779
1780
1781
1782
1783 smp_wmb();
1784 task_thread_info(p)->cpu = cpu;
1785#endif
1786}
1787
1788static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1789 const struct sched_class *prev_class,
1790 int oldprio, int running)
1791{
1792 if (prev_class != p->sched_class) {
1793 if (prev_class->switched_from)
1794 prev_class->switched_from(rq, p, running);
1795 p->sched_class->switched_to(rq, p, running);
1796 } else
1797 p->sched_class->prio_changed(rq, p, oldprio, running);
1798}
1799
1800#ifdef CONFIG_SMP
1801
1802
1803static unsigned long weighted_cpuload(const int cpu)
1804{
1805 return cpu_rq(cpu)->load.weight;
1806}
1807
1808
1809
1810
1811static int
1812task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1813{
1814 s64 delta;
1815
1816
1817
1818
1819 if (sched_feat(CACHE_HOT_BUDDY) &&
1820 (&p->se == cfs_rq_of(&p->se)->next ||
1821 &p->se == cfs_rq_of(&p->se)->last))
1822 return 1;
1823
1824 if (p->sched_class != &fair_sched_class)
1825 return 0;
1826
1827 if (sysctl_sched_migration_cost == -1)
1828 return 1;
1829 if (sysctl_sched_migration_cost == 0)
1830 return 0;
1831
1832 delta = now - p->se.exec_start;
1833
1834 return delta < (s64)sysctl_sched_migration_cost;
1835}
1836
1837
1838void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1839{
1840 int old_cpu = task_cpu(p);
1841 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1842 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1843 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1844 u64 clock_offset;
1845
1846 clock_offset = old_rq->clock - new_rq->clock;
1847
1848#ifdef CONFIG_SCHEDSTATS
1849 if (p->se.wait_start)
1850 p->se.wait_start -= clock_offset;
1851 if (p->se.sleep_start)
1852 p->se.sleep_start -= clock_offset;
1853 if (p->se.block_start)
1854 p->se.block_start -= clock_offset;
1855 if (old_cpu != new_cpu) {
1856 schedstat_inc(p, se.nr_migrations);
1857 if (task_hot(p, old_rq->clock, NULL))
1858 schedstat_inc(p, se.nr_forced2_migrations);
1859 }
1860#endif
1861 p->se.vruntime -= old_cfsrq->min_vruntime -
1862 new_cfsrq->min_vruntime;
1863
1864 __set_task_cpu(p, new_cpu);
1865}
1866
1867struct migration_req {
1868 struct list_head list;
1869
1870 struct task_struct *task;
1871 int dest_cpu;
1872
1873 struct completion done;
1874};
1875
1876
1877
1878
1879
1880static int
1881migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1882{
1883 struct rq *rq = task_rq(p);
1884
1885
1886
1887
1888
1889 if (!p->se.on_rq && !task_running(rq, p)) {
1890 set_task_cpu(p, dest_cpu);
1891 return 0;
1892 }
1893
1894 init_completion(&req->done);
1895 req->task = p;
1896 req->dest_cpu = dest_cpu;
1897 list_add(&req->list, &rq->migration_queue);
1898
1899 return 1;
1900}
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1919{
1920 unsigned long flags;
1921 int running, on_rq;
1922 unsigned long ncsw;
1923 struct rq *rq;
1924
1925 for (;;) {
1926
1927
1928
1929
1930
1931
1932 rq = task_rq(p);
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945 while (task_running(rq, p)) {
1946 if (match_state && unlikely(p->state != match_state))
1947 return 0;
1948 cpu_relax();
1949 }
1950
1951
1952
1953
1954
1955
1956 rq = task_rq_lock(p, &flags);
1957 trace_sched_wait_task(rq, p);
1958 running = task_running(rq, p);
1959 on_rq = p->se.on_rq;
1960 ncsw = 0;
1961 if (!match_state || p->state == match_state)
1962 ncsw = p->nvcsw | LONG_MIN;
1963 task_rq_unlock(rq, &flags);
1964
1965
1966
1967
1968 if (unlikely(!ncsw))
1969 break;
1970
1971
1972
1973
1974
1975
1976
1977 if (unlikely(running)) {
1978 cpu_relax();
1979 continue;
1980 }
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991 if (unlikely(on_rq)) {
1992 schedule_timeout_uninterruptible(1);
1993 continue;
1994 }
1995
1996
1997
1998
1999
2000
2001 break;
2002 }
2003
2004 return ncsw;
2005}
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020void kick_process(struct task_struct *p)
2021{
2022 int cpu;
2023
2024 preempt_disable();
2025 cpu = task_cpu(p);
2026 if ((cpu != smp_processor_id()) && task_curr(p))
2027 smp_send_reschedule(cpu);
2028 preempt_enable();
2029}
2030
2031
2032
2033
2034
2035
2036
2037
2038static unsigned long source_load(int cpu, int type)
2039{
2040 struct rq *rq = cpu_rq(cpu);
2041 unsigned long total = weighted_cpuload(cpu);
2042
2043 if (type == 0 || !sched_feat(LB_BIAS))
2044 return total;
2045
2046 return min(rq->cpu_load[type-1], total);
2047}
2048
2049
2050
2051
2052
2053static unsigned long target_load(int cpu, int type)
2054{
2055 struct rq *rq = cpu_rq(cpu);
2056 unsigned long total = weighted_cpuload(cpu);
2057
2058 if (type == 0 || !sched_feat(LB_BIAS))
2059 return total;
2060
2061 return max(rq->cpu_load[type-1], total);
2062}
2063
2064
2065
2066
2067
2068static struct sched_group *
2069find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2070{
2071 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2072 unsigned long min_load = ULONG_MAX, this_load = 0;
2073 int load_idx = sd->forkexec_idx;
2074 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2075
2076 do {
2077 unsigned long load, avg_load;
2078 int local_group;
2079 int i;
2080
2081
2082 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
2083 continue;
2084
2085 local_group = cpu_isset(this_cpu, group->cpumask);
2086
2087
2088 avg_load = 0;
2089
2090 for_each_cpu_mask_nr(i, group->cpumask) {
2091
2092 if (local_group)
2093 load = source_load(i, load_idx);
2094 else
2095 load = target_load(i, load_idx);
2096
2097 avg_load += load;
2098 }
2099
2100
2101 avg_load = sg_div_cpu_power(group,
2102 avg_load * SCHED_LOAD_SCALE);
2103
2104 if (local_group) {
2105 this_load = avg_load;
2106 this = group;
2107 } else if (avg_load < min_load) {
2108 min_load = avg_load;
2109 idlest = group;
2110 }
2111 } while (group = group->next, group != sd->groups);
2112
2113 if (!idlest || 100*this_load < imbalance*min_load)
2114 return NULL;
2115 return idlest;
2116}
2117
2118
2119
2120
2121static int
2122find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
2123 cpumask_t *tmp)
2124{
2125 unsigned long load, min_load = ULONG_MAX;
2126 int idlest = -1;
2127 int i;
2128
2129
2130 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
2131
2132 for_each_cpu_mask_nr(i, *tmp) {
2133 load = weighted_cpuload(i);
2134
2135 if (load < min_load || (load == min_load && i == this_cpu)) {
2136 min_load = load;
2137 idlest = i;
2138 }
2139 }
2140
2141 return idlest;
2142}
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155static int sched_balance_self(int cpu, int flag)
2156{
2157 struct task_struct *t = current;
2158 struct sched_domain *tmp, *sd = NULL;
2159
2160 for_each_domain(cpu, tmp) {
2161
2162
2163
2164 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2165 break;
2166 if (tmp->flags & flag)
2167 sd = tmp;
2168 }
2169
2170 if (sd)
2171 update_shares(sd);
2172
2173 while (sd) {
2174 cpumask_t span, tmpmask;
2175 struct sched_group *group;
2176 int new_cpu, weight;
2177
2178 if (!(sd->flags & flag)) {
2179 sd = sd->child;
2180 continue;
2181 }
2182
2183 span = sd->span;
2184 group = find_idlest_group(sd, t, cpu);
2185 if (!group) {
2186 sd = sd->child;
2187 continue;
2188 }
2189
2190 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
2191 if (new_cpu == -1 || new_cpu == cpu) {
2192
2193 sd = sd->child;
2194 continue;
2195 }
2196
2197
2198 cpu = new_cpu;
2199 sd = NULL;
2200 weight = cpus_weight(span);
2201 for_each_domain(cpu, tmp) {
2202 if (weight <= cpus_weight(tmp->span))
2203 break;
2204 if (tmp->flags & flag)
2205 sd = tmp;
2206 }
2207
2208 }
2209
2210 return cpu;
2211}
2212
2213#endif
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
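/*
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put @p on the run-queue if it's not already there; returns failure
 * only if the task is already active.
 */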
2229static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2230{
2231 int cpu, orig_cpu, this_cpu, success = 0;
2232 unsigned long flags;
2233 long old_state;
2234 struct rq *rq;
2235
2236 if (!sched_feat(SYNC_WAKEUPS))
2237 sync = 0;
2238
2239#ifdef CONFIG_SMP
2240 if (sched_feat(LB_WAKEUP_UPDATE)) {
2241 struct sched_domain *sd;
2242
2243 this_cpu = raw_smp_processor_id();
2244 cpu = task_cpu(p);
2245
2246 for_each_domain(this_cpu, sd) {
2247 if (cpu_isset(cpu, sd->span)) {
2248 update_shares(sd);
2249 break;
2250 }
2251 }
2252 }
2253#endif
2254
2255 smp_wmb();
2256 rq = task_rq_lock(p, &flags);
2257 old_state = p->state;
2258 if (!(old_state & state))
2259 goto out;
2260
2261 if (p->se.on_rq)
2262 goto out_running;
2263
2264 cpu = task_cpu(p);
2265 orig_cpu = cpu;
2266 this_cpu = smp_processor_id();
2267
2268#ifdef CONFIG_SMP
2269 if (unlikely(task_running(rq, p)))
2270 goto out_activate;
2271
2272 cpu = p->sched_class->select_task_rq(p, sync);
2273 if (cpu != orig_cpu) {
2274 set_task_cpu(p, cpu);
2275 task_rq_unlock(rq, &flags);
2276
2277 rq = task_rq_lock(p, &flags);
2278 old_state = p->state;
2279 if (!(old_state & state))
2280 goto out;
2281 if (p->se.on_rq)
2282 goto out_running;
2283
2284 this_cpu = smp_processor_id();
2285 cpu = task_cpu(p);
2286 }
2287
2288#ifdef CONFIG_SCHEDSTATS
2289 schedstat_inc(rq, ttwu_count);
2290 if (cpu == this_cpu)
2291 schedstat_inc(rq, ttwu_local);
2292 else {
2293 struct sched_domain *sd;
2294 for_each_domain(this_cpu, sd) {
2295 if (cpu_isset(cpu, sd->span)) {
2296 schedstat_inc(sd, ttwu_wake_remote);
2297 break;
2298 }
2299 }
2300 }
2301#endif
2302
2303out_activate:
2304#endif
2305 schedstat_inc(p, se.nr_wakeups);
2306 if (sync)
2307 schedstat_inc(p, se.nr_wakeups_sync);
2308 if (orig_cpu != cpu)
2309 schedstat_inc(p, se.nr_wakeups_migrate);
2310 if (cpu == this_cpu)
2311 schedstat_inc(p, se.nr_wakeups_local);
2312 else
2313 schedstat_inc(p, se.nr_wakeups_remote);
2314 update_rq_clock(rq);
2315 activate_task(rq, p, 1);
2316 success = 1;
2317
2318out_running:
2319 trace_sched_wakeup(rq, p);
2320 check_preempt_curr(rq, p, sync);
2321
2322 p->state = TASK_RUNNING;
2323#ifdef CONFIG_SMP
2324 if (p->sched_class->task_wake_up)
2325 p->sched_class->task_wake_up(rq, p);
2326#endif
2327out:
2328 current->se.last_wakeup = current->se.sum_exec_runtime;
2329
2330 task_rq_unlock(rq, &flags);
2331
2332 return success;
2333}
2334
2335int wake_up_process(struct task_struct *p)
2336{
2337 return try_to_wake_up(p, TASK_ALL, 0);
2338}
2339EXPORT_SYMBOL(wake_up_process);
2340
2341int wake_up_state(struct task_struct *p, unsigned int state)
2342{
2343 return try_to_wake_up(p, state, 0);
2344}
2345
2346
2347
2348
2349
2350
2351
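/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 */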
2352static void __sched_fork(struct task_struct *p)
2353{
2354 p->se.exec_start = 0;
2355 p->se.sum_exec_runtime = 0;
2356 p->se.prev_sum_exec_runtime = 0;
2357 p->se.last_wakeup = 0;
2358 p->se.avg_overlap = 0;
2359
2360#ifdef CONFIG_SCHEDSTATS
2361 p->se.wait_start = 0;
2362 p->se.sum_sleep_runtime = 0;
2363 p->se.sleep_start = 0;
2364 p->se.block_start = 0;
2365 p->se.sleep_max = 0;
2366 p->se.block_max = 0;
2367 p->se.exec_max = 0;
2368 p->se.slice_max = 0;
2369 p->se.wait_max = 0;
2370#endif
2371
2372 INIT_LIST_HEAD(&p->rt.run_list);
2373 p->se.on_rq = 0;
2374 INIT_LIST_HEAD(&p->se.group_node);
2375
2376#ifdef CONFIG_PREEMPT_NOTIFIERS
2377 INIT_HLIST_HEAD(&p->preempt_notifiers);
2378#endif
2379
2380
2381
2382
2383
2384
2385
2386 p->state = TASK_RUNNING;
2387}
2388
2389
2390
2391
2392void sched_fork(struct task_struct *p, int clone_flags)
2393{
2394 int cpu = get_cpu();
2395
2396 __sched_fork(p);
2397
2398#ifdef CONFIG_SMP
2399 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2400#endif
2401 set_task_cpu(p, cpu);
2402
2403
2404
2405
2406 p->prio = current->normal_prio;
2407 if (!rt_prio(p->prio))
2408 p->sched_class = &fair_sched_class;
2409
2410#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2411 if (likely(sched_info_on()))
2412 memset(&p->sched_info, 0, sizeof(p->sched_info));
2413#endif
2414#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2415 p->oncpu = 0;
2416#endif
2417#ifdef CONFIG_PREEMPT
2418
2419 task_thread_info(p)->preempt_count = 1;
2420#endif
2421 put_cpu();
2422}
2423
2424
2425
2426
2427
2428
2429
2430
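/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */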
2431void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2432{
2433 unsigned long flags;
2434 struct rq *rq;
2435
2436 rq = task_rq_lock(p, &flags);
2437 BUG_ON(p->state != TASK_RUNNING);
2438 update_rq_clock(rq);
2439
2440 p->prio = effective_prio(p);
2441
2442 if (!p->sched_class->task_new || !current->se.on_rq) {
2443 activate_task(rq, p, 0);
2444 } else {
2445
2446
2447
2448
2449 p->sched_class->task_new(rq, p);
2450 inc_nr_running(rq);
2451 }
2452 trace_sched_wakeup_new(rq, p);
2453 check_preempt_curr(rq, p, 0);
2454#ifdef CONFIG_SMP
2455 if (p->sched_class->task_wake_up)
2456 p->sched_class->task_wake_up(rq, p);
2457#endif
2458 task_rq_unlock(rq, &flags);
2459}
2460
2461#ifdef CONFIG_PREEMPT_NOTIFIERS
2462
2463
2464
2465
2466
2467void preempt_notifier_register(struct preempt_notifier *notifier)
2468{
2469 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2470}
2471EXPORT_SYMBOL_GPL(preempt_notifier_register);
2472
2473
2474
2475
2476
2477
2478
2479void preempt_notifier_unregister(struct preempt_notifier *notifier)
2480{
2481 hlist_del(&notifier->link);
2482}
2483EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2484
2485static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2486{
2487 struct preempt_notifier *notifier;
2488 struct hlist_node *node;
2489
2490 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2491 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2492}
2493
2494static void
2495fire_sched_out_preempt_notifiers(struct task_struct *curr,
2496 struct task_struct *next)
2497{
2498 struct preempt_notifier *notifier;
2499 struct hlist_node *node;
2500
2501 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2502 notifier->ops->sched_out(notifier, next);
2503}
2504
2505#else
2506
2507static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2508{
2509}
2510
2511static void
2512fire_sched_out_preempt_notifiers(struct task_struct *curr,
2513 struct task_struct *next)
2514{
2515}
2516
2517#endif
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532static inline void
2533prepare_task_switch(struct rq *rq, struct task_struct *prev,
2534 struct task_struct *next)
2535{
2536 fire_sched_out_preempt_notifiers(prev, next);
2537 prepare_lock_switch(rq, next);
2538 prepare_arch_switch(next);
2539}
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2557 __releases(rq->lock)
2558{
2559 struct mm_struct *mm = rq->prev_mm;
2560 long prev_state;
2561
2562 rq->prev_mm = NULL;
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575 prev_state = prev->state;
2576 finish_arch_switch(prev);
2577 finish_lock_switch(rq, prev);
2578#ifdef CONFIG_SMP
2579 if (current->sched_class->post_schedule)
2580 current->sched_class->post_schedule(rq);
2581#endif
2582
2583 fire_sched_in_preempt_notifiers(current);
2584 if (mm)
2585 mmdrop(mm);
2586 if (unlikely(prev_state == TASK_DEAD)) {
2587
2588
2589
2590
2591 kprobe_flush_task(prev);
2592 put_task_struct(prev);
2593 }
2594}
2595
2596
2597
2598
2599
2600asmlinkage void schedule_tail(struct task_struct *prev)
2601 __releases(rq->lock)
2602{
2603 struct rq *rq = this_rq();
2604
2605 finish_task_switch(rq, prev);
2606#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2607
2608 preempt_enable();
2609#endif
2610 if (current->set_child_tid)
2611 put_user(task_pid_vnr(current), current->set_child_tid);
2612}
2613
2614
2615
2616
2617
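/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */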
2618static inline void
2619context_switch(struct rq *rq, struct task_struct *prev,
2620 struct task_struct *next)
2621{
2622 struct mm_struct *mm, *oldmm;
2623
2624 prepare_task_switch(rq, prev, next);
2625 trace_sched_switch(rq, prev, next);
2626 mm = next->mm;
2627 oldmm = prev->active_mm;
2628
2629
2630
2631
2632
2633 arch_enter_lazy_cpu_mode();
2634
2635 if (unlikely(!mm)) {
2636 next->active_mm = oldmm;
2637 atomic_inc(&oldmm->mm_count);
2638 enter_lazy_tlb(oldmm, next);
2639 } else
2640 switch_mm(oldmm, mm, next);
2641
2642 if (unlikely(!prev->mm)) {
2643 prev->active_mm = NULL;
2644 rq->prev_mm = oldmm;
2645 }
2646
2647
2648
2649
2650
2651
2652#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2653 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2654#endif
2655
2656
2657 switch_to(prev, next, prev);
2658
2659 barrier();
2660
2661
2662
2663
2664
2665 finish_task_switch(this_rq(), prev);
2666}
2667
2668
2669
2670
2671
2672
2673
2674
2675unsigned long nr_running(void)
2676{
2677 unsigned long i, sum = 0;
2678
2679 for_each_online_cpu(i)
2680 sum += cpu_rq(i)->nr_running;
2681
2682 return sum;
2683}
2684
2685unsigned long nr_uninterruptible(void)
2686{
2687 unsigned long i, sum = 0;
2688
2689 for_each_possible_cpu(i)
2690 sum += cpu_rq(i)->nr_uninterruptible;
2691
2692
2693
2694
2695
2696 if (unlikely((long)sum < 0))
2697 sum = 0;
2698
2699 return sum;
2700}
2701
2702unsigned long long nr_context_switches(void)
2703{
2704 int i;
2705 unsigned long long sum = 0;
2706
2707 for_each_possible_cpu(i)
2708 sum += cpu_rq(i)->nr_switches;
2709
2710 return sum;
2711}
2712
2713unsigned long nr_iowait(void)
2714{
2715 unsigned long i, sum = 0;
2716
2717 for_each_possible_cpu(i)
2718 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2719
2720 return sum;
2721}
2722
2723unsigned long nr_active(void)
2724{
2725 unsigned long i, running = 0, uninterruptible = 0;
2726
2727 for_each_online_cpu(i) {
2728 running += cpu_rq(i)->nr_running;
2729 uninterruptible += cpu_rq(i)->nr_uninterruptible;
2730 }
2731
2732 if (unlikely((long)uninterruptible < 0))
2733 uninterruptible = 0;
2734
2735 return running + uninterruptible;
2736}
2737
2738
2739
2740
2741
2742static void update_cpu_load(struct rq *this_rq)
2743{
2744 unsigned long this_load = this_rq->load.weight;
2745 int i, scale;
2746
2747 this_rq->nr_load_updates++;
2748
2749
2750 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2751 unsigned long old_load, new_load;
2752
2753
2754
2755 old_load = this_rq->cpu_load[i];
2756 new_load = this_load;
2757
2758
2759
2760
2761
2762 if (new_load > old_load)
2763 new_load += scale-1;
2764 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2765 }
2766}
2767
2768#ifdef CONFIG_SMP
2769
2770
2771
2772
2773
2774
2775
2776static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2777 __acquires(rq1->lock)
2778 __acquires(rq2->lock)
2779{
2780 BUG_ON(!irqs_disabled());
2781 if (rq1 == rq2) {
2782 spin_lock(&rq1->lock);
2783 __acquire(rq2->lock);
2784 } else {
2785 if (rq1 < rq2) {
2786 spin_lock(&rq1->lock);
2787 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2788 } else {
2789 spin_lock(&rq2->lock);
2790 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2791 }
2792 }
2793 update_rq_clock(rq1);
2794 update_rq_clock(rq2);
2795}
2796
2797
2798
2799
2800
2801
2802
2803static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2804 __releases(rq1->lock)
2805 __releases(rq2->lock)
2806{
2807 spin_unlock(&rq1->lock);
2808 if (rq1 != rq2)
2809 spin_unlock(&rq2->lock);
2810 else
2811 __release(rq2->lock);
2812}
2813
2814
2815
2816
2817static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2818 __releases(this_rq->lock)
2819 __acquires(busiest->lock)
2820 __acquires(this_rq->lock)
2821{
2822 int ret = 0;
2823
2824 if (unlikely(!irqs_disabled())) {
2825
2826 spin_unlock(&this_rq->lock);
2827 BUG_ON(1);
2828 }
2829 if (unlikely(!spin_trylock(&busiest->lock))) {
2830 if (busiest < this_rq) {
2831 spin_unlock(&this_rq->lock);
2832 spin_lock(&busiest->lock);
2833 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2834 ret = 1;
2835 } else
2836 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2837 }
2838 return ret;
2839}
2840
2841static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2842 __releases(busiest->lock)
2843{
2844 spin_unlock(&busiest->lock);
2845 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2846}
2847
2848
2849
2850
2851
2852
2853
2854static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2855{
2856 struct migration_req req;
2857 unsigned long flags;
2858 struct rq *rq;
2859
2860 rq = task_rq_lock(p, &flags);
2861 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2862 || unlikely(!cpu_active(dest_cpu)))
2863 goto out;
2864
2865 trace_sched_migrate_task(rq, p, dest_cpu);
2866
2867 if (migrate_task(p, dest_cpu, &req)) {
2868
2869 struct task_struct *mt = rq->migration_thread;
2870
2871 get_task_struct(mt);
2872 task_rq_unlock(rq, &flags);
2873 wake_up_process(mt);
2874 put_task_struct(mt);
2875 wait_for_completion(&req.done);
2876
2877 return;
2878 }
2879out:
2880 task_rq_unlock(rq, &flags);
2881}
2882
2883
2884
2885
2886
2887void sched_exec(void)
2888{
2889 int new_cpu, this_cpu = get_cpu();
2890 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2891 put_cpu();
2892 if (new_cpu != this_cpu)
2893 sched_migrate_task(current, new_cpu);
2894}
2895
2896
2897
2898
2899
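/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */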
2900static void pull_task(struct rq *src_rq, struct task_struct *p,
2901 struct rq *this_rq, int this_cpu)
2902{
2903 deactivate_task(src_rq, p, 0);
2904 set_task_cpu(p, this_cpu);
2905 activate_task(this_rq, p, 0);
2906
2907
2908
2909
2910 check_preempt_curr(this_rq, p, 0);
2911}
2912
2913
2914
2915
2916static
2917int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2918 struct sched_domain *sd, enum cpu_idle_type idle,
2919 int *all_pinned)
2920{
2921
2922
2923
2924
2925
2926
2927 if (!cpu_isset(this_cpu, p->cpus_allowed)) {
2928 schedstat_inc(p, se.nr_failed_migrations_affine);
2929 return 0;
2930 }
2931 *all_pinned = 0;
2932
2933 if (task_running(rq, p)) {
2934 schedstat_inc(p, se.nr_failed_migrations_running);
2935 return 0;
2936 }
2937
2938
2939
2940
2941
2942
2943
2944 if (!task_hot(p, rq->clock, sd) ||
2945 sd->nr_balance_failed > sd->cache_nice_tries) {
2946#ifdef CONFIG_SCHEDSTATS
2947 if (task_hot(p, rq->clock, sd)) {
2948 schedstat_inc(sd, lb_hot_gained[idle]);
2949 schedstat_inc(p, se.nr_forced_migrations);
2950 }
2951#endif
2952 return 1;
2953 }
2954
2955 if (task_hot(p, rq->clock, sd)) {
2956 schedstat_inc(p, se.nr_failed_migrations_hot);
2957 return 0;
2958 }
2959 return 1;
2960}
2961
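/*
 * balance_tasks - iterate the busiest runqueue via the class iterator and
 * pull up to max_load_move weighted load over to this_rq, stopping after
 * sysctl_sched_nr_migrate tasks.  Returns the amount of load moved.
 */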
2962static unsigned long
2963balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2964 unsigned long max_load_move, struct sched_domain *sd,
2965 enum cpu_idle_type idle, int *all_pinned,
2966 int *this_best_prio, struct rq_iterator *iterator)
2967{
2968 int loops = 0, pulled = 0, pinned = 0;
2969 struct task_struct *p;
2970 long rem_load_move = max_load_move;
2971
2972 if (max_load_move == 0)
2973 goto out;
2974
2975 pinned = 1;
2976
2977
2978
2979
2980 p = iterator->start(iterator->arg);
2981next:
2982 if (!p || loops++ > sysctl_sched_nr_migrate)
2983 goto out;
2984
2985 if ((p->se.load.weight >> 1) > rem_load_move ||
2986 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2987 p = iterator->next(iterator->arg);
2988 goto next;
2989 }
2990
2991 pull_task(busiest, p, this_rq, this_cpu);
2992 pulled++;
2993 rem_load_move -= p->se.load.weight;
2994
2995
2996
2997
2998 if (rem_load_move > 0) {
2999 if (p->prio < *this_best_prio)
3000 *this_best_prio = p->prio;
3001 p = iterator->next(iterator->arg);
3002 goto next;
3003 }
3004out:
3005
3006
3007
3008
3009
3010 schedstat_add(sd, lb_gained[idle], pulled);
3011
3012 if (all_pinned)
3013 *all_pinned = pinned;
3014
3015 return max_load_move - rem_load_move;
3016}
3017
3018
3019
3020
3021
3022
3023
3024
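/*
 * move_tasks - try to move up to max_load_move weighted load from busiest
 * to this_rq, asking each scheduling class in priority order.  Called
 * with both runqueues locked.  Returns 1 if any load was moved.
 */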
3025static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3026 unsigned long max_load_move,
3027 struct sched_domain *sd, enum cpu_idle_type idle,
3028 int *all_pinned)
3029{
3030 const struct sched_class *class = sched_class_highest;
3031 unsigned long total_load_moved = 0;
3032 int this_best_prio = this_rq->curr->prio;
3033
3034 do {
3035 total_load_moved +=
3036 class->load_balance(this_rq, this_cpu, busiest,
3037 max_load_move - total_load_moved,
3038 sd, idle, all_pinned, &this_best_prio);
3039 class = class->next;
3040
3041 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3042 break;
3043
3044 } while (class && max_load_move > total_load_moved);
3045
3046 return total_load_moved > 0;
3047}
3048
3049static int
3050iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3051 struct sched_domain *sd, enum cpu_idle_type idle,
3052 struct rq_iterator *iterator)
3053{
3054 struct task_struct *p = iterator->start(iterator->arg);
3055 int pinned = 0;
3056
3057 while (p) {
3058 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3059 pull_task(busiest, p, this_rq, this_cpu);
3060
3061
3062
3063
3064
3065 schedstat_inc(sd, lb_gained[idle]);
3066
3067 return 1;
3068 }
3069 p = iterator->next(iterator->arg);
3070 }
3071
3072 return 0;
3073}
3074
3075
3076
3077
3078
3079
3080
3081
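/*
 * move_one_task - try to move exactly one task from busiest to this_rq,
 * asking each scheduling class in turn.  Called with both runqueues
 * locked.  Returns 1 on success, 0 if nothing could be moved.
 */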
3082static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3083 struct sched_domain *sd, enum cpu_idle_type idle)
3084{
3085 const struct sched_class *class;
3086
3087 for (class = sched_class_highest; class; class = class->next)
3088 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3089 return 1;
3090
3091 return 0;
3092}
3093
3094
3095
3096
3097
3098
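/*
 * find_busiest_group - find the busiest group within the sched domain and
 * calculate the amount of weighted load that should be moved to this_cpu
 * to restore balance.  Returns NULL (and *imbalance = 0) if the domain is
 * already balanced or if balancing should be done by a different CPU of
 * the local group.
 */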
3099static struct sched_group *
3100find_busiest_group(struct sched_domain *sd, int this_cpu,
3101 unsigned long *imbalance, enum cpu_idle_type idle,
3102 int *sd_idle, const cpumask_t *cpus, int *balance)
3103{
3104 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3105 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
3106 unsigned long max_pull;
3107 unsigned long busiest_load_per_task, busiest_nr_running;
3108 unsigned long this_load_per_task, this_nr_running;
3109 int load_idx, group_imb = 0;
3110#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3111 int power_savings_balance = 1;
3112 unsigned long leader_nr_running = 0, min_load_per_task = 0;
3113 unsigned long min_nr_running = ULONG_MAX;
3114 struct sched_group *group_min = NULL, *group_leader = NULL;
3115#endif
3116
3117 max_load = this_load = total_load = total_pwr = 0;
3118 busiest_load_per_task = busiest_nr_running = 0;
3119 this_load_per_task = this_nr_running = 0;
3120
3121 if (idle == CPU_NOT_IDLE)
3122 load_idx = sd->busy_idx;
3123 else if (idle == CPU_NEWLY_IDLE)
3124 load_idx = sd->newidle_idx;
3125 else
3126 load_idx = sd->idle_idx;
3127
3128 do {
3129 unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
3130 int local_group;
3131 int i;
3132 int __group_imb = 0;
3133 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3134 unsigned long sum_nr_running, sum_weighted_load;
3135 unsigned long sum_avg_load_per_task;
3136 unsigned long avg_load_per_task;
3137
3138 local_group = cpu_isset(this_cpu, group->cpumask);
3139
3140 if (local_group)
3141 balance_cpu = first_cpu(group->cpumask);
3142
3143
3144 sum_weighted_load = sum_nr_running = avg_load = 0;
3145 sum_avg_load_per_task = avg_load_per_task = 0;
3146
3147 max_cpu_load = 0;
3148 min_cpu_load = ~0UL;
3149
3150 for_each_cpu_mask_nr(i, group->cpumask) {
3151 struct rq *rq;
3152
3153 if (!cpu_isset(i, *cpus))
3154 continue;
3155
3156 rq = cpu_rq(i);
3157
3158 if (*sd_idle && rq->nr_running)
3159 *sd_idle = 0;
3160
3161
3162 if (local_group) {
3163 if (idle_cpu(i) && !first_idle_cpu) {
3164 first_idle_cpu = 1;
3165 balance_cpu = i;
3166 }
3167
3168 load = target_load(i, load_idx);
3169 } else {
3170 load = source_load(i, load_idx);
3171 if (load > max_cpu_load)
3172 max_cpu_load = load;
3173 if (min_cpu_load > load)
3174 min_cpu_load = load;
3175 }
3176
3177 avg_load += load;
3178 sum_nr_running += rq->nr_running;
3179 sum_weighted_load += weighted_cpuload(i);
3180
3181 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3182 }
3183
3184
3185
3186
3187
3188
3189
3190 if (idle != CPU_NEWLY_IDLE && local_group &&
3191 balance_cpu != this_cpu && balance) {
3192 *balance = 0;
3193 goto ret;
3194 }
3195
3196 total_load += avg_load;
3197 total_pwr += group->__cpu_power;
3198
3199
3200 avg_load = sg_div_cpu_power(group,
3201 avg_load * SCHED_LOAD_SCALE);
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213 avg_load_per_task = sg_div_cpu_power(group,
3214 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3215
3216 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3217 __group_imb = 1;
3218
3219 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3220
3221 if (local_group) {
3222 this_load = avg_load;
3223 this = group;
3224 this_nr_running = sum_nr_running;
3225 this_load_per_task = sum_weighted_load;
3226 } else if (avg_load > max_load &&
3227 (sum_nr_running > group_capacity || __group_imb)) {
3228 max_load = avg_load;
3229 busiest = group;
3230 busiest_nr_running = sum_nr_running;
3231 busiest_load_per_task = sum_weighted_load;
3232 group_imb = __group_imb;
3233 }
3234
3235#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3236
3237
3238
3239
3240 if (idle == CPU_NOT_IDLE ||
3241 !(sd->flags & SD_POWERSAVINGS_BALANCE))
3242 goto group_next;
3243
3244
3245
3246
3247
3248 if (local_group && (this_nr_running >= group_capacity ||
3249 !this_nr_running))
3250 power_savings_balance = 0;
3251
3252
3253
3254
3255
3256 if (!power_savings_balance || sum_nr_running >= group_capacity
3257 || !sum_nr_running)
3258 goto group_next;
3259
3260
3261
3262
3263
3264
3265 if ((sum_nr_running < min_nr_running) ||
3266 (sum_nr_running == min_nr_running &&
3267 first_cpu(group->cpumask) <
3268 first_cpu(group_min->cpumask))) {
3269 group_min = group;
3270 min_nr_running = sum_nr_running;
3271 min_load_per_task = sum_weighted_load /
3272 sum_nr_running;
3273 }
3274
3275
3276
3277
3278
3279
3280 if (sum_nr_running <= group_capacity - 1) {
3281 if (sum_nr_running > leader_nr_running ||
3282 (sum_nr_running == leader_nr_running &&
3283 first_cpu(group->cpumask) >
3284 first_cpu(group_leader->cpumask))) {
3285 group_leader = group;
3286 leader_nr_running = sum_nr_running;
3287 }
3288 }
3289group_next:
3290#endif
3291 group = group->next;
3292 } while (group != sd->groups);
3293
3294 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
3295 goto out_balanced;
3296
3297 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3298
3299 if (this_load >= avg_load ||
3300 100*max_load <= sd->imbalance_pct*this_load)
3301 goto out_balanced;
3302
3303 busiest_load_per_task /= busiest_nr_running;
3304 if (group_imb)
3305 busiest_load_per_task = min(busiest_load_per_task, avg_load);
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318 if (max_load <= busiest_load_per_task)
3319 goto out_balanced;
3320
3321
3322
3323
3324
3325
3326 if (max_load < avg_load) {
3327 *imbalance = 0;
3328 goto small_imbalance;
3329 }
3330
3331
3332 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
3333
3334
3335 *imbalance = min(max_pull * busiest->__cpu_power,
3336 (avg_load - this_load) * this->__cpu_power)
3337 / SCHED_LOAD_SCALE;
3338
3339
3340
3341
3342
3343
3344
3345 if (*imbalance < busiest_load_per_task) {
3346 unsigned long tmp, pwr_now, pwr_move;
3347 unsigned int imbn;
3348
3349small_imbalance:
3350 pwr_move = pwr_now = 0;
3351 imbn = 2;
3352 if (this_nr_running) {
3353 this_load_per_task /= this_nr_running;
3354 if (busiest_load_per_task > this_load_per_task)
3355 imbn = 1;
3356 } else
3357 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3358
3359 if (max_load - this_load + busiest_load_per_task >=
3360 busiest_load_per_task * imbn) {
3361 *imbalance = busiest_load_per_task;
3362 return busiest;
3363 }
3364
3365
3366
3367
3368
3369
3370
3371 pwr_now += busiest->__cpu_power *
3372 min(busiest_load_per_task, max_load);
3373 pwr_now += this->__cpu_power *
3374 min(this_load_per_task, this_load);
3375 pwr_now /= SCHED_LOAD_SCALE;
3376
3377
3378 tmp = sg_div_cpu_power(busiest,
3379 busiest_load_per_task * SCHED_LOAD_SCALE);
3380 if (max_load > tmp)
3381 pwr_move += busiest->__cpu_power *
3382 min(busiest_load_per_task, max_load - tmp);
3383
3384
3385 if (max_load * busiest->__cpu_power <
3386 busiest_load_per_task * SCHED_LOAD_SCALE)
3387 tmp = sg_div_cpu_power(this,
3388 max_load * busiest->__cpu_power);
3389 else
3390 tmp = sg_div_cpu_power(this,
3391 busiest_load_per_task * SCHED_LOAD_SCALE);
3392 pwr_move += this->__cpu_power *
3393 min(this_load_per_task, this_load + tmp);
3394 pwr_move /= SCHED_LOAD_SCALE;
3395
3396
3397 if (pwr_move > pwr_now)
3398 *imbalance = busiest_load_per_task;
3399 }
3400
3401 return busiest;
3402
3403out_balanced:
3404#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3405 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3406 goto ret;
3407
3408 if (this == group_leader && group_leader != group_min) {
3409 *imbalance = min_load_per_task;
3410 return group_min;
3411 }
3412#endif
3413ret:
3414 *imbalance = 0;
3415 return NULL;
3416}
3417
3418
3419
3420
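/*
 * find_busiest_queue - find the busiest runqueue among the CPUs in the
 * group, skipping CPUs whose single running task already exceeds the
 * requested imbalance.
 */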
3421static struct rq *
3422find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3423 unsigned long imbalance, const cpumask_t *cpus)
3424{
3425 struct rq *busiest = NULL, *rq;
3426 unsigned long max_load = 0;
3427 int i;
3428
3429 for_each_cpu_mask_nr(i, group->cpumask) {
3430 unsigned long wl;
3431
3432 if (!cpu_isset(i, *cpus))
3433 continue;
3434
3435 rq = cpu_rq(i);
3436 wl = weighted_cpuload(i);
3437
3438 if (rq->nr_running == 1 && wl > imbalance)
3439 continue;
3440
3441 if (wl > max_load) {
3442 max_load = wl;
3443 busiest = rq;
3444 }
3445 }
3446
3447 return busiest;
3448}
3449
3450
3451
3452
3453
3454#define MAX_PINNED_INTERVAL 512
3455
3456
3457
3458
3459
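/*
 * load_balance - check this_cpu's domain for an imbalance and try to move
 * tasks from the busiest runqueue found.  If repeated attempts fail, the
 * busiest CPU's migration thread is asked to actively push a running
 * task over (active balancing).
 */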
3460static int load_balance(int this_cpu, struct rq *this_rq,
3461 struct sched_domain *sd, enum cpu_idle_type idle,
3462 int *balance, cpumask_t *cpus)
3463{
3464 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3465 struct sched_group *group;
3466 unsigned long imbalance;
3467 struct rq *busiest;
3468 unsigned long flags;
3469
3470 cpus_setall(*cpus);
3471
3472
3473
3474
3475
3476
3477
3478 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3479 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3480 sd_idle = 1;
3481
3482 schedstat_inc(sd, lb_count[idle]);
3483
3484redo:
3485 update_shares(sd);
3486 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3487 cpus, balance);
3488
3489 if (*balance == 0)
3490 goto out_balanced;
3491
3492 if (!group) {
3493 schedstat_inc(sd, lb_nobusyg[idle]);
3494 goto out_balanced;
3495 }
3496
3497 busiest = find_busiest_queue(group, idle, imbalance, cpus);
3498 if (!busiest) {
3499 schedstat_inc(sd, lb_nobusyq[idle]);
3500 goto out_balanced;
3501 }
3502
3503 BUG_ON(busiest == this_rq);
3504
3505 schedstat_add(sd, lb_imbalance[idle], imbalance);
3506
3507 ld_moved = 0;
3508 if (busiest->nr_running > 1) {
3509
3510
3511
3512
3513
3514
3515 local_irq_save(flags);
3516 double_rq_lock(this_rq, busiest);
3517 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3518 imbalance, sd, idle, &all_pinned);
3519 double_rq_unlock(this_rq, busiest);
3520 local_irq_restore(flags);
3521
3522
3523
3524
3525 if (ld_moved && this_cpu != smp_processor_id())
3526 resched_cpu(this_cpu);
3527
3528
3529 if (unlikely(all_pinned)) {
3530 cpu_clear(cpu_of(busiest), *cpus);
3531 if (!cpus_empty(*cpus))
3532 goto redo;
3533 goto out_balanced;
3534 }
3535 }
3536
3537 if (!ld_moved) {
3538 schedstat_inc(sd, lb_failed[idle]);
3539 sd->nr_balance_failed++;
3540
3541 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
3542
3543 spin_lock_irqsave(&busiest->lock, flags);
3544
3545
3546
3547
3548 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
3549 spin_unlock_irqrestore(&busiest->lock, flags);
3550 all_pinned = 1;
3551 goto out_one_pinned;
3552 }
3553
3554 if (!busiest->active_balance) {
3555 busiest->active_balance = 1;
3556 busiest->push_cpu = this_cpu;
3557 active_balance = 1;
3558 }
3559 spin_unlock_irqrestore(&busiest->lock, flags);
3560 if (active_balance)
3561 wake_up_process(busiest->migration_thread);
3562
3563
3564
3565
3566
3567 sd->nr_balance_failed = sd->cache_nice_tries+1;
3568 }
3569 } else
3570 sd->nr_balance_failed = 0;
3571
3572 if (likely(!active_balance)) {
3573
3574 sd->balance_interval = sd->min_interval;
3575 } else {
3576
3577
3578
3579
3580
3581
3582 if (sd->balance_interval < sd->max_interval)
3583 sd->balance_interval *= 2;
3584 }
3585
3586 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3587 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3588 ld_moved = -1;
3589
3590 goto out;
3591
3592out_balanced:
3593 schedstat_inc(sd, lb_balanced[idle]);
3594
3595 sd->nr_balance_failed = 0;
3596
3597out_one_pinned:
3598
3599 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3600 (sd->balance_interval < sd->max_interval))
3601 sd->balance_interval *= 2;
3602
3603 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3604 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3605 ld_moved = -1;
3606 else
3607 ld_moved = 0;
3608out:
3609 if (ld_moved)
3610 update_shares(sd);
3611 return ld_moved;
3612}
3613
3614
3615
3616
3617
3618
3619
3620
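/*
 * load_balance_newidle - variant of load_balance() used when this CPU is
 * about to become idle (CPU_NEWLY_IDLE).  Called from schedule() via
 * idle_balance() with this_rq already locked.
 */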
3621static int
3622load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3623 cpumask_t *cpus)
3624{
3625 struct sched_group *group;
3626 struct rq *busiest = NULL;
3627 unsigned long imbalance;
3628 int ld_moved = 0;
3629 int sd_idle = 0;
3630 int all_pinned = 0;
3631
3632 cpus_setall(*cpus);
3633
3634
3635
3636
3637
3638
3639
3640 if (sd->flags & SD_SHARE_CPUPOWER &&
3641 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3642 sd_idle = 1;
3643
3644 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3645redo:
3646 update_shares_locked(this_rq, sd);
3647 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3648 &sd_idle, cpus, NULL);
3649 if (!group) {
3650 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
3651 goto out_balanced;
3652 }
3653
3654 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
3655 if (!busiest) {
3656 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
3657 goto out_balanced;
3658 }
3659
3660 BUG_ON(busiest == this_rq);
3661
3662 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
3663
3664 ld_moved = 0;
3665 if (busiest->nr_running > 1) {
3666
3667 double_lock_balance(this_rq, busiest);
3668
3669 update_rq_clock(busiest);
3670 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3671 imbalance, sd, CPU_NEWLY_IDLE,
3672 &all_pinned);
3673 double_unlock_balance(this_rq, busiest);
3674
3675 if (unlikely(all_pinned)) {
3676 cpu_clear(cpu_of(busiest), *cpus);
3677 if (!cpus_empty(*cpus))
3678 goto redo;
3679 }
3680 }
3681
3682 if (!ld_moved) {
3683 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
3684 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3685 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3686 return -1;
3687 } else
3688 sd->nr_balance_failed = 0;
3689
3690 update_shares_locked(this_rq, sd);
3691 return ld_moved;
3692
3693out_balanced:
3694 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
3695 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3696 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3697 return -1;
3698 sd->nr_balance_failed = 0;
3699
3700 return 0;
3701}
3702
3703
3704
3705
3706
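/*
 * idle_balance - called by schedule() when this runqueue is about to
 * become idle; walks the sched domains and tries to pull a task from
 * another CPU before letting the CPU go idle.
 */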
3707static void idle_balance(int this_cpu, struct rq *this_rq)
3708{
3709 struct sched_domain *sd;
3710 int pulled_task = -1;
3711 unsigned long next_balance = jiffies + HZ;
3712 cpumask_t tmpmask;
3713
3714 for_each_domain(this_cpu, sd) {
3715 unsigned long interval;
3716
3717 if (!(sd->flags & SD_LOAD_BALANCE))
3718 continue;
3719
3720 if (sd->flags & SD_BALANCE_NEWIDLE)
3721
3722 pulled_task = load_balance_newidle(this_cpu, this_rq,
3723 sd, &tmpmask);
3724
3725 interval = msecs_to_jiffies(sd->balance_interval);
3726 if (time_after(next_balance, sd->last_balance + interval))
3727 next_balance = sd->last_balance + interval;
3728 if (pulled_task)
3729 break;
3730 }
3731 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3732
3733
3734
3735
3736 this_rq->next_balance = next_balance;
3737 }
3738}
3739
3740
3741
3742
3743
3744
3745
3746
3747
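/*
 * active_load_balance - push one running task from the busiest runqueue
 * to the CPU recorded in push_cpu.  Run from the busiest CPU's migration
 * thread with busiest_rq locked.
 */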
3748static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3749{
3750 int target_cpu = busiest_rq->push_cpu;
3751 struct sched_domain *sd;
3752 struct rq *target_rq;
3753
3754
3755 if (busiest_rq->nr_running <= 1)
3756 return;
3757
3758 target_rq = cpu_rq(target_cpu);
3759
3760
3761
3762
3763
3764
3765 BUG_ON(busiest_rq == target_rq);
3766
3767
3768 double_lock_balance(busiest_rq, target_rq);
3769 update_rq_clock(busiest_rq);
3770 update_rq_clock(target_rq);
3771
3772
3773 for_each_domain(target_cpu, sd) {
3774 if ((sd->flags & SD_LOAD_BALANCE) &&
3775 cpu_isset(busiest_cpu, sd->span))
3776 break;
3777 }
3778
3779 if (likely(sd)) {
3780 schedstat_inc(sd, alb_count);
3781
3782 if (move_one_task(target_rq, target_cpu, busiest_rq,
3783 sd, CPU_IDLE))
3784 schedstat_inc(sd, alb_pushed);
3785 else
3786 schedstat_inc(sd, alb_failed);
3787 }
3788 double_unlock_balance(busiest_rq, target_rq);
3789}
3790
3791#ifdef CONFIG_NO_HZ
3792static struct {
3793 atomic_t load_balancer;
3794 cpumask_t cpu_mask;
3795} nohz ____cacheline_aligned = {
3796 .load_balancer = ATOMIC_INIT(-1),
3797 .cpu_mask = CPU_MASK_NONE,
3798};
3799
3819
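/*
 * select_nohz_load_balancer - elect or retire this CPU as the "idle load
 * balancer" that does load balancing on behalf of all the other tickless
 * idle CPUs.  Called when a CPU enters (stop_tick = 1) or leaves
 * (stop_tick = 0) nohz idle mode.  Returns 1 if this CPU should keep its
 * tick alive to perform that duty, 0 otherwise.
 */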
3820int select_nohz_load_balancer(int stop_tick)
3821{
3822 int cpu = smp_processor_id();
3823
3824 if (stop_tick) {
3825 cpu_set(cpu, nohz.cpu_mask);
3826 cpu_rq(cpu)->in_nohz_recently = 1;
3827
3828
3829
3830
3831 if (!cpu_active(cpu) &&
3832 atomic_read(&nohz.load_balancer) == cpu) {
3833 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3834 BUG();
3835 return 0;
3836 }
3837
3838
3839 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3840 if (atomic_read(&nohz.load_balancer) == cpu)
3841 atomic_set(&nohz.load_balancer, -1);
3842 return 0;
3843 }
3844
3845 if (atomic_read(&nohz.load_balancer) == -1) {
3846
3847 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3848 return 1;
3849 } else if (atomic_read(&nohz.load_balancer) == cpu)
3850 return 1;
3851 } else {
3852 if (!cpu_isset(cpu, nohz.cpu_mask))
3853 return 0;
3854
3855 cpu_clear(cpu, nohz.cpu_mask);
3856
3857 if (atomic_read(&nohz.load_balancer) == cpu)
3858 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3859 BUG();
3860 }
3861 return 0;
3862}
3863#endif
3864
3865static DEFINE_SPINLOCK(balancing);
3866
3867
3868
3869
3870
3871
3872
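/*
 * rebalance_domains - walk this CPU's sched domains and run
 * load_balance() in every domain whose balance interval has expired,
 * serializing the domains that have SD_SERIALIZE set via the global
 * "balancing" spinlock.
 */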
3873static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3874{
3875 int balance = 1;
3876 struct rq *rq = cpu_rq(cpu);
3877 unsigned long interval;
3878 struct sched_domain *sd;
3879
3880 unsigned long next_balance = jiffies + 60*HZ;
3881 int update_next_balance = 0;
3882 int need_serialize;
3883 cpumask_t tmp;
3884
3885 for_each_domain(cpu, sd) {
3886 if (!(sd->flags & SD_LOAD_BALANCE))
3887 continue;
3888
3889 interval = sd->balance_interval;
3890 if (idle != CPU_IDLE)
3891 interval *= sd->busy_factor;
3892
3893
3894 interval = msecs_to_jiffies(interval);
3895 if (unlikely(!interval))
3896 interval = 1;
3897 if (interval > HZ*NR_CPUS/10)
3898 interval = HZ*NR_CPUS/10;
3899
3900 need_serialize = sd->flags & SD_SERIALIZE;
3901
3902 if (need_serialize) {
3903 if (!spin_trylock(&balancing))
3904 goto out;
3905 }
3906
3907 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3908 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
3909
3910
3911
3912
3913
3914 idle = CPU_NOT_IDLE;
3915 }
3916 sd->last_balance = jiffies;
3917 }
3918 if (need_serialize)
3919 spin_unlock(&balancing);
3920out:
3921 if (time_after(next_balance, sd->last_balance + interval)) {
3922 next_balance = sd->last_balance + interval;
3923 update_next_balance = 1;
3924 }
3925
3926
3927
3928
3929
3930
3931 if (!balance)
3932 break;
3933 }
3934
3935
3936
3937
3938
3939
3940 if (likely(update_next_balance))
3941 rq->next_balance = next_balance;
3942}
3943
3944
3945
3946
3947
3948
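/*
 * run_rebalance_domains - SCHED_SOFTIRQ handler: rebalance the local CPU
 * and, when this CPU is the nohz idle load balancer, rebalance on behalf
 * of the other idle tickless CPUs as well.
 */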
3949static void run_rebalance_domains(struct softirq_action *h)
3950{
3951 int this_cpu = smp_processor_id();
3952 struct rq *this_rq = cpu_rq(this_cpu);
3953 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3954 CPU_IDLE : CPU_NOT_IDLE;
3955
3956 rebalance_domains(this_cpu, idle);
3957
3958#ifdef CONFIG_NO_HZ
3959
3960
3961
3962
3963
3964 if (this_rq->idle_at_tick &&
3965 atomic_read(&nohz.load_balancer) == this_cpu) {
3966 cpumask_t cpus = nohz.cpu_mask;
3967 struct rq *rq;
3968 int balance_cpu;
3969
3970 cpu_clear(this_cpu, cpus);
3971 for_each_cpu_mask_nr(balance_cpu, cpus) {
3972
3973
3974
3975
3976
3977 if (need_resched())
3978 break;
3979
3980 rebalance_domains(balance_cpu, CPU_IDLE);
3981
3982 rq = cpu_rq(balance_cpu);
3983 if (time_after(this_rq->next_balance, rq->next_balance))
3984 this_rq->next_balance = rq->next_balance;
3985 }
3986 }
3987#endif
3988}
3989
3990
3991
3992
3993
3994
3995
3996
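/*
 * trigger_load_balance - called from scheduler_tick(); raises
 * SCHED_SOFTIRQ when the runqueue's next_balance time has passed, and
 * handles handing over or kicking the nohz idle load balancer.
 */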
3997static inline void trigger_load_balance(struct rq *rq, int cpu)
3998{
3999#ifdef CONFIG_NO_HZ
4000
4001
4002
4003
4004
4005 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4006 rq->in_nohz_recently = 0;
4007
4008 if (atomic_read(&nohz.load_balancer) == cpu) {
4009 cpu_clear(cpu, nohz.cpu_mask);
4010 atomic_set(&nohz.load_balancer, -1);
4011 }
4012
4013 if (atomic_read(&nohz.load_balancer) == -1) {
4014
4015
4016
4017
4018
4019
4020
4021
4022 int ilb = first_cpu(nohz.cpu_mask);
4023
4024 if (ilb < nr_cpu_ids)
4025 resched_cpu(ilb);
4026 }
4027 }
4028
4029
4030
4031
4032
4033 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4034 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
4035 resched_cpu(cpu);
4036 return;
4037 }
4038
4039
4040
4041
4042
4043 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4044 cpu_isset(cpu, nohz.cpu_mask))
4045 return;
4046#endif
4047 if (time_after_eq(jiffies, rq->next_balance))
4048 raise_softirq(SCHED_SOFTIRQ);
4049}
4050
4051#else
4052
4053
4054
4055
4056static inline void idle_balance(int cpu, struct rq *rq)
4057{
4058}
4059
4060#endif
4061
4062DEFINE_PER_CPU(struct kernel_stat, kstat);
4063
4064EXPORT_PER_CPU_SYMBOL(kstat);
4065
4066
4067
4068
4069
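/*
 * task_delta_exec - return any ns of runtime the task has accrued since
 * its last exec_start timestamp, i.e. time not yet banked into
 * sum_exec_runtime.  Returns 0 unless the task is currently running.
 */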
4070unsigned long long task_delta_exec(struct task_struct *p)
4071{
4072 unsigned long flags;
4073 struct rq *rq;
4074 u64 ns = 0;
4075
4076 rq = task_rq_lock(p, &flags);
4077
4078 if (task_current(rq, p)) {
4079 u64 delta_exec;
4080
4081 update_rq_clock(rq);
4082 delta_exec = rq->clock - p->se.exec_start;
4083 if ((s64)delta_exec > 0)
4084 ns = delta_exec;
4085 }
4086
4087 task_rq_unlock(rq, &flags);
4088
4089 return ns;
4090}
4091
4092
4093
4094
4095
4096
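/*
 * account_user_time - charge user-space CPU time to a task and to the
 * per-CPU user/nice cpustat fields.
 */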
4097void account_user_time(struct task_struct *p, cputime_t cputime)
4098{
4099 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4100 cputime64_t tmp;
4101
4102 p->utime = cputime_add(p->utime, cputime);
4103 account_group_user_time(p, cputime);
4104
4105
4106 tmp = cputime_to_cputime64(cputime);
4107 if (TASK_NICE(p) > 0)
4108 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4109 else
4110 cpustat->user = cputime64_add(cpustat->user, tmp);
4111
4112 acct_update_integrals(p);
4113}
4114
4115
4116
4117
4118
4119
4120static void account_guest_time(struct task_struct *p, cputime_t cputime)
4121{
4122 cputime64_t tmp;
4123 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4124
4125 tmp = cputime_to_cputime64(cputime);
4126
4127 p->utime = cputime_add(p->utime, cputime);
4128 account_group_user_time(p, cputime);
4129 p->gtime = cputime_add(p->gtime, cputime);
4130
4131 cpustat->user = cputime64_add(cpustat->user, tmp);
4132 cpustat->guest = cputime64_add(cpustat->guest, tmp);
4133}
4134
4135
4136
4137
4138
4139
4140void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
4141{
4142 p->utimescaled = cputime_add(p->utimescaled, cputime);
4143}
4144
4145
4146
4147
4148
4149
4150
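/*
 * account_system_time - charge kernel CPU time to a task and attribute it
 * to the irq, softirq, system, iowait or idle cpustat field depending on
 * the context in which the tick was taken.  Guest time is redirected to
 * account_guest_time().
 */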
4151void account_system_time(struct task_struct *p, int hardirq_offset,
4152 cputime_t cputime)
4153{
4154 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4155 struct rq *rq = this_rq();
4156 cputime64_t tmp;
4157
4158 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4159 account_guest_time(p, cputime);
4160 return;
4161 }
4162
4163 p->stime = cputime_add(p->stime, cputime);
4164 account_group_system_time(p, cputime);
4165
4166
4167 tmp = cputime_to_cputime64(cputime);
4168 if (hardirq_count() - hardirq_offset)
4169 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4170 else if (softirq_count())
4171 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4172 else if (p != rq->idle)
4173 cpustat->system = cputime64_add(cpustat->system, tmp);
4174 else if (atomic_read(&rq->nr_iowait) > 0)
4175 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4176 else
4177 cpustat->idle = cputime64_add(cpustat->idle, tmp);
4178
4179 acct_update_integrals(p);
4180}
4181
4182
4183
4184
4185
4186
4187
4188void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
4189{
4190 p->stimescaled = cputime_add(p->stimescaled, cputime);
4191}
4192
4193
4194
4195
4196
4197
4198void account_steal_time(struct task_struct *p, cputime_t steal)
4199{
4200 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4201 cputime64_t tmp = cputime_to_cputime64(steal);
4202 struct rq *rq = this_rq();
4203
4204 if (p == rq->idle) {
4205 p->stime = cputime_add(p->stime, steal);
4206 account_group_system_time(p, steal);
4207 if (atomic_read(&rq->nr_iowait) > 0)
4208 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4209 else
4210 cpustat->idle = cputime64_add(cpustat->idle, tmp);
4211 } else
4212 cpustat->steal = cputime64_add(cpustat->steal, tmp);
4213}
4214
4215
4216
4217
4218#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4219cputime_t task_utime(struct task_struct *p)
4220{
4221 return p->utime;
4222}
4223
4224cputime_t task_stime(struct task_struct *p)
4225{
4226 return p->stime;
4227}
4228#else
4229cputime_t task_utime(struct task_struct *p)
4230{
4231 clock_t utime = cputime_to_clock_t(p->utime),
4232 total = utime + cputime_to_clock_t(p->stime);
4233 u64 temp;
4234
4235
4236
4237
4238 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4239
4240 if (total) {
4241 temp *= utime;
4242 do_div(temp, total);
4243 }
4244 utime = (clock_t)temp;
4245
4246 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4247 return p->prev_utime;
4248}
4249
4250cputime_t task_stime(struct task_struct *p)
4251{
4252 clock_t stime;
4253
4254
4255
4256
4257
4258
4259 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4260 cputime_to_clock_t(task_utime(p));
4261
4262 if (stime >= 0)
4263 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4264
4265 return p->prev_stime;
4266}
4267#endif
4268
4269inline cputime_t task_gtime(struct task_struct *p)
4270{
4271 return p->gtime;
4272}
4273
4274
4275
4276
4277
4278
4279
4280
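/*
 * scheduler_tick - called by the timer code with HZ frequency and with
 * interrupts disabled; updates the runqueue clock and CPU load, runs the
 * current task's per-class tick handler and triggers SMP load balancing.
 */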
4281void scheduler_tick(void)
4282{
4283 int cpu = smp_processor_id();
4284 struct rq *rq = cpu_rq(cpu);
4285 struct task_struct *curr = rq->curr;
4286
4287 sched_clock_tick();
4288
4289 spin_lock(&rq->lock);
4290 update_rq_clock(rq);
4291 update_cpu_load(rq);
4292 curr->sched_class->task_tick(rq, curr, 0);
4293 spin_unlock(&rq->lock);
4294
4295#ifdef CONFIG_SMP
4296 rq->idle_at_tick = idle_cpu(cpu);
4297 trigger_load_balance(rq, cpu);
4298#endif
4299}
4300
4301#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4302 defined(CONFIG_PREEMPT_TRACER))
4303
4304static inline unsigned long get_parent_ip(unsigned long addr)
4305{
4306 if (in_lock_functions(addr)) {
4307 addr = CALLER_ADDR2;
4308 if (in_lock_functions(addr))
4309 addr = CALLER_ADDR3;
4310 }
4311 return addr;
4312}
4313
4314void __kprobes add_preempt_count(int val)
4315{
4316#ifdef CONFIG_DEBUG_PREEMPT
4317
4318
4319
4320 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4321 return;
4322#endif
4323 preempt_count() += val;
4324#ifdef CONFIG_DEBUG_PREEMPT
4325
4326
4327
4328 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4329 PREEMPT_MASK - 10);
4330#endif
4331 if (preempt_count() == val)
4332 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4333}
4334EXPORT_SYMBOL(add_preempt_count);
4335
4336void __kprobes sub_preempt_count(int val)
4337{
4338#ifdef CONFIG_DEBUG_PREEMPT
4339
4340
4341
4342 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4343 return;
4344
4345
4346
4347 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4348 !(preempt_count() & PREEMPT_MASK)))
4349 return;
4350#endif
4351
4352 if (preempt_count() == val)
4353 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4354 preempt_count() -= val;
4355}
4356EXPORT_SYMBOL(sub_preempt_count);
4357
4358#endif
4359
4360
4361
4362
4363static noinline void __schedule_bug(struct task_struct *prev)
4364{
4365 struct pt_regs *regs = get_irq_regs();
4366
4367 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4368 prev->comm, prev->pid, preempt_count());
4369
4370 debug_show_held_locks(prev);
4371 print_modules();
4372 if (irqs_disabled())
4373 print_irqtrace_events(prev);
4374
4375 if (regs)
4376 show_regs(regs);
4377 else
4378 dump_stack();
4379}
4380
4381
4382
4383
4384static inline void schedule_debug(struct task_struct *prev)
4385{
4386
4387
4388
4389
4390
4391 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4392 __schedule_bug(prev);
4393
4394 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4395
4396 schedstat_inc(this_rq(), sched_count);
4397#ifdef CONFIG_SCHEDSTATS
4398 if (unlikely(prev->lock_depth >= 0)) {
4399 schedstat_inc(this_rq(), bkl_count);
4400 schedstat_inc(prev, sched_info.bkl_count);
4401 }
4402#endif
4403}
4404
4405
4406
4407
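/*
 * pick_next_task - pick the highest-priority runnable task.  Fast path:
 * if all runnable tasks are CFS tasks, ask the fair class directly;
 * otherwise walk the scheduling classes in priority order (the idle
 * class always returns a task, so the loop terminates).
 */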
4408static inline struct task_struct *
4409pick_next_task(struct rq *rq, struct task_struct *prev)
4410{
4411 const struct sched_class *class;
4412 struct task_struct *p;
4413
4414
4415
4416
4417
4418 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4419 p = fair_sched_class.pick_next_task(rq);
4420 if (likely(p))
4421 return p;
4422 }
4423
4424 class = sched_class_highest;
4425 for ( ; ; ) {
4426 p = class->pick_next_task(rq);
4427 if (p)
4428 return p;
4429
4430
4431
4432
4433 class = class->next;
4434 }
4435}
4436
4437
4438
4439
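/*
 * schedule - the main scheduler function: pick the next task to run and
 * context-switch to it.
 */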
4440asmlinkage void __sched schedule(void)
4441{
4442 struct task_struct *prev, *next;
4443 unsigned long *switch_count;
4444 struct rq *rq;
4445 int cpu;
4446
4447need_resched:
4448 preempt_disable();
4449 cpu = smp_processor_id();
4450 rq = cpu_rq(cpu);
4451 rcu_qsctr_inc(cpu);
4452 prev = rq->curr;
4453 switch_count = &prev->nivcsw;
4454
4455 release_kernel_lock(prev);
4456need_resched_nonpreemptible:
4457
4458 schedule_debug(prev);
4459
4460 if (sched_feat(HRTICK))
4461 hrtick_clear(rq);
4462
4463 spin_lock_irq(&rq->lock);
4464 update_rq_clock(rq);
4465 clear_tsk_need_resched(prev);
4466
4467 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4468 if (unlikely(signal_pending_state(prev->state, prev)))
4469 prev->state = TASK_RUNNING;
4470 else
4471 deactivate_task(rq, prev, 1);
4472 switch_count = &prev->nvcsw;
4473 }
4474
4475#ifdef CONFIG_SMP
4476 if (prev->sched_class->pre_schedule)
4477 prev->sched_class->pre_schedule(rq, prev);
4478#endif
4479
4480 if (unlikely(!rq->nr_running))
4481 idle_balance(cpu, rq);
4482
4483 prev->sched_class->put_prev_task(rq, prev);
4484 next = pick_next_task(rq, prev);
4485
4486 if (likely(prev != next)) {
4487 sched_info_switch(prev, next);
4488
4489 rq->nr_switches++;
4490 rq->curr = next;
4491 ++*switch_count;
4492
4493 context_switch(rq, prev, next);
4494
4495
4496
4497
4498 cpu = smp_processor_id();
4499 rq = cpu_rq(cpu);
4500 } else
4501 spin_unlock_irq(&rq->lock);
4502
4503 if (unlikely(reacquire_kernel_lock(current) < 0))
4504 goto need_resched_nonpreemptible;
4505
4506 preempt_enable_no_resched();
4507 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
4508 goto need_resched;
4509}
4510EXPORT_SYMBOL(schedule);
4511
4512#ifdef CONFIG_PREEMPT
4513
4514
4515
4516
4517
4518asmlinkage void __sched preempt_schedule(void)
4519{
4520 struct thread_info *ti = current_thread_info();
4521
4522
4523
4524
4525
4526 if (likely(ti->preempt_count || irqs_disabled()))
4527 return;
4528
4529 do {
4530 add_preempt_count(PREEMPT_ACTIVE);
4531 schedule();
4532 sub_preempt_count(PREEMPT_ACTIVE);
4533
4534
4535
4536
4537
4538 barrier();
4539 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4540}
4541EXPORT_SYMBOL(preempt_schedule);
4542
4543
4544
4545
4546
4547
4548
4549asmlinkage void __sched preempt_schedule_irq(void)
4550{
4551 struct thread_info *ti = current_thread_info();
4552
4553
4554 BUG_ON(ti->preempt_count || !irqs_disabled());
4555
4556 do {
4557 add_preempt_count(PREEMPT_ACTIVE);
4558 local_irq_enable();
4559 schedule();
4560 local_irq_disable();
4561 sub_preempt_count(PREEMPT_ACTIVE);
4562
4563
4564
4565
4566
4567 barrier();
4568 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4569}
4570
4571#endif
4572
4573int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
4574 void *key)
4575{
4576 return try_to_wake_up(curr->private, mode, sync);
4577}
4578EXPORT_SYMBOL(default_wake_function);
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4590 int nr_exclusive, int sync, void *key)
4591{
4592 wait_queue_t *curr, *next;
4593
4594 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4595 unsigned flags = curr->flags;
4596
4597 if (curr->func(curr, mode, sync, key) &&
4598 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4599 break;
4600 }
4601}
4602
4603
4604
4605
4606
4607
4608
4609
4610void __wake_up(wait_queue_head_t *q, unsigned int mode,
4611 int nr_exclusive, void *key)
4612{
4613 unsigned long flags;
4614
4615 spin_lock_irqsave(&q->lock, flags);
4616 __wake_up_common(q, mode, nr_exclusive, 0, key);
4617 spin_unlock_irqrestore(&q->lock, flags);
4618}
4619EXPORT_SYMBOL(__wake_up);
4620
4621
4622
4623
4624void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4625{
4626 __wake_up_common(q, mode, 1, 0, NULL);
4627}
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642void
4643__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4644{
4645 unsigned long flags;
4646 int sync = 1;
4647
4648 if (unlikely(!q))
4649 return;
4650
4651 if (unlikely(!nr_exclusive))
4652 sync = 0;
4653
4654 spin_lock_irqsave(&q->lock, flags);
4655 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
4656 spin_unlock_irqrestore(&q->lock, flags);
4657}
4658EXPORT_SYMBOL_GPL(__wake_up_sync);
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669void complete(struct completion *x)
4670{
4671 unsigned long flags;
4672
4673 spin_lock_irqsave(&x->wait.lock, flags);
4674 x->done++;
4675 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4676 spin_unlock_irqrestore(&x->wait.lock, flags);
4677}
4678EXPORT_SYMBOL(complete);
4679
4680
4681
4682
4683
4684
4685
4686void complete_all(struct completion *x)
4687{
4688 unsigned long flags;
4689
4690 spin_lock_irqsave(&x->wait.lock, flags);
4691 x->done += UINT_MAX/2;
4692 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4693 spin_unlock_irqrestore(&x->wait.lock, flags);
4694}
4695EXPORT_SYMBOL(complete_all);
4696
4697static inline long __sched
4698do_wait_for_common(struct completion *x, long timeout, int state)
4699{
4700 if (!x->done) {
4701 DECLARE_WAITQUEUE(wait, current);
4702
4703 wait.flags |= WQ_FLAG_EXCLUSIVE;
4704 __add_wait_queue_tail(&x->wait, &wait);
4705 do {
4706 if (signal_pending_state(state, current)) {
4707 timeout = -ERESTARTSYS;
4708 break;
4709 }
4710 __set_current_state(state);
4711 spin_unlock_irq(&x->wait.lock);
4712 timeout = schedule_timeout(timeout);
4713 spin_lock_irq(&x->wait.lock);
4714 } while (!x->done && timeout);
4715 __remove_wait_queue(&x->wait, &wait);
4716 if (!x->done)
4717 return timeout;
4718 }
4719 x->done--;
4720 return timeout ?: 1;
4721}
4722
4723static long __sched
4724wait_for_common(struct completion *x, long timeout, int state)
4725{
4726 might_sleep();
4727
4728 spin_lock_irq(&x->wait.lock);
4729 timeout = do_wait_for_common(x, timeout, state);
4730 spin_unlock_irq(&x->wait.lock);
4731 return timeout;
4732}
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744void __sched wait_for_completion(struct completion *x)
4745{
4746 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4747}
4748EXPORT_SYMBOL(wait_for_completion);
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759unsigned long __sched
4760wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4761{
4762 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4763}
4764EXPORT_SYMBOL(wait_for_completion_timeout);
4765
4766
4767
4768
4769
4770
4771
4772
4773int __sched wait_for_completion_interruptible(struct completion *x)
4774{
4775 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4776 if (t == -ERESTARTSYS)
4777 return t;
4778 return 0;
4779}
4780EXPORT_SYMBOL(wait_for_completion_interruptible);
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790unsigned long __sched
4791wait_for_completion_interruptible_timeout(struct completion *x,
4792 unsigned long timeout)
4793{
4794 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4795}
4796EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4797
4798
4799
4800
4801
4802
4803
4804
4805int __sched wait_for_completion_killable(struct completion *x)
4806{
4807 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4808 if (t == -ERESTARTSYS)
4809 return t;
4810 return 0;
4811}
4812EXPORT_SYMBOL(wait_for_completion_killable);
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826bool try_wait_for_completion(struct completion *x)
4827{
4828 int ret = 1;
4829
4830 spin_lock_irq(&x->wait.lock);
4831 if (!x->done)
4832 ret = 0;
4833 else
4834 x->done--;
4835 spin_unlock_irq(&x->wait.lock);
4836 return ret;
4837}
4838EXPORT_SYMBOL(try_wait_for_completion);
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848bool completion_done(struct completion *x)
4849{
4850 int ret = 1;
4851
4852 spin_lock_irq(&x->wait.lock);
4853 if (!x->done)
4854 ret = 0;
4855 spin_unlock_irq(&x->wait.lock);
4856 return ret;
4857}
4858EXPORT_SYMBOL(completion_done);
4859
4860static long __sched
4861sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4862{
4863 unsigned long flags;
4864 wait_queue_t wait;
4865
4866 init_waitqueue_entry(&wait, current);
4867
4868 __set_current_state(state);
4869
4870 spin_lock_irqsave(&q->lock, flags);
4871 __add_wait_queue(q, &wait);
4872 spin_unlock(&q->lock);
4873 timeout = schedule_timeout(timeout);
4874 spin_lock_irq(&q->lock);
4875 __remove_wait_queue(q, &wait);
4876 spin_unlock_irqrestore(&q->lock, flags);
4877
4878 return timeout;
4879}
4880
4881void __sched interruptible_sleep_on(wait_queue_head_t *q)
4882{
4883 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4884}
4885EXPORT_SYMBOL(interruptible_sleep_on);
4886
4887long __sched
4888interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4889{
4890 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4891}
4892EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4893
4894void __sched sleep_on(wait_queue_head_t *q)
4895{
4896 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4897}
4898EXPORT_SYMBOL(sleep_on);
4899
4900long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4901{
4902 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4903}
4904EXPORT_SYMBOL(sleep_on_timeout);
4905
4906#ifdef CONFIG_RT_MUTEXES
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
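/*
 * rt_mutex_setprio - set the effective priority of a task for rt-mutex
 * priority inheritance, switching it between the rt and fair scheduling
 * classes as needed and requeueing it on its runqueue.  Used by the
 * rt-mutex code only; not for ordinary priority changes.
 */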
4918void rt_mutex_setprio(struct task_struct *p, int prio)
4919{
4920 unsigned long flags;
4921 int oldprio, on_rq, running;
4922 struct rq *rq;
4923 const struct sched_class *prev_class = p->sched_class;
4924
4925 BUG_ON(prio < 0 || prio > MAX_PRIO);
4926
4927 rq = task_rq_lock(p, &flags);
4928 update_rq_clock(rq);
4929
4930 oldprio = p->prio;
4931 on_rq = p->se.on_rq;
4932 running = task_current(rq, p);
4933 if (on_rq)
4934 dequeue_task(rq, p, 0);
4935 if (running)
4936 p->sched_class->put_prev_task(rq, p);
4937
4938 if (rt_prio(prio))
4939 p->sched_class = &rt_sched_class;
4940 else
4941 p->sched_class = &fair_sched_class;
4942
4943 p->prio = prio;
4944
4945 if (running)
4946 p->sched_class->set_curr_task(rq);
4947 if (on_rq) {
4948 enqueue_task(rq, p, 0);
4949
4950 check_class_changed(rq, p, prev_class, oldprio, running);
4951 }
4952 task_rq_unlock(rq, &flags);
4953}
4954
4955#endif
4956
4957void set_user_nice(struct task_struct *p, long nice)
4958{
4959 int old_prio, delta, on_rq;
4960 unsigned long flags;
4961 struct rq *rq;
4962
4963 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4964 return;
4965
4966
4967
4968
4969 rq = task_rq_lock(p, &flags);
4970 update_rq_clock(rq);
4971
4972
4973
4974
4975
4976
4977 if (task_has_rt_policy(p)) {
4978 p->static_prio = NICE_TO_PRIO(nice);
4979 goto out_unlock;
4980 }
4981 on_rq = p->se.on_rq;
4982 if (on_rq)
4983 dequeue_task(rq, p, 0);
4984
4985 p->static_prio = NICE_TO_PRIO(nice);
4986 set_load_weight(p);
4987 old_prio = p->prio;
4988 p->prio = effective_prio(p);
4989 delta = p->prio - old_prio;
4990
4991 if (on_rq) {
4992 enqueue_task(rq, p, 0);
4993
4994
4995
4996
4997 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4998 resched_task(rq->curr);
4999 }
5000out_unlock:
5001 task_rq_unlock(rq, &flags);
5002}
5003EXPORT_SYMBOL(set_user_nice);
5004
5005
5006
5007
5008
5009
5010int can_nice(const struct task_struct *p, const int nice)
5011{
5012
5013 int nice_rlim = 20 - nice;
5014
5015 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
5016 capable(CAP_SYS_NICE));
5017}
5018
5019#ifdef __ARCH_WANT_SYS_NICE
5020
5021
5022
5023
5024
5025
5026
5027
5028asmlinkage long sys_nice(int increment)
5029{
5030 long nice, retval;
5031
5032
5033
5034
5035
5036
5037 if (increment < -40)
5038 increment = -40;
5039 if (increment > 40)
5040 increment = 40;
5041
5042 nice = PRIO_TO_NICE(current->static_prio) + increment;
5043 if (nice < -20)
5044 nice = -20;
5045 if (nice > 19)
5046 nice = 19;
5047
5048 if (increment < 0 && !can_nice(current, nice))
5049 return -EPERM;
5050
5051 retval = security_task_setnice(current, nice);
5052 if (retval)
5053 return retval;
5054
5055 set_user_nice(current, nice);
5056 return 0;
5057}
5058
5059#endif
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069int task_prio(const struct task_struct *p)
5070{
5071 return p->prio - MAX_RT_PRIO;
5072}
5073
5074
5075
5076
5077
5078int task_nice(const struct task_struct *p)
5079{
5080 return TASK_NICE(p);
5081}
5082EXPORT_SYMBOL(task_nice);
5083
5084
5085
5086
5087
5088int idle_cpu(int cpu)
5089{
5090 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
5091}
5092
5093
5094
5095
5096
5097struct task_struct *idle_task(int cpu)
5098{
5099 return cpu_rq(cpu)->idle;
5100}
5101
5102
5103
5104
5105
5106static struct task_struct *find_process_by_pid(pid_t pid)
5107{
5108 return pid ? find_task_by_vpid(pid) : current;
5109}
5110
5111
5112static void
5113__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5114{
5115 BUG_ON(p->se.on_rq);
5116
5117 p->policy = policy;
5118 switch (p->policy) {
5119 case SCHED_NORMAL:
5120 case SCHED_BATCH:
5121 case SCHED_IDLE:
5122 p->sched_class = &fair_sched_class;
5123 break;
5124 case SCHED_FIFO:
5125 case SCHED_RR:
5126 p->sched_class = &rt_sched_class;
5127 break;
5128 }
5129
5130 p->rt_priority = prio;
5131 p->normal_prio = normal_prio(p);
5132
5133 p->prio = rt_mutex_getprio(p);
5134 set_load_weight(p);
5135}
5136
5137static int __sched_setscheduler(struct task_struct *p, int policy,
5138 struct sched_param *param, bool user)
5139{
5140 int retval, oldprio, oldpolicy = -1, on_rq, running;
5141 unsigned long flags;
5142 const struct sched_class *prev_class = p->sched_class;
5143 struct rq *rq;
5144
5145
5146 BUG_ON(in_interrupt());
5147recheck:
5148
5149 if (policy < 0)
5150 policy = oldpolicy = p->policy;
5151 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
5152 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
5153 policy != SCHED_IDLE)
5154 return -EINVAL;
5155
5156
5157
5158
5159
5160 if (param->sched_priority < 0 ||
5161 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
5162 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
5163 return -EINVAL;
5164 if (rt_policy(policy) != (param->sched_priority != 0))
5165 return -EINVAL;
5166
5167
5168
5169
5170 if (user && !capable(CAP_SYS_NICE)) {
5171 if (rt_policy(policy)) {
5172 unsigned long rlim_rtprio;
5173
5174 if (!lock_task_sighand(p, &flags))
5175 return -ESRCH;
5176 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
5177 unlock_task_sighand(p, &flags);
5178
5179
5180 if (policy != p->policy && !rlim_rtprio)
5181 return -EPERM;
5182
5183
5184 if (param->sched_priority > p->rt_priority &&
5185 param->sched_priority > rlim_rtprio)
5186 return -EPERM;
5187 }
5188
5189
5190
5191
5192 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
5193 return -EPERM;
5194
5195
5196 if ((current->euid != p->euid) &&
5197 (current->euid != p->uid))
5198 return -EPERM;
5199 }
5200
5201 if (user) {
5202#ifdef CONFIG_RT_GROUP_SCHED
5203
5204
5205
5206
5207 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5208 task_group(p)->rt_bandwidth.rt_runtime == 0)
5209 return -EPERM;
5210#endif
5211
5212 retval = security_task_setscheduler(p, policy, param);
5213 if (retval)
5214 return retval;
5215 }
5216
5217
5218
5219
5220
5221 spin_lock_irqsave(&p->pi_lock, flags);
5222
5223
5224
5225
5226 rq = __task_rq_lock(p);
5227
5228 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5229 policy = oldpolicy = -1;
5230 __task_rq_unlock(rq);
5231 spin_unlock_irqrestore(&p->pi_lock, flags);
5232 goto recheck;
5233 }
5234 update_rq_clock(rq);
5235 on_rq = p->se.on_rq;
5236 running = task_current(rq, p);
5237 if (on_rq)
5238 deactivate_task(rq, p, 0);
5239 if (running)
5240 p->sched_class->put_prev_task(rq, p);
5241
5242 oldprio = p->prio;
5243 __setscheduler(rq, p, policy, param->sched_priority);
5244
5245 if (running)
5246 p->sched_class->set_curr_task(rq);
5247 if (on_rq) {
5248 activate_task(rq, p, 0);
5249
5250 check_class_changed(rq, p, prev_class, oldprio, running);
5251 }
5252 __task_rq_unlock(rq);
5253 spin_unlock_irqrestore(&p->pi_lock, flags);
5254
5255 rt_mutex_adjust_pi(p);
5256
5257 return 0;
5258}
5259
5260
5261
5262
5263
5264
5265
5266
5267
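/*
 * sched_setscheduler - change the scheduling policy and/or RT priority of
 * a task, performing the usual capability and security checks.
 */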
5268int sched_setscheduler(struct task_struct *p, int policy,
5269 struct sched_param *param)
5270{
5271 return __sched_setscheduler(p, policy, param, true);
5272}
5273EXPORT_SYMBOL_GPL(sched_setscheduler);
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5287 struct sched_param *param)
5288{
5289 return __sched_setscheduler(p, policy, param, false);
5290}
5291
5292static int
5293do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5294{
5295 struct sched_param lparam;
5296 struct task_struct *p;
5297 int retval;
5298
5299 if (!param || pid < 0)
5300 return -EINVAL;
5301 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5302 return -EFAULT;
5303
5304 rcu_read_lock();
5305 retval = -ESRCH;
5306 p = find_process_by_pid(pid);
5307 if (p != NULL)
5308 retval = sched_setscheduler(p, policy, &lparam);
5309 rcu_read_unlock();
5310
5311 return retval;
5312}
5313
5314
5315
5316
5317
5318
5319
5320asmlinkage long
5321sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5322{
5323
5324 if (policy < 0)
5325 return -EINVAL;
5326
5327 return do_sched_setscheduler(pid, policy, param);
5328}
5329
5330
5331
5332
5333
5334
5335asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
5336{
5337 return do_sched_setscheduler(pid, -1, param);
5338}
5339
5340
5341
5342
5343
5344asmlinkage long sys_sched_getscheduler(pid_t pid)
5345{
5346 struct task_struct *p;
5347 int retval;
5348
5349 if (pid < 0)
5350 return -EINVAL;
5351
5352 retval = -ESRCH;
5353 read_lock(&tasklist_lock);
5354 p = find_process_by_pid(pid);
5355 if (p) {
5356 retval = security_task_getscheduler(p);
5357 if (!retval)
5358 retval = p->policy;
5359 }
5360 read_unlock(&tasklist_lock);
5361 return retval;
5362}
5363
5364
5365
5366
5367
5368
5369asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
5370{
5371 struct sched_param lp;
5372 struct task_struct *p;
5373 int retval;
5374
5375 if (!param || pid < 0)
5376 return -EINVAL;
5377
5378 read_lock(&tasklist_lock);
5379 p = find_process_by_pid(pid);
5380 retval = -ESRCH;
5381 if (!p)
5382 goto out_unlock;
5383
5384 retval = security_task_getscheduler(p);
5385 if (retval)
5386 goto out_unlock;
5387
5388 lp.sched_priority = p->rt_priority;
5389 read_unlock(&tasklist_lock);
5390
5391
5392
5393
5394 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5395
5396 return retval;
5397
5398out_unlock:
5399 read_unlock(&tasklist_lock);
5400 return retval;
5401}
5402
5403long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5404{
5405 cpumask_t cpus_allowed;
5406 cpumask_t new_mask = *in_mask;
5407 struct task_struct *p;
5408 int retval;
5409
5410 get_online_cpus();
5411 read_lock(&tasklist_lock);
5412
5413 p = find_process_by_pid(pid);
5414 if (!p) {
5415 read_unlock(&tasklist_lock);
5416 put_online_cpus();
5417 return -ESRCH;
5418 }
5419
5420
5421
5422
5423
5424
5425 get_task_struct(p);
5426 read_unlock(&tasklist_lock);
5427
5428 retval = -EPERM;
5429 if ((current->euid != p->euid) && (current->euid != p->uid) &&
5430 !capable(CAP_SYS_NICE))
5431 goto out_unlock;
5432
5433 retval = security_task_setscheduler(p, 0, NULL);
5434 if (retval)
5435 goto out_unlock;
5436
5437 cpuset_cpus_allowed(p, &cpus_allowed);
5438 cpus_and(new_mask, new_mask, cpus_allowed);
5439 again:
5440 retval = set_cpus_allowed_ptr(p, &new_mask);
5441
5442 if (!retval) {
5443 cpuset_cpus_allowed(p, &cpus_allowed);
5444 if (!cpus_subset(new_mask, cpus_allowed)) {
5445
5446
5447
5448
5449
5450 new_mask = cpus_allowed;
5451 goto again;
5452 }
5453 }
5454out_unlock:
5455 put_task_struct(p);
5456 put_online_cpus();
5457 return retval;
5458}
5459
5460static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5461 cpumask_t *new_mask)
5462{
5463 if (len < sizeof(cpumask_t)) {
5464 memset(new_mask, 0, sizeof(cpumask_t));
5465 } else if (len > sizeof(cpumask_t)) {
5466 len = sizeof(cpumask_t);
5467 }
5468 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5469}
5470
5471
5472
5473
5474
5475
5476
5477asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5478 unsigned long __user *user_mask_ptr)
5479{
5480 cpumask_t new_mask;
5481 int retval;
5482
5483 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
5484 if (retval)
5485 return retval;
5486
5487 return sched_setaffinity(pid, &new_mask);
5488}
5489
5490long sched_getaffinity(pid_t pid, cpumask_t *mask)
5491{
5492 struct task_struct *p;
5493 int retval;
5494
5495 get_online_cpus();
5496 read_lock(&tasklist_lock);
5497
5498 retval = -ESRCH;
5499 p = find_process_by_pid(pid);
5500 if (!p)
5501 goto out_unlock;
5502
5503 retval = security_task_getscheduler(p);
5504 if (retval)
5505 goto out_unlock;
5506
5507 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
5508
5509out_unlock:
5510 read_unlock(&tasklist_lock);
5511 put_online_cpus();
5512
5513 return retval;
5514}
5515
5516
5517
5518
5519
5520
5521
5522asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5523 unsigned long __user *user_mask_ptr)
5524{
5525 int ret;
5526 cpumask_t mask;
5527
5528 if (len < sizeof(cpumask_t))
5529 return -EINVAL;
5530
5531 ret = sched_getaffinity(pid, &mask);
5532 if (ret < 0)
5533 return ret;
5534
5535 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
5536 return -EFAULT;
5537
5538 return sizeof(cpumask_t);
5539}
5540
5541
5542
5543
5544
5545
5546
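/*
 * sys_sched_yield - yield the CPU: requeue the current task within its
 * scheduling class, then drop the runqueue lock by hand (keeping
 * preemption disabled) and call schedule().
 */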
5547asmlinkage long sys_sched_yield(void)
5548{
5549 struct rq *rq = this_rq_lock();
5550
5551 schedstat_inc(rq, yld_count);
5552 current->sched_class->yield_task(rq);
5553
5554
5555
5556
5557
5558 __release(rq->lock);
5559 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5560 _raw_spin_unlock(&rq->lock);
5561 preempt_enable_no_resched();
5562
5563 schedule();
5564
5565 return 0;
5566}
5567
5568static void __cond_resched(void)
5569{
5570#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
5571 __might_sleep(__FILE__, __LINE__);
5572#endif
5573
5574
5575
5576
5577
5578 do {
5579 add_preempt_count(PREEMPT_ACTIVE);
5580 schedule();
5581 sub_preempt_count(PREEMPT_ACTIVE);
5582 } while (need_resched());
5583}
5584
5585int __sched _cond_resched(void)
5586{
5587 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
5588 system_state == SYSTEM_RUNNING) {
5589 __cond_resched();
5590 return 1;
5591 }
5592 return 0;
5593}
5594EXPORT_SYMBOL(_cond_resched);
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604int cond_resched_lock(spinlock_t *lock)
5605{
5606 int resched = need_resched() && system_state == SYSTEM_RUNNING;
5607 int ret = 0;
5608
5609 if (spin_needbreak(lock) || resched) {
5610 spin_unlock(lock);
5611 if (resched && need_resched())
5612 __cond_resched();
5613 else
5614 cpu_relax();
5615 ret = 1;
5616 spin_lock(lock);
5617 }
5618 return ret;
5619}
5620EXPORT_SYMBOL(cond_resched_lock);
5621
5622int __sched cond_resched_softirq(void)
5623{
5624 BUG_ON(!in_softirq());
5625
5626 if (need_resched() && system_state == SYSTEM_RUNNING) {
5627 local_bh_enable();
5628 __cond_resched();
5629 local_bh_disable();
5630 return 1;
5631 }
5632 return 0;
5633}
5634EXPORT_SYMBOL(cond_resched_softirq);
5635
5636
5637
5638
5639
5640
5641
5642void __sched yield(void)
5643{
5644 set_current_state(TASK_RUNNING);
5645 sys_sched_yield();
5646}
5647EXPORT_SYMBOL(yield);
5648
5649
5650
5651
5652
5653
5654
5655
5656void __sched io_schedule(void)
5657{
5658 struct rq *rq = &__raw_get_cpu_var(runqueues);
5659
5660 delayacct_blkio_start();
5661 atomic_inc(&rq->nr_iowait);
5662 schedule();
5663 atomic_dec(&rq->nr_iowait);
5664 delayacct_blkio_end();
5665}
5666EXPORT_SYMBOL(io_schedule);
5667
5668long __sched io_schedule_timeout(long timeout)
5669{
5670 struct rq *rq = &__raw_get_cpu_var(runqueues);
5671 long ret;
5672
5673 delayacct_blkio_start();
5674 atomic_inc(&rq->nr_iowait);
5675 ret = schedule_timeout(timeout);
5676 atomic_dec(&rq->nr_iowait);
5677 delayacct_blkio_end();
5678 return ret;
5679}
5680
5681
5682
5683
5684
5685
5686
5687
5688asmlinkage long sys_sched_get_priority_max(int policy)
5689{
5690 int ret = -EINVAL;
5691
5692 switch (policy) {
5693 case SCHED_FIFO:
5694 case SCHED_RR:
5695 ret = MAX_USER_RT_PRIO-1;
5696 break;
5697 case SCHED_NORMAL:
5698 case SCHED_BATCH:
5699 case SCHED_IDLE:
5700 ret = 0;
5701 break;
5702 }
5703 return ret;
5704}
5705
5706
5707
5708
5709
5710
5711
5712
5713asmlinkage long sys_sched_get_priority_min(int policy)
5714{
5715 int ret = -EINVAL;
5716
5717 switch (policy) {
5718 case SCHED_FIFO:
5719 case SCHED_RR:
5720 ret = 1;
5721 break;
5722 case SCHED_NORMAL:
5723 case SCHED_BATCH:
5724 case SCHED_IDLE:
5725 ret = 0;
5726 }
5727 return ret;
5728}
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738asmlinkage
5739long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
5740{
5741 struct task_struct *p;
5742 unsigned int time_slice;
5743 int retval;
5744 struct timespec t;
5745
5746 if (pid < 0)
5747 return -EINVAL;
5748
5749 retval = -ESRCH;
5750 read_lock(&tasklist_lock);
5751 p = find_process_by_pid(pid);
5752 if (!p)
5753 goto out_unlock;
5754
5755 retval = security_task_getscheduler(p);
5756 if (retval)
5757 goto out_unlock;
5758
5759 /*
5760  * The reported slice is 0 for SCHED_FIFO tasks, and for fair-class
5761  * tasks sitting on an otherwise idle runqueue:
5762  */
5763 time_slice = 0;
5764 if (p->policy == SCHED_RR) {
5765 time_slice = DEF_TIMESLICE;
5766 } else if (p->policy != SCHED_FIFO) {
5767 struct sched_entity *se = &p->se;
5768 unsigned long flags;
5769 struct rq *rq;
5770
5771 rq = task_rq_lock(p, &flags);
5772 if (rq->cfs.load.weight)
5773 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
5774 task_rq_unlock(rq, &flags);
5775 }
5776 read_unlock(&tasklist_lock);
5777 jiffies_to_timespec(time_slice, &t);
5778 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5779 return retval;
5780
5781out_unlock:
5782 read_unlock(&tasklist_lock);
5783 return retval;
5784}
5785
5786static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5787
5788void sched_show_task(struct task_struct *p)
5789{
5790 unsigned long free = 0;
5791 unsigned state;
5792
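 /*
  * Map the task state bitmask to a single character from
  * TASK_STATE_TO_CHAR_STR; states beyond the known set print as '?':
  */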
5793 state = p->state ? __ffs(p->state) + 1 : 0;
5794 printk(KERN_INFO "%-13.13s %c", p->comm,
5795 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5796#if BITS_PER_LONG == 32
5797 if (state == TASK_RUNNING)
5798 printk(KERN_CONT " running ");
5799 else
5800 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5801#else
5802 if (state == TASK_RUNNING)
5803 printk(KERN_CONT " running task ");
5804 else
5805 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5806#endif
5807#ifdef CONFIG_DEBUG_STACK_USAGE
5808 {
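 /*
  * Estimate the unused stack space by scanning from the end of the
  * stack for the first non-zero word:
  */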
5809 unsigned long *n = end_of_stack(p);
5810 while (!*n)
5811 n++;
5812 free = (unsigned long)n - (unsigned long)end_of_stack(p);
5813 }
5814#endif
5815 printk(KERN_CONT "%5lu %5d %6d\n", free,
5816 task_pid_nr(p), task_pid_nr(p->real_parent));
5817
5818 show_stack(p, NULL);
5819}
5820
5821void show_state_filter(unsigned long state_filter)
5822{
5823 struct task_struct *g, *p;
5824
5825#if BITS_PER_LONG == 32
5826 printk(KERN_INFO
5827 "  task                PC stack   pid father\n");
5828 #else
5829 printk(KERN_INFO
5830 "  task                        PC stack   pid father\n");
5831#endif
5832 read_lock(&tasklist_lock);
5833 do_each_thread(g, p) {
5834
5835
5836
5837
5838 touch_nmi_watchdog();
5839 if (!state_filter || (p->state & state_filter))
5840 sched_show_task(p);
5841 } while_each_thread(g, p);
5842
5843 touch_all_softlockup_watchdogs();
5844
5845#ifdef CONFIG_SCHED_DEBUG
5846 sysrq_sched_debug_show();
5847#endif
5848 read_unlock(&tasklist_lock);
5849
5850
5851
5852 if (state_filter == -1)
5853 debug_show_all_locks();
5854}
5855
5856void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5857{
5858 idle->sched_class = &idle_sched_class;
5859}
5860
5861 /**
5862  * init_idle - set up an idle thread for a given CPU
5863  * @idle: task in question
5864  * @cpu: cpu the idle task belongs to
5865  *
5866  * Installs @idle as @cpu's idle task, giving it the lowest possible
5867  * priority and, later, the idle scheduling class.
5868  */
5869void __cpuinit init_idle(struct task_struct *idle, int cpu)
5870{
5871 struct rq *rq = cpu_rq(cpu);
5872 unsigned long flags;
5873
5874 spin_lock_irqsave(&rq->lock, flags);
5875
5876 __sched_fork(idle);
5877 idle->se.exec_start = sched_clock();
5878
5879 idle->prio = idle->normal_prio = MAX_PRIO;
5880 idle->cpus_allowed = cpumask_of_cpu(cpu);
5881 __set_task_cpu(idle, cpu);
5882
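 /* Install @idle as both the current and the idle task of this rq: */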
5883 rq->curr = rq->idle = idle;
5884#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5885 idle->oncpu = 1;
5886#endif
5887 spin_unlock_irqrestore(&rq->lock, flags);
5888
5889 /* Set the preempt count _outside_ the runqueue lock: */
5890#if defined(CONFIG_PREEMPT)
5891 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5892#else
5893 task_thread_info(idle)->preempt_count = 0;
5894#endif
5895
5896
5897
5898 idle->sched_class = &idle_sched_class;
5899}
5900
5901
5902
5903 /*
5904  * nohz_cpu_mask tracks CPUs that have stopped their periodic tick
5905  * (NO_HZ idle); users such as the RCU machinery can avoid waiting on
5906  * them. It stays CPU_MASK_NONE when the tick is never stopped.
5907  */
5908cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5909
5910
5911
5912 /*
5913  * Scale the default granularity values by roughly 1 + log2(ncpus):
5914  * with more CPUs the effective latency seen by users decreases, so
5915  * per-CPU granularity can grow. sysctl_sched_min_granularity and
5916  * sysctl_sched_latency are capped at 200ms; the wakeup granularity
5917  * and the shares ratelimit are scaled without a cap.
5918  */
5919static inline void sched_init_granularity(void)
5920{
5921 unsigned int factor = 1 + ilog2(num_online_cpus());
5922 const unsigned long limit = 200000000;
5923
5924 sysctl_sched_min_granularity *= factor;
5925 if (sysctl_sched_min_granularity > limit)
5926 sysctl_sched_min_granularity = limit;
5927
5928 sysctl_sched_latency *= factor;
5929 if (sysctl_sched_latency > limit)
5930 sysctl_sched_latency = limit;
5931
5932 sysctl_sched_wakeup_granularity *= factor;
5933
5934 sysctl_sched_shares_ratelimit *= factor;
5935}
5936
5937#ifdef CONFIG_SMP
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948 /*
5949  * How migration works: set_cpus_allowed_ptr() queues a
5950  * struct migration_req on the task's current runqueue and wakes that
5951  * CPU's migration thread, which pulls the task over to the requested
5952  * CPU and then completes the request.
5953  *
5954  * set_cpus_allowed_ptr - change a task's CPU affinity mask
5955  * @p: the task in question
5956  * @new_mask: the new set of allowed CPUs
5957  *
5958  * If the task is currently running on a CPU outside @new_mask it is
5959  * migrated to an online CPU inside the mask. The caller must hold a
5960  * reference on the task, must not hold any spinlocks, and may sleep
5961  * while the migration completes.
5962  */
5963int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5964{
5965 struct migration_req req;
5966 unsigned long flags;
5967 struct rq *rq;
5968 int ret = 0;
5969
5970 rq = task_rq_lock(p, &flags);
5971 if (!cpus_intersects(*new_mask, cpu_online_map)) {
5972 ret = -EINVAL;
5973 goto out;
5974 }
5975
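 /*
  * Kernel threads bound to a single CPU (PF_THREAD_BOUND) may only
  * change their own affinity, and only to the same mask:
  */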
5976 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5977 !cpus_equal(p->cpus_allowed, *new_mask))) {
5978 ret = -EINVAL;
5979 goto out;
5980 }
5981
5982 if (p->sched_class->set_cpus_allowed)
5983 p->sched_class->set_cpus_allowed(p, new_mask);
5984 else {
5985 p->cpus_allowed = *new_mask;
5986 p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
5987 }
5988
5989
5990 if (cpu_isset(task_cpu(p), *new_mask))
5991 goto out;
5992
5993 if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
5994
5995 task_rq_unlock(rq, &flags);
5996 wake_up_process(rq->migration_thread);
5997 wait_for_completion(&req.done);
5998 tlb_migrate_finish(p->mm);
5999 return 0;
6000 }
6001out:
6002 task_rq_unlock(rq, &flags);
6003
6004 return ret;
6005}
6006EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
6007
6008 /*
6009  * __migrate_task - move a task from src_cpu to dest_cpu.
6010  *
6011  * Used when the task can no longer stay where it is (affinity change
6012  * or CPU going down) or when rebalancing on exec. We race with normal
6013  * scheduler movements, which is fine as long as the task ends up off
6014  * the source CPU.
6015  *
6016  * Returns non-zero on success, i.e. the task is no longer queued on
6017  * src_cpu; returns 0 if dest_cpu is not active or not allowed.
6018  */
6019static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6020{
6021 struct rq *rq_dest, *rq_src;
6022 int ret = 0, on_rq;
6023
6024 if (unlikely(!cpu_active(dest_cpu)))
6025 return ret;
6026
6027 rq_src = cpu_rq(src_cpu);
6028 rq_dest = cpu_rq(dest_cpu);
6029
6030 double_rq_lock(rq_src, rq_dest);
6031
6032 if (task_cpu(p) != src_cpu)
6033 goto done;
6034
6035 if (!cpu_isset(dest_cpu, p->cpus_allowed))
6036 goto fail;
6037
6038 on_rq = p->se.on_rq;
6039 if (on_rq)
6040 deactivate_task(rq_src, p, 0);
6041
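 /*
  * Re-home the task: if it was queued, activate it on the destination
  * runqueue and check whether it should preempt the task running there:
  */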
6042 set_task_cpu(p, dest_cpu);
6043 if (on_rq) {
6044 activate_task(rq_dest, p, 0);
6045 check_preempt_curr(rq_dest, p, 0);
6046 }
6047done:
6048 ret = 1;
6049fail:
6050 double_rq_unlock(rq_src, rq_dest);
6051 return ret;
6052}
6053
6054 /*
6055  * migration_thread - high-priority per-CPU kernel thread that
6056  * services rq->migration_queue: it pulls queued tasks off this CPU
6057  * and pushes them onto their destination runqueues.
6058  */
6059static int migration_thread(void *data)
6060{
6061 int cpu = (long)data;
6062 struct rq *rq;
6063
6064 rq = cpu_rq(cpu);
6065 BUG_ON(rq->migration_thread != current);
6066
6067 set_current_state(TASK_INTERRUPTIBLE);
6068 while (!kthread_should_stop()) {
6069 struct migration_req *req;
6070 struct list_head *head;
6071
6072 spin_lock_irq(&rq->lock);
6073
6074 if (cpu_is_offline(cpu)) {
6075 spin_unlock_irq(&rq->lock);
6076 goto wait_to_die;
6077 }
6078
6079 if (rq->active_balance) {
6080 active_load_balance(rq, cpu);
6081 rq->active_balance = 0;
6082 }
6083
6084 head = &rq->migration_queue;
6085
6086 if (list_empty(head)) {
6087 spin_unlock_irq(&rq->lock);
6088 schedule();
6089 set_current_state(TASK_INTERRUPTIBLE);
6090 continue;
6091 }
6092 req = list_entry(head->next, struct migration_req, list);
6093 list_del_init(head->next);
6094
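 /*
  * Only the runqueue lock is dropped here; interrupts stay disabled
  * across __migrate_task() and are re-enabled afterwards, before the
  * request is completed:
  */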
6095 spin_unlock(&rq->lock);
6096 __migrate_task(req->task, cpu, req->dest_cpu);
6097 local_irq_enable();
6098
6099 complete(&req->done);
6100 }
6101 __set_current_state(TASK_RUNNING);
6102 return 0;
6103
6104wait_to_die:
6105 /* Offline CPU: just wait for kthread_stop(). */
6106 set_current_state(TASK_INTERRUPTIBLE);
6107 while (!kthread_should_stop()) {
6108 schedule();
6109 set_current_state(TASK_INTERRUPTIBLE);
6110 }
6111 __set_current_state(TASK_RUNNING);
6112 return 0;
6113}
6114
6115#ifdef CONFIG_HOTPLUG_CPU
6116
6117static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6118{
6119 int ret;
6120
6121 local_irq_disable();
6122 ret = __migrate_task(p, src_cpu, dest_cpu);
6123 local_irq_enable();
6124 return ret;
6125}
6126
6127
6128 /*
6129  * Figure out where a task on a dead CPU should go; use force if needed.
6130  */
6131static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6132{
6133 unsigned long flags;
6134 cpumask_t mask;
6135 struct rq *rq;
6136 int dest_cpu;
6137
6138 do {
6139
6140 mask = node_to_cpumask(cpu_to_node(dead_cpu));
6141 cpus_and(mask, mask, p->cpus_allowed);
6142 dest_cpu = any_online_cpu(mask);
6143
6144
6145 if (dest_cpu >= nr_cpu_ids)
6146 dest_cpu = any_online_cpu(p->cpus_allowed);
6147
6148
6149 if (dest_cpu >= nr_cpu_ids) {
6150 cpumask_t cpus_allowed;
6151
6152 cpuset_cpus_allowed_locked(p, &cpus_allowed);
6153
6154
6155
6156
6157
6158
6159
6160 rq = task_rq_lock(p, &flags);
6161 p->cpus_allowed = cpus_allowed;
6162 dest_cpu = any_online_cpu(p->cpus_allowed);
6163 task_rq_unlock(rq, &flags);
6164
6165
6166 /*
6167  * Only complain about tasks with an mm: kernel threads and fully
6168  * exited tasks never run user code, so nobody cares where they went.
6169  */
6170 if (p->mm && printk_ratelimit()) {
6171 printk(KERN_INFO "process %d (%s) no "
6172 "longer affine to cpu%d\n",
6173 task_pid_nr(p), p->comm, dead_cpu);
6174 }
6175 }
6176 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
6177}
6178
6179
6180 /*
6181  * A dead CPU has no uninterruptible tasks queued any more, but its
6182  * ->nr_uninterruptible counter can still be nonzero, since the
6183  * counter is not strictly tied to a task's home CPU. Fold it into
6184  * an online CPU's counter so the global sum stays correct.
6185  */
6186static void migrate_nr_uninterruptible(struct rq *rq_src)
6187{
6188 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
6189 unsigned long flags;
6190
6191 local_irq_save(flags);
6192 double_rq_lock(rq_src, rq_dest);
6193 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
6194 rq_src->nr_uninterruptible = 0;
6195 double_rq_unlock(rq_src, rq_dest);
6196 local_irq_restore(flags);
6197}
6198
6199 /* Walk the task list and move every task off the dead CPU: */
6200static void migrate_live_tasks(int src_cpu)
6201{
6202 struct task_struct *p, *t;
6203
6204 read_lock(&tasklist_lock);
6205
6206 do_each_thread(t, p) {
6207 if (p == current)
6208 continue;
6209
6210 if (task_cpu(p) == src_cpu)
6211 move_task_off_dead_cpu(src_cpu, p);
6212 } while_each_thread(t, p);
6213
6214 read_unlock(&tasklist_lock);
6215}
6216
6217 /*
6218  * Schedule the idle task to be the next runnable task on the current
6219  * CPU by boosting it to the highest possible priority. Used by the
6220  * CPU-offline code.
6221  */
6222void sched_idle_next(void)
6223{
6224 int this_cpu = smp_processor_id();
6225 struct rq *rq = cpu_rq(this_cpu);
6226 struct task_struct *p = rq->idle;
6227 unsigned long flags;
6228
6229
6230 BUG_ON(cpu_online(this_cpu));
6231
6232
6233
6234
6235
6236 spin_lock_irqsave(&rq->lock, flags);
6237
6238 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6239
6240 update_rq_clock(rq);
6241 activate_task(rq, p, 0);
6242
6243 spin_unlock_irqrestore(&rq->lock, flags);
6244}
6245
6246 /*
6247  * Ensure that the idle task is using init_mm right before its CPU
6248  * goes offline.
6249  */
6250void idle_task_exit(void)
6251{
6252 struct mm_struct *mm = current->active_mm;
6253
6254 BUG_ON(cpu_online(smp_processor_id()));
6255
6256 if (mm != &init_mm)
6257 switch_mm(mm, &init_mm, current);
6258 mmdrop(mm);
6259}
6260
6261 /* Called under rq->lock with interrupts disabled: */
6262static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6263{
6264 struct rq *rq = cpu_rq(dead_cpu);
6265
6266
6267 BUG_ON(!p->exit_state);
6268
6269
6270 BUG_ON(p->state == TASK_DEAD);
6271
6272 get_task_struct(p);
6273
6274 /*
6275  * Drop the runqueue lock around the migration; if somebody else
6276  * moves the task in the meantime that is fine. No new tasks can be
6277  * added to this CPU, so the iteration in the caller stays valid.
6278  */
6279 spin_unlock_irq(&rq->lock);
6280 move_task_off_dead_cpu(dead_cpu, p);
6281 spin_lock_irq(&rq->lock);
6282
6283 put_task_struct(p);
6284}
6285
6286
6287static void migrate_dead_tasks(unsigned int dead_cpu)
6288{
6289 struct rq *rq = cpu_rq(dead_cpu);
6290 struct task_struct *next;
6291
6292 for ( ; ; ) {
6293 if (!rq->nr_running)
6294 break;
6295 update_rq_clock(rq);
6296 next = pick_next_task(rq, rq->curr);
6297 if (!next)
6298 break;
6299 next->sched_class->put_prev_task(rq, next);
6300 migrate_dead(dead_cpu, next);
6301
6302 }
6303}
6304#endif
6305
6306#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6307
6308static struct ctl_table sd_ctl_dir[] = {
6309 {
6310 .procname = "sched_domain",
6311 .mode = 0555,
6312 },
6313 {0, },
6314};
6315
6316static struct ctl_table sd_ctl_root[] = {
6317 {
6318 .ctl_name = CTL_KERN,
6319 .procname = "kernel",
6320 .mode = 0555,
6321 .child = sd_ctl_dir,
6322 },
6323 {0, },
6324};
6325
6326static struct ctl_table *sd_alloc_ctl_entry(int n)
6327{
6328 struct ctl_table *entry =
6329 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6330
6331 return entry;
6332}
6333
6334static void sd_free_ctl_entry(struct ctl_table **tablep)
6335{
6336 struct ctl_table *entry;
6337
6338
6339 /*
6340  * Recursively free child tables, and free procnames that were
6341  * kstrdup()'d; entries installed via set_table_entry() have a
6342  * proc_handler and point at static strings, so skip those.
6343  */
6344 for (entry = *tablep; entry->mode; entry++) {
6345 if (entry->child)
6346 sd_free_ctl_entry(&entry->child);
6347 if (entry->proc_handler == NULL)
6348 kfree(entry->procname);
6349 }
6350
6351 kfree(*tablep);
6352 *tablep = NULL;
6353}
6354
6355static void
6356set_table_entry(struct ctl_table *entry,
6357 const char *procname, void *data, int maxlen,
6358 mode_t mode, proc_handler *proc_handler)
6359{
6360 entry->procname = procname;
6361 entry->data = data;
6362 entry->maxlen = maxlen;
6363 entry->mode = mode;
6364 entry->proc_handler = proc_handler;
6365}
6366
6367static struct ctl_table *
6368sd_alloc_ctl_domain_table(struct sched_domain *sd)
6369{
6370 struct ctl_table *table = sd_alloc_ctl_entry(13);
6371
6372 if (table == NULL)
6373 return NULL;
6374
6375 set_table_entry(&table[0], "min_interval", &sd->min_interval,
6376 sizeof(long), 0644, proc_doulongvec_minmax);
6377 set_table_entry(&table[1], "max_interval", &sd->max_interval,
6378 sizeof(long), 0644, proc_doulongvec_minmax);
6379 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6380 sizeof(int), 0644, proc_dointvec_minmax);
6381 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6382 sizeof(int), 0644, proc_dointvec_minmax);
6383 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6384 sizeof(int), 0644, proc_dointvec_minmax);
6385 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6386 sizeof(int), 0644, proc_dointvec_minmax);
6387 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6388 sizeof(int), 0644, proc_dointvec_minmax);
6389 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6390 sizeof(int), 0644, proc_dointvec_minmax);
6391 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6392 sizeof(int), 0644, proc_dointvec_minmax);
6393 set_table_entry(&table[9], "cache_nice_tries",
6394 &sd->cache_nice_tries,
6395 sizeof(int), 0644, proc_dointvec_minmax);
6396 set_table_entry(&table[10], "flags", &sd->flags,
6397 sizeof(int), 0644, proc_dointvec_minmax);
6398 set_table_entry(&table[11], "name", sd->name,
6399 CORENAME_MAX_SIZE, 0444, proc_dostring);
6400
6401
6402 return table;
6403}
6404
6405 static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6406{
6407 struct ctl_table *entry, *table;
6408 struct sched_domain *sd;
6409 int domain_num = 0, i;
6410 char buf[32];
6411
6412 for_each_domain(cpu, sd)
6413 domain_num++;
6414 entry = table = sd_alloc_ctl_entry(domain_num + 1);
6415 if (table == NULL)
6416 return NULL;
6417
6418 i = 0;
6419 for_each_domain(cpu, sd) {
6420 snprintf(buf, 32, "domain%d", i);
6421 entry->procname = kstrdup(buf, GFP_KERNEL);
6422 entry->mode = 0555;
6423 entry->child = sd_alloc_ctl_domain_table(sd);
6424 entry++;
6425 i++;
6426 }
6427 return table;
6428}
6429
6430static struct ctl_table_header *sd_sysctl_header;
6431static void register_sched_domain_sysctl(void)
6432{
6433 int i, cpu_num = num_online_cpus();
6434 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6435 char buf[32];
6436
6437 WARN_ON(sd_ctl_dir[0].child);
6438 sd_ctl_dir[0].child = entry;
6439
6440 if (entry == NULL)
6441 return;
6442
6443 for_each_online_cpu(i) {
6444 snprintf(buf, 32, "cpu%d", i);
6445 entry->procname = kstrdup(buf, GFP_KERNEL);
6446 entry->mode = 0555;
6447 entry->child = sd_alloc_ctl_cpu_table(i);
6448 entry++;
6449 }
6450
6451 WARN_ON(sd_sysctl_header);
6452 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6453}
6454
6455
6456static void unregister_sched_domain_sysctl(void)
6457{
6458 if (sd_sysctl_header)
6459 unregister_sysctl_table(sd_sysctl_header);
6460 sd_sysctl_header = NULL;
6461 if (sd_ctl_dir[0].child)
6462 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6463}
6464#else
6465static void register_sched_domain_sysctl(void)
6466{
6467}
6468static void unregister_sched_domain_sysctl(void)
6469{
6470}
6471#endif
6472
6473static void set_rq_online(struct rq *rq)
6474{
6475 if (!rq->online) {
6476 const struct sched_class *class;
6477
6478 cpu_set(rq->cpu, rq->rd->online);
6479 rq->online = 1;
6480
6481 for_each_class(class) {
6482 if (class->rq_online)
6483 class->rq_online(rq);
6484 }
6485 }
6486}
6487
6488static void set_rq_offline(struct rq *rq)
6489{
6490 if (rq->online) {
6491 const struct sched_class *class;
6492
6493 for_each_class(class) {
6494 if (class->rq_offline)
6495 class->rq_offline(rq);
6496 }
6497
6498 cpu_clear(rq->cpu, rq->rd->online);
6499 rq->online = 0;
6500 }
6501}
6502
6503 /*
6504  * migration_call - CPU hotplug callback: creates, wakes and tears
6505  * down the per-CPU migration thread as CPUs come and go.
6506  */
6507static int __cpuinit
6508migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6509{
6510 struct task_struct *p;
6511 int cpu = (long)hcpu;
6512 unsigned long flags;
6513 struct rq *rq;
6514
6515 switch (action) {
6516
6517 case CPU_UP_PREPARE:
6518 case CPU_UP_PREPARE_FROZEN:
6519 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
6520 if (IS_ERR(p))
6521 return NOTIFY_BAD;
6522 kthread_bind(p, cpu);
6523
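 /* The migration thread itself runs as a highest-priority FIFO task: */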
6524 rq = task_rq_lock(p, &flags);
6525 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6526 task_rq_unlock(rq, &flags);
6527 cpu_rq(cpu)->migration_thread = p;
6528 break;
6529
6530 case CPU_ONLINE:
6531 case CPU_ONLINE_FROZEN:
6532
6533 wake_up_process(cpu_rq(cpu)->migration_thread);
6534
6535
6536 rq = cpu_rq(cpu);
6537 spin_lock_irqsave(&rq->lock, flags);
6538 if (rq->rd) {
6539 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6540
6541 set_rq_online(rq);
6542 }
6543 spin_unlock_irqrestore(&rq->lock, flags);
6544 break;
6545
6546#ifdef CONFIG_HOTPLUG_CPU
6547 case CPU_UP_CANCELED:
6548 case CPU_UP_CANCELED_FROZEN:
6549 if (!cpu_rq(cpu)->migration_thread)
6550 break;
6551
6552 kthread_bind(cpu_rq(cpu)->migration_thread,
6553 any_online_cpu(cpu_online_map));
6554 kthread_stop(cpu_rq(cpu)->migration_thread);
6555 cpu_rq(cpu)->migration_thread = NULL;
6556 break;
6557
6558 case CPU_DEAD:
6559 case CPU_DEAD_FROZEN:
6560 cpuset_lock();
6561 migrate_live_tasks(cpu);
6562 rq = cpu_rq(cpu);
6563 kthread_stop(rq->migration_thread);
6564 rq->migration_thread = NULL;
6565
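 /*
  * Park the dead CPU's idle task: take it off the runqueue and reset
  * it to the lowest priority before draining the remaining tasks:
  */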
6566 spin_lock_irq(&rq->lock);
6567 update_rq_clock(rq);
6568 deactivate_task(rq, rq->idle, 0);
6569 rq->idle->static_prio = MAX_PRIO;
6570 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6571 rq->idle->sched_class = &idle_sched_class;
6572 migrate_dead_tasks(cpu);
6573 spin_unlock_irq(&rq->lock);
6574 cpuset_unlock();
6575 migrate_nr_uninterruptible(rq);
6576 BUG_ON(rq->nr_running != 0);
6577
6578 /*
6579  * The CPU is gone, so nothing is left to migrate: just complete any
6580  * queued migration requests so that their waiters can continue.
6581  */
6582
6583 spin_lock_irq(&rq->lock);
6584 while (!list_empty(&rq->migration_queue)) {
6585 struct migration_req *req;
6586
6587 req = list_entry(rq->migration_queue.next,
6588 struct migration_req, list);
6589 list_del_init(&req->list);
6590 spin_unlock_irq(&rq->lock);
6591 complete(&req->done);
6592 spin_lock_irq(&rq->lock);
6593 }
6594 spin_unlock_irq(&rq->lock);
6595 break;
6596
6597 case CPU_DYING:
6598 case CPU_DYING_FROZEN:
6599
6600 rq = cpu_rq(cpu);
6601 spin_lock_irqsave(&rq->lock, flags);
6602 if (rq->rd) {
6603 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6604 set_rq_offline(rq);
6605 }
6606 spin_unlock_irqrestore(&rq->lock, flags);
6607 break;
6608#endif
6609 }
6610 return NOTIFY_OK;
6611}
6612
6613
6614
6615
6616static struct notifier_block __cpuinitdata migration_notifier = {
6617 .notifier_call = migration_call,
6618 .priority = 10
6619};
6620
6621static int __init migration_init(void)
6622{
6623 void *cpu = (void *)(long)smp_processor_id();
6624 int err;
6625
6626
6627 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6628 BUG_ON(err == NOTIFY_BAD);
6629 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6630 register_cpu_notifier(&migration_notifier);
6631
6632 return err;
6633}
6634early_initcall(migration_init);
6635#endif
6636
6637#ifdef CONFIG_SMP
6638
6639#ifdef CONFIG_SCHED_DEBUG
6640
6641static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6642{
6643 switch (lvl) {
6644 case SD_LV_NONE:
6645 return "NONE";
6646 case SD_LV_SIBLING:
6647 return "SIBLING";
6648 case SD_LV_MC:
6649 return "MC";
6650 case SD_LV_CPU:
6651 return "CPU";
6652 case SD_LV_NODE:
6653 return "NODE";
6654 case SD_LV_ALLNODES:
6655 return "ALLNODES";
6656 case SD_LV_MAX:
6657 return "MAX";
6658
6659 }
6660 return "MAX";
6661}
6662
6663static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6664 cpumask_t *groupmask)
6665{
6666 struct sched_group *group = sd->groups;
6667 char str[256];
6668
6669 cpulist_scnprintf(str, sizeof(str), sd->span);
6670 cpus_clear(*groupmask);
6671
6672 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6673
6674 if (!(sd->flags & SD_LOAD_BALANCE)) {
6675 printk("does not load-balance\n");
6676 if (sd->parent)
6677 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6678 " has parent");
6679 return -1;
6680 }
6681
6682 printk(KERN_CONT "span %s level %s\n",
6683 str, sd_level_to_string(sd->level));
6684
6685 if (!cpu_isset(cpu, sd->span)) {
6686 printk(KERN_ERR "ERROR: domain->span does not contain "
6687 "CPU%d\n", cpu);
6688 }
6689 if (!cpu_isset(cpu, group->cpumask)) {
6690 printk(KERN_ERR "ERROR: domain->groups does not contain"
6691 " CPU%d\n", cpu);
6692 }
6693
6694 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6695 do {
6696 if (!group) {
6697 printk("\n");
6698 printk(KERN_ERR "ERROR: group is NULL\n");
6699 break;
6700 }
6701
6702 if (!group->__cpu_power) {
6703 printk(KERN_CONT "\n");
6704 printk(KERN_ERR "ERROR: domain->cpu_power not "
6705 "set\n");
6706 break;
6707 }
6708
6709 if (!cpus_weight(group->cpumask)) {
6710 printk(KERN_CONT "\n");
6711 printk(KERN_ERR "ERROR: empty group\n");
6712 break;
6713 }
6714
6715 if (cpus_intersects(*groupmask, group->cpumask)) {
6716 printk(KERN_CONT "\n");
6717 printk(KERN_ERR "ERROR: repeated CPUs\n");
6718 break;
6719 }
6720
6721 cpus_or(*groupmask, *groupmask, group->cpumask);
6722
6723 cpulist_scnprintf(str, sizeof(str), group->cpumask);
6724 printk(KERN_CONT " %s", str);
6725
6726 group = group->next;
6727 } while (group != sd->groups);
6728 printk(KERN_CONT "\n");
6729
6730 if (!cpus_equal(sd->span, *groupmask))
6731 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6732
6733 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
6734 printk(KERN_ERR "ERROR: parent span is not a superset "
6735 "of domain->span\n");
6736 return 0;
6737}
6738
6739static void sched_domain_debug(struct sched_domain *sd, int cpu)
6740{
6741 cpumask_t *groupmask;
6742 int level = 0;
6743
6744 if (!sd) {
6745 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6746 return;
6747 }
6748
6749 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6750
6751 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6752 if (!groupmask) {
6753 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6754 return;
6755 }
6756
6757 for (;;) {
6758 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6759 break;
6760 level++;
6761 sd = sd->parent;
6762 if (!sd)
6763 break;
6764 }
6765 kfree(groupmask);
6766}
6767#else
6768# define sched_domain_debug(sd, cpu) do { } while (0)
6769#endif
6770
6771static int sd_degenerate(struct sched_domain *sd)
6772{
6773 if (cpus_weight(sd->span) == 1)
6774 return 1;
6775
6776
6777 if (sd->flags & (SD_LOAD_BALANCE |
6778 SD_BALANCE_NEWIDLE |
6779 SD_BALANCE_FORK |
6780 SD_BALANCE_EXEC |
6781 SD_SHARE_CPUPOWER |
6782 SD_SHARE_PKG_RESOURCES)) {
6783 if (sd->groups != sd->groups->next)
6784 return 0;
6785 }
6786
6787
6788 if (sd->flags & (SD_WAKE_IDLE |
6789 SD_WAKE_AFFINE |
6790 SD_WAKE_BALANCE))
6791 return 0;
6792
6793 return 1;
6794}
6795
6796static int
6797sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6798{
6799 unsigned long cflags = sd->flags, pflags = parent->flags;
6800
6801 if (sd_degenerate(parent))
6802 return 1;
6803
6804 if (!cpus_equal(sd->span, parent->span))
6805 return 0;
6806
6807
6808
6809 if (cflags & SD_WAKE_AFFINE)
6810 pflags &= ~SD_WAKE_BALANCE;
6811
6812 if (parent->groups == parent->groups->next) {
6813 pflags &= ~(SD_LOAD_BALANCE |
6814 SD_BALANCE_NEWIDLE |
6815 SD_BALANCE_FORK |
6816 SD_BALANCE_EXEC |
6817 SD_SHARE_CPUPOWER |
6818 SD_SHARE_PKG_RESOURCES);
6819 }
6820 if (~cflags & pflags)
6821 return 0;
6822
6823 return 1;
6824}
6825
6826static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6827{
6828 unsigned long flags;
6829
6830 spin_lock_irqsave(&rq->lock, flags);
6831
6832 if (rq->rd) {
6833 struct root_domain *old_rd = rq->rd;
6834
6835 if (cpu_isset(rq->cpu, old_rd->online))
6836 set_rq_offline(rq);
6837
6838 cpu_clear(rq->cpu, old_rd->span);
6839
6840 if (atomic_dec_and_test(&old_rd->refcount))
6841 kfree(old_rd);
6842 }
6843
6844 atomic_inc(&rd->refcount);
6845 rq->rd = rd;
6846
6847 cpu_set(rq->cpu, rd->span);
6848 if (cpu_isset(rq->cpu, cpu_online_map))
6849 set_rq_online(rq);
6850
6851 spin_unlock_irqrestore(&rq->lock, flags);
6852}
6853
6854static void init_rootdomain(struct root_domain *rd)
6855{
6856 memset(rd, 0, sizeof(*rd));
6857
6858 cpus_clear(rd->span);
6859 cpus_clear(rd->online);
6860
6861 cpupri_init(&rd->cpupri);
6862}
6863
6864static void init_defrootdomain(void)
6865{
6866 init_rootdomain(&def_root_domain);
6867 atomic_set(&def_root_domain.refcount, 1);
6868}
6869
6870static struct root_domain *alloc_rootdomain(void)
6871{
6872 struct root_domain *rd;
6873
6874 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6875 if (!rd)
6876 return NULL;
6877
6878 init_rootdomain(rd);
6879
6880 return rd;
6881}
6882
6883 /*
6884  * Attach the domain 'sd' to 'cpu' as its base domain, first
6885  * collapsing any levels that sd_degenerate() deems useless.
6886  */
6887static void
6888cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6889{
6890 struct rq *rq = cpu_rq(cpu);
6891 struct sched_domain *tmp;
6892
6893
6894 for (tmp = sd; tmp; ) {
6895 struct sched_domain *parent = tmp->parent;
6896 if (!parent)
6897 break;
6898
6899 if (sd_parent_degenerate(tmp, parent)) {
6900 tmp->parent = parent->parent;
6901 if (parent->parent)
6902 parent->parent->child = tmp;
6903 } else
6904 tmp = tmp->parent;
6905 }
6906
6907 if (sd && sd_degenerate(sd)) {
6908 sd = sd->parent;
6909 if (sd)
6910 sd->child = NULL;
6911 }
6912
6913 sched_domain_debug(sd, cpu);
6914
6915 rq_attach_root(rq, rd);
6916 rcu_assign_pointer(rq->sd, sd);
6917}
6918
6919
6920static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
6921
6922
6923static int __init isolated_cpu_setup(char *str)
6924{
6925 static int __initdata ints[NR_CPUS];
6926 int i;
6927
6928 str = get_options(str, ARRAY_SIZE(ints), ints);
6929 cpus_clear(cpu_isolated_map);
6930 for (i = 1; i <= ints[0]; i++)
6931 if (ints[i] < NR_CPUS)
6932 cpu_set(ints[i], cpu_isolated_map);
6933 return 1;
6934}
6935
6936__setup("isolcpus=", isolated_cpu_setup);
6937
6938
6939 /*
6940  * init_sched_build_groups takes the cpumask we wish to span and a
6941  * group_fn() which maps a CPU (within cpu_map) to the group it
6942  * belongs to, returning the group number and, optionally, the
6943  * sched_group pointer for that CPU.
6944  *
6945  * It builds a circular linked list of the groups covering the span,
6946  * setting each group's ->cpumask and zeroing its ->__cpu_power.
6947  */
6948static void
6949init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6950 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
6951 struct sched_group **sg,
6952 cpumask_t *tmpmask),
6953 cpumask_t *covered, cpumask_t *tmpmask)
6954{
6955 struct sched_group *first = NULL, *last = NULL;
6956 int i;
6957
6958 cpus_clear(*covered);
6959
6960 for_each_cpu_mask_nr(i, *span) {
6961 struct sched_group *sg;
6962 int group = group_fn(i, cpu_map, &sg, tmpmask);
6963 int j;
6964
6965 if (cpu_isset(i, *covered))
6966 continue;
6967
6968 cpus_clear(sg->cpumask);
6969 sg->__cpu_power = 0;
6970
6971 for_each_cpu_mask_nr(j, *span) {
6972 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6973 continue;
6974
6975 cpu_set(j, *covered);
6976 cpu_set(j, sg->cpumask);
6977 }
6978 if (!first)
6979 first = sg;
6980 if (last)
6981 last->next = sg;
6982 last = sg;
6983 }
6984 last->next = first;
6985}
6986
6987#define SD_NODES_PER_DOMAIN 16
6988
6989#ifdef CONFIG_NUMA
6990
6991
6992
6993 /**
6994  * find_next_best_node - find the next node to include in a sched_domain
6995  * @node: node whose sched_domain we're building
6996  * @used_nodes: nodes already in the sched_domain
6997  *
6998  * Finds the closest node (by node_distance()) that has CPUs and is not
6999  * yet in @used_nodes, marks it used and returns it.
7000  */
7001static int find_next_best_node(int node, nodemask_t *used_nodes)
7002{
7003 int i, n, val, min_val, best_node = 0;
7004
7005 min_val = INT_MAX;
7006
7007 for (i = 0; i < nr_node_ids; i++) {
7008
7009 n = (node + i) % nr_node_ids;
7010
7011 if (!nr_cpus_node(n))
7012 continue;
7013
7014
7015 if (node_isset(n, *used_nodes))
7016 continue;
7017
7018
7019 val = node_distance(node, n);
7020
7021 if (val < min_val) {
7022 min_val = val;
7023 best_node = n;
7024 }
7025 }
7026
7027 node_set(best_node, *used_nodes);
7028 return best_node;
7029}
7030
7031
7032 /**
7033  * sched_domain_node_span - get a cpumask for a node's sched_domain
7034  * @node: node whose cpumask we're constructing
7035  * @span: resulting cpumask
7036  *
7037  * Builds a span containing @node's CPUs plus the CPUs of the
7038  * SD_NODES_PER_DOMAIN - 1 nearest other nodes.
7039  */
7040static void sched_domain_node_span(int node, cpumask_t *span)
7041{
7042 nodemask_t used_nodes;
7043 node_to_cpumask_ptr(nodemask, node);
7044 int i;
7045
7046 cpus_clear(*span);
7047 nodes_clear(used_nodes);
7048
7049 cpus_or(*span, *span, *nodemask);
7050 node_set(node, used_nodes);
7051
7052 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
7053 int next_node = find_next_best_node(node, &used_nodes);
7054
7055 node_to_cpumask_ptr_next(nodemask, next_node);
7056 cpus_or(*span, *span, *nodemask);
7057 }
7058}
7059#endif
7060
7061int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7062
7063
7064
7065
7066#ifdef CONFIG_SCHED_SMT
7067static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
7068static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
7069
7070static int
7071cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7072 cpumask_t *unused)
7073{
7074 if (sg)
7075 *sg = &per_cpu(sched_group_cpus, cpu);
7076 return cpu;
7077}
7078#endif
7079
7080
7081
7082
7083#ifdef CONFIG_SCHED_MC
7084static DEFINE_PER_CPU(struct sched_domain, core_domains);
7085static DEFINE_PER_CPU(struct sched_group, sched_group_core);
7086#endif
7087
7088#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7089static int
7090cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7091 cpumask_t *mask)
7092{
7093 int group;
7094
7095 *mask = per_cpu(cpu_sibling_map, cpu);
7096 cpus_and(*mask, *mask, *cpu_map);
7097 group = first_cpu(*mask);
7098 if (sg)
7099 *sg = &per_cpu(sched_group_core, group);
7100 return group;
7101}
7102#elif defined(CONFIG_SCHED_MC)
7103static int
7104cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7105 cpumask_t *unused)
7106{
7107 if (sg)
7108 *sg = &per_cpu(sched_group_core, cpu);
7109 return cpu;
7110}
7111#endif
7112
7113static DEFINE_PER_CPU(struct sched_domain, phys_domains);
7114static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
7115
7116static int
7117cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7118 cpumask_t *mask)
7119{
7120 int group;
7121#ifdef CONFIG_SCHED_MC
7122 *mask = cpu_coregroup_map(cpu);
7123 cpus_and(*mask, *mask, *cpu_map);
7124 group = first_cpu(*mask);
7125#elif defined(CONFIG_SCHED_SMT)
7126 *mask = per_cpu(cpu_sibling_map, cpu);
7127 cpus_and(*mask, *mask, *cpu_map);
7128 group = first_cpu(*mask);
7129#else
7130 group = cpu;
7131#endif
7132 if (sg)
7133 *sg = &per_cpu(sched_group_phys, group);
7134 return group;
7135}
7136
7137#ifdef CONFIG_NUMA
7138
7139
7140
7141
7142
7143static DEFINE_PER_CPU(struct sched_domain, node_domains);
7144static struct sched_group ***sched_group_nodes_bycpu;
7145
7146static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
7147static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
7148
7149static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
7150 struct sched_group **sg, cpumask_t *nodemask)
7151{
7152 int group;
7153
7154 *nodemask = node_to_cpumask(cpu_to_node(cpu));
7155 cpus_and(*nodemask, *nodemask, *cpu_map);
7156 group = first_cpu(*nodemask);
7157
7158 if (sg)
7159 *sg = &per_cpu(sched_group_allnodes, group);
7160 return group;
7161}
7162
7163static void init_numa_sched_groups_power(struct sched_group *group_head)
7164{
7165 struct sched_group *sg = group_head;
7166 int j;
7167
7168 if (!sg)
7169 return;
7170 do {
7171 for_each_cpu_mask_nr(j, sg->cpumask) {
7172 struct sched_domain *sd;
7173
7174 sd = &per_cpu(phys_domains, j);
7175 if (j != first_cpu(sd->groups->cpumask)) {
7176 /*
7177  * Only add "power" once for each physical package: skip all
7178  * but the first CPU of each phys_domains group.
7179  */
7180 continue;
7181 }
7182
7183 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
7184 }
7185 sg = sg->next;
7186 } while (sg != group_head);
7187}
7188#endif
7189
7190#ifdef CONFIG_NUMA
7191
7192static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7193{
7194 int cpu, i;
7195
7196 for_each_cpu_mask_nr(cpu, *cpu_map) {
7197 struct sched_group **sched_group_nodes
7198 = sched_group_nodes_bycpu[cpu];
7199
7200 if (!sched_group_nodes)
7201 continue;
7202
7203 for (i = 0; i < nr_node_ids; i++) {
7204 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7205
7206 *nodemask = node_to_cpumask(i);
7207 cpus_and(*nodemask, *nodemask, *cpu_map);
7208 if (cpus_empty(*nodemask))
7209 continue;
7210
7211 if (sg == NULL)
7212 continue;
7213 sg = sg->next;
7214next_sg:
7215 oldsg = sg;
7216 sg = sg->next;
7217 kfree(oldsg);
7218 if (oldsg != sched_group_nodes[i])
7219 goto next_sg;
7220 }
7221 kfree(sched_group_nodes);
7222 sched_group_nodes_bycpu[cpu] = NULL;
7223 }
7224}
7225#else
7226static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7227{
7228}
7229#endif
7230
7231
7232
7233
7234
7235
7236 /*
7237  * Initialize sched groups' cpu_power.
7238  *
7239  * cpu_power expresses the capacity of a sched group and is used when
7240  * distributing load between the groups of a sched domain. It is a
7241  * multiple of SCHED_LOAD_SCALE; normally all groups in a domain have
7242  * the same cpu_power, and any asymmetry makes the stronger group
7243  * pick up proportionally more load.
7244  */
7245static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7246{
7247 struct sched_domain *child;
7248 struct sched_group *group;
7249
7250 WARN_ON(!sd || !sd->groups);
7251
7252 if (cpu != first_cpu(sd->groups->cpumask))
7253 return;
7254
7255 child = sd->child;
7256
7257 sd->groups->__cpu_power = 0;
7258
7259
7260
7261 /*
7262  * No child, or the child's groups share CPU power or package
7263  * resources (SMT, shared caches) and we are not balancing for power
7264  * savings: give this group one task's worth of capacity and stop.
7265  */
7266 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
7267 (child->flags &
7268 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
7269 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
7270 return;
7271 }
7272
7273
7274
7275
7276 group = child->groups;
7277 do {
7278 sg_inc_cpu_power(sd->groups, group->__cpu_power);
7279 group = group->next;
7280 } while (group != child->groups);
7281}
7282
7283
7284 /*
7285  * Initializers for the sched domain levels. Not inlined, to reduce
7286  * accumulated stack pressure in build_sched_domains().
7287  */
7288#ifdef CONFIG_SCHED_DEBUG
7289# define SD_INIT_NAME(sd, type) sd->name = #type
7290#else
7291# define SD_INIT_NAME(sd, type) do { } while (0)
7292#endif
7293
7294#define SD_INIT(sd, type) sd_init_##type(sd)
7295
7296#define SD_INIT_FUNC(type) \
7297static noinline void sd_init_##type(struct sched_domain *sd) \
7298{ \
7299 memset(sd, 0, sizeof(*sd)); \
7300 *sd = SD_##type##_INIT; \
7301 sd->level = SD_LV_##type; \
7302 SD_INIT_NAME(sd, type); \
7303}
7304
7305SD_INIT_FUNC(CPU)
7306#ifdef CONFIG_NUMA
7307 SD_INIT_FUNC(ALLNODES)
7308 SD_INIT_FUNC(NODE)
7309#endif
7310#ifdef CONFIG_SCHED_SMT
7311 SD_INIT_FUNC(SIBLING)
7312#endif
7313#ifdef CONFIG_SCHED_MC
7314 SD_INIT_FUNC(MC)
7315#endif
7316
7317 /*
7318  * Scratch cpumasks for building sched domains. Masks never live at
7319  * the same time share storage via the union; with NR_CPUS > 128 the
7320  * whole struct is kmalloc()'d rather than kept on the stack.
7321  */
7322struct allmasks {
7323 cpumask_t tmpmask;
7324 union {
7325 cpumask_t nodemask;
7326 cpumask_t this_sibling_map;
7327 cpumask_t this_core_map;
7328 };
7329 cpumask_t send_covered;
7330
7331#ifdef CONFIG_NUMA
7332 cpumask_t domainspan;
7333 cpumask_t covered;
7334 cpumask_t notcovered;
7335#endif
7336};
7337
7338#if NR_CPUS > 128
7339#define SCHED_CPUMASK_ALLOC 1
7340#define SCHED_CPUMASK_FREE(v) kfree(v)
7341#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7342#else
7343#define SCHED_CPUMASK_ALLOC 0
7344#define SCHED_CPUMASK_FREE(v)
7345#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7346#endif
7347
7348#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7349 ((unsigned long)(a) + offsetof(struct allmasks, v))
7350
7351static int default_relax_domain_level = -1;
7352
7353static int __init setup_relax_domain_level(char *str)
7354{
7355 unsigned long val;
7356
7357 val = simple_strtoul(str, NULL, 0);
7358 if (val < SD_LV_MAX)
7359 default_relax_domain_level = val;
7360
7361 return 1;
7362}
7363__setup("relax_domain_level=", setup_relax_domain_level);
7364
7365static void set_domain_attribute(struct sched_domain *sd,
7366 struct sched_domain_attr *attr)
7367{
7368 int request;
7369
7370 if (!attr || attr->relax_domain_level < 0) {
7371 if (default_relax_domain_level < 0)
7372 return;
7373 else
7374 request = default_relax_domain_level;
7375 } else
7376 request = attr->relax_domain_level;
7377 if (request < sd->level) {
7378
7379 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
7380 } else {
7381
7382 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
7383 }
7384}
7385
7386 /*
7387  * Build sched domains for the given set of CPUs and attach those
7388  * domains to the individual CPUs.
7389  */
7390static int __build_sched_domains(const cpumask_t *cpu_map,
7391 struct sched_domain_attr *attr)
7392{
7393 int i;
7394 struct root_domain *rd;
7395 SCHED_CPUMASK_DECLARE(allmasks);
7396 cpumask_t *tmpmask;
7397#ifdef CONFIG_NUMA
7398 struct sched_group **sched_group_nodes = NULL;
7399 int sd_allnodes = 0;
7400
7401
7402
7403
7404 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
7405 GFP_KERNEL);
7406 if (!sched_group_nodes) {
7407 printk(KERN_WARNING "Can not alloc sched group node list\n");
7408 return -ENOMEM;
7409 }
7410#endif
7411
7412 rd = alloc_rootdomain();
7413 if (!rd) {
7414 printk(KERN_WARNING "Cannot alloc root domain\n");
7415#ifdef CONFIG_NUMA
7416 kfree(sched_group_nodes);
7417#endif
7418 return -ENOMEM;
7419 }
7420
7421#if SCHED_CPUMASK_ALLOC
7422
7423 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
7424 if (!allmasks) {
7425 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7426 kfree(rd);
7427#ifdef CONFIG_NUMA
7428 kfree(sched_group_nodes);
7429#endif
7430 return -ENOMEM;
7431 }
7432#endif
7433 tmpmask = (cpumask_t *)allmasks;
7434
7435
7436#ifdef CONFIG_NUMA
7437 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
7438#endif
7439
7440
7441
7442
7443 for_each_cpu_mask_nr(i, *cpu_map) {
7444 struct sched_domain *sd = NULL, *p;
7445 SCHED_CPUMASK_VAR(nodemask, allmasks);
7446
7447 *nodemask = node_to_cpumask(cpu_to_node(i));
7448 cpus_and(*nodemask, *nodemask, *cpu_map);
7449
7450#ifdef CONFIG_NUMA
7451 if (cpus_weight(*cpu_map) >
7452 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
7453 sd = &per_cpu(allnodes_domains, i);
7454 SD_INIT(sd, ALLNODES);
7455 set_domain_attribute(sd, attr);
7456 sd->span = *cpu_map;
7457 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7458 p = sd;
7459 sd_allnodes = 1;
7460 } else
7461 p = NULL;
7462
7463 sd = &per_cpu(node_domains, i);
7464 SD_INIT(sd, NODE);
7465 set_domain_attribute(sd, attr);
7466 sched_domain_node_span(cpu_to_node(i), &sd->span);
7467 sd->parent = p;
7468 if (p)
7469 p->child = sd;
7470 cpus_and(sd->span, sd->span, *cpu_map);
7471#endif
7472
7473 p = sd;
7474 sd = &per_cpu(phys_domains, i);
7475 SD_INIT(sd, CPU);
7476 set_domain_attribute(sd, attr);
7477 sd->span = *nodemask;
7478 sd->parent = p;
7479 if (p)
7480 p->child = sd;
7481 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
7482
7483#ifdef CONFIG_SCHED_MC
7484 p = sd;
7485 sd = &per_cpu(core_domains, i);
7486 SD_INIT(sd, MC);
7487 set_domain_attribute(sd, attr);
7488 sd->span = cpu_coregroup_map(i);
7489 cpus_and(sd->span, sd->span, *cpu_map);
7490 sd->parent = p;
7491 p->child = sd;
7492 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
7493#endif
7494
7495#ifdef CONFIG_SCHED_SMT
7496 p = sd;
7497 sd = &per_cpu(cpu_domains, i);
7498 SD_INIT(sd, SIBLING);
7499 set_domain_attribute(sd, attr);
7500 sd->span = per_cpu(cpu_sibling_map, i);
7501 cpus_and(sd->span, sd->span, *cpu_map);
7502 sd->parent = p;
7503 p->child = sd;
7504 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
7505#endif
7506 }
7507
7508#ifdef CONFIG_SCHED_SMT
7509
7510 for_each_cpu_mask_nr(i, *cpu_map) {
7511 SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
7512 SCHED_CPUMASK_VAR(send_covered, allmasks);
7513
7514 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7515 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7516 if (i != first_cpu(*this_sibling_map))
7517 continue;
7518
7519 init_sched_build_groups(this_sibling_map, cpu_map,
7520 &cpu_to_cpu_group,
7521 send_covered, tmpmask);
7522 }
7523#endif
7524
7525#ifdef CONFIG_SCHED_MC
7526
7527 for_each_cpu_mask_nr(i, *cpu_map) {
7528 SCHED_CPUMASK_VAR(this_core_map, allmasks);
7529 SCHED_CPUMASK_VAR(send_covered, allmasks);
7530
7531 *this_core_map = cpu_coregroup_map(i);
7532 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7533 if (i != first_cpu(*this_core_map))
7534 continue;
7535
7536 init_sched_build_groups(this_core_map, cpu_map,
7537 &cpu_to_core_group,
7538 send_covered, tmpmask);
7539 }
7540#endif
7541
7542
7543 for (i = 0; i < nr_node_ids; i++) {
7544 SCHED_CPUMASK_VAR(nodemask, allmasks);
7545 SCHED_CPUMASK_VAR(send_covered, allmasks);
7546
7547 *nodemask = node_to_cpumask(i);
7548 cpus_and(*nodemask, *nodemask, *cpu_map);
7549 if (cpus_empty(*nodemask))
7550 continue;
7551
7552 init_sched_build_groups(nodemask, cpu_map,
7553 &cpu_to_phys_group,
7554 send_covered, tmpmask);
7555 }
7556
7557#ifdef CONFIG_NUMA
7558
7559 if (sd_allnodes) {
7560 SCHED_CPUMASK_VAR(send_covered, allmasks);
7561
7562 init_sched_build_groups(cpu_map, cpu_map,
7563 &cpu_to_allnodes_group,
7564 send_covered, tmpmask);
7565 }
7566
7567 for (i = 0; i < nr_node_ids; i++) {
7568
7569 struct sched_group *sg, *prev;
7570 SCHED_CPUMASK_VAR(nodemask, allmasks);
7571 SCHED_CPUMASK_VAR(domainspan, allmasks);
7572 SCHED_CPUMASK_VAR(covered, allmasks);
7573 int j;
7574
7575 *nodemask = node_to_cpumask(i);
7576 cpus_clear(*covered);
7577
7578 cpus_and(*nodemask, *nodemask, *cpu_map);
7579 if (cpus_empty(*nodemask)) {
7580 sched_group_nodes[i] = NULL;
7581 continue;
7582 }
7583
7584 sched_domain_node_span(i, domainspan);
7585 cpus_and(*domainspan, *domainspan, *cpu_map);
7586
7587 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
7588 if (!sg) {
7589 printk(KERN_WARNING "Can not alloc domain group for "
7590 "node %d\n", i);
7591 goto error;
7592 }
7593 sched_group_nodes[i] = sg;
7594 for_each_cpu_mask_nr(j, *nodemask) {
7595 struct sched_domain *sd;
7596
7597 sd = &per_cpu(node_domains, j);
7598 sd->groups = sg;
7599 }
7600 sg->__cpu_power = 0;
7601 sg->cpumask = *nodemask;
7602 sg->next = sg;
7603 cpus_or(*covered, *covered, *nodemask);
7604 prev = sg;
7605
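 /*
  * Chain in one extra group per other node whose CPUs fall inside this
  * node's domain span and are not yet covered:
  */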
7606 for (j = 0; j < nr_node_ids; j++) {
7607 SCHED_CPUMASK_VAR(notcovered, allmasks);
7608 int n = (i + j) % nr_node_ids;
7609 node_to_cpumask_ptr(pnodemask, n);
7610
7611 cpus_complement(*notcovered, *covered);
7612 cpus_and(*tmpmask, *notcovered, *cpu_map);
7613 cpus_and(*tmpmask, *tmpmask, *domainspan);
7614 if (cpus_empty(*tmpmask))
7615 break;
7616
7617 cpus_and(*tmpmask, *tmpmask, *pnodemask);
7618 if (cpus_empty(*tmpmask))
7619 continue;
7620
7621 sg = kmalloc_node(sizeof(struct sched_group),
7622 GFP_KERNEL, i);
7623 if (!sg) {
7624 printk(KERN_WARNING
7625 "Can not alloc domain group for node %d\n", j);
7626 goto error;
7627 }
7628 sg->__cpu_power = 0;
7629 sg->cpumask = *tmpmask;
7630 sg->next = prev->next;
7631 cpus_or(*covered, *covered, *tmpmask);
7632 prev->next = sg;
7633 prev = sg;
7634 }
7635 }
7636#endif
7637
7638
7639#ifdef CONFIG_SCHED_SMT
7640 for_each_cpu_mask_nr(i, *cpu_map) {
7641 struct sched_domain *sd = &per_cpu(cpu_domains, i);
7642
7643 init_sched_groups_power(i, sd);
7644 }
7645#endif
7646#ifdef CONFIG_SCHED_MC
7647 for_each_cpu_mask_nr(i, *cpu_map) {
7648 struct sched_domain *sd = &per_cpu(core_domains, i);
7649
7650 init_sched_groups_power(i, sd);
7651 }
7652#endif
7653
7654 for_each_cpu_mask_nr(i, *cpu_map) {
7655 struct sched_domain *sd = &per_cpu(phys_domains, i);
7656
7657 init_sched_groups_power(i, sd);
7658 }
7659
7660#ifdef CONFIG_NUMA
7661 for (i = 0; i < nr_node_ids; i++)
7662 init_numa_sched_groups_power(sched_group_nodes[i]);
7663
7664 if (sd_allnodes) {
7665 struct sched_group *sg;
7666
7667 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
7668 tmpmask);
7669 init_numa_sched_groups_power(sg);
7670 }
7671#endif
7672
7673
7674 for_each_cpu_mask_nr(i, *cpu_map) {
7675 struct sched_domain *sd;
7676#ifdef CONFIG_SCHED_SMT
7677 sd = &per_cpu(cpu_domains, i);
7678#elif defined(CONFIG_SCHED_MC)
7679 sd = &per_cpu(core_domains, i);
7680#else
7681 sd = &per_cpu(phys_domains, i);
7682#endif
7683 cpu_attach_domain(sd, rd, i);
7684 }
7685
7686 SCHED_CPUMASK_FREE((void *)allmasks);
7687 return 0;
7688
7689#ifdef CONFIG_NUMA
7690error:
7691 free_sched_groups(cpu_map, tmpmask);
7692 SCHED_CPUMASK_FREE((void *)allmasks);
7693 kfree(rd);
7694 return -ENOMEM;
7695#endif
7696}
7697
7698static int build_sched_domains(const cpumask_t *cpu_map)
7699{
7700 return __build_sched_domains(cpu_map, NULL);
7701}
7702
7703 static cpumask_t *doms_cur;	/* current sched domains */
7704 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
7705 static struct sched_domain_attr *dattr_cur; /* attributes of domains in 'doms_cur' */
7706
7707
7708 /*
7709  * Special case: if allocating a doms_cur partition (an array of
7710  * cpumask_t) fails, fall back to the single statically allocated
7711  * cpumask_t fallback_doms below, covering one sched domain.
7712  */
7713static cpumask_t fallback_doms;
7714
7715void __attribute__((weak)) arch_update_cpu_topology(void)
7716{
7717}
7718
7719 /*
7720  * Set up the scheduler domains and groups. For now this just excludes
7721  * the isolated CPUs, but it could be extended to exclude other special
7722  * cases in the future. Callers must hold the hotplug lock.
7723  */
7724static int arch_init_sched_domains(const cpumask_t *cpu_map)
7725{
7726 int err;
7727
7728 arch_update_cpu_topology();
7729 ndoms_cur = 1;
7730 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
7731 if (!doms_cur)
7732 doms_cur = &fallback_doms;
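 /* The initial domain spans all requested CPUs except the isolated ones: */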
7733 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
7734 dattr_cur = NULL;
7735 err = build_sched_domains(doms_cur);
7736 register_sched_domain_sysctl();
7737
7738 return err;
7739}
7740
7741static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7742 cpumask_t *tmpmask)
7743{
7744 free_sched_groups(cpu_map, tmpmask);
7745}
7746
7747 /*
7748  * Detach sched domains from the CPUs in cpu_map; those CPUs are then
7749  * attached to the NULL domain and no longer load-balanced.
7750  */
7751static void detach_destroy_domains(const cpumask_t *cpu_map)
7752{
7753 cpumask_t tmpmask;
7754 int i;
7755
7756 unregister_sched_domain_sysctl();
7757
7758 for_each_cpu_mask_nr(i, *cpu_map)
7759 cpu_attach_domain(NULL, &def_root_domain, i);
7760 synchronize_sched();
7761 arch_destroy_sched_domains(cpu_map, &tmpmask);
7762}
7763
7764
7765static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7766 struct sched_domain_attr *new, int idx_new)
7767{
7768 struct sched_domain_attr tmp;
7769
7770
7771 if (!new && !cur)
7772 return 1;
7773
7774 tmp = SD_ATTR_INIT;
7775 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7776 new ? (new + idx_new) : &tmp,
7777 sizeof(struct sched_domain_attr));
7778}
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792 /*
7793  * partition_sched_domains - repartition the sched domains.
7794  *
7795  * 'doms_new' is an array of 'ndoms_new' non-overlapping cpumasks, one
7796  * per desired sched domain; CPUs in none of the masks are not load-
7797  * balanced. The new partition is compared with the current doms_cur[]:
7798  * deleted domains are destroyed, new domains are built, and domains
7799  * that match an existing one (same mask and attributes) are reused.
7800  *
7801  * The routine takes ownership of 'doms_new' and kfree()s the previous
7802  * partition when done. Passing doms_new == NULL falls back to the
7803  * single 'fallback_doms' domain (cpu_online_map minus isolated CPUs);
7804  * ndoms_new == 0 destroys the existing domains without rebuilding.
7805  */
7806void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7807 struct sched_domain_attr *dattr_new)
7808{
7809 int i, j, n;
7810
7811 mutex_lock(&sched_domains_mutex);
7812
7813
7814 unregister_sched_domain_sysctl();
7815
7816 n = doms_new ? ndoms_new : 0;
7817
7818
7819 for (i = 0; i < ndoms_cur; i++) {
7820 for (j = 0; j < n; j++) {
7821 if (cpus_equal(doms_cur[i], doms_new[j])
7822 && dattrs_equal(dattr_cur, i, dattr_new, j))
7823 goto match1;
7824 }
7825
7826 detach_destroy_domains(doms_cur + i);
7827match1:
7828 ;
7829 }
7830
7831 if (doms_new == NULL) {
7832 ndoms_cur = 0;
7833 doms_new = &fallback_doms;
7834 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7835 dattr_new = NULL;
7836 }
7837
7838
7839 for (i = 0; i < ndoms_new; i++) {
7840 for (j = 0; j < ndoms_cur; j++) {
7841 if (cpus_equal(doms_new[i], doms_cur[j])
7842 && dattrs_equal(dattr_new, i, dattr_cur, j))
7843 goto match2;
7844 }
7845
7846 __build_sched_domains(doms_new + i,
7847 dattr_new ? dattr_new + i : NULL);
7848match2:
7849 ;
7850 }
7851
7852
7853 if (doms_cur != &fallback_doms)
7854 kfree(doms_cur);
7855 kfree(dattr_cur);
7856 doms_cur = doms_new;
7857 dattr_cur = dattr_new;
7858 ndoms_cur = ndoms_new;
7859
7860 register_sched_domain_sysctl();
7861
7862 mutex_unlock(&sched_domains_mutex);
7863}
7864
7865#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7866int arch_reinit_sched_domains(void)
7867{
7868 get_online_cpus();
7869
7870
7871 partition_sched_domains(0, NULL, NULL);
7872
7873 rebuild_sched_domains();
7874 put_online_cpus();
7875
7876 return 0;
7877}
7878
7879static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7880{
7881 int ret;
7882
7883 if (buf[0] != '0' && buf[0] != '1')
7884 return -EINVAL;
7885
7886 if (smt)
7887 sched_smt_power_savings = (buf[0] == '1');
7888 else
7889 sched_mc_power_savings = (buf[0] == '1');
7890
7891 ret = arch_reinit_sched_domains();
7892
7893 return ret ? ret : count;
7894}
7895
7896#ifdef CONFIG_SCHED_MC
7897static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7898 char *page)
7899{
7900 return sprintf(page, "%u\n", sched_mc_power_savings);
7901}
7902static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7903 const char *buf, size_t count)
7904{
7905 return sched_power_savings_store(buf, count, 0);
7906}
7907static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7908 sched_mc_power_savings_show,
7909 sched_mc_power_savings_store);
7910#endif
7911
7912#ifdef CONFIG_SCHED_SMT
7913static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7914 char *page)
7915{
7916 return sprintf(page, "%u\n", sched_smt_power_savings);
7917}
7918static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7919 const char *buf, size_t count)
7920{
7921 return sched_power_savings_store(buf, count, 1);
7922}
7923static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7924 sched_smt_power_savings_show,
7925 sched_smt_power_savings_store);
7926#endif
7927
7928int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7929{
7930 int err = 0;
7931
7932#ifdef CONFIG_SCHED_SMT
7933 if (smt_capable())
7934 err = sysfs_create_file(&cls->kset.kobj,
7935 &attr_sched_smt_power_savings.attr);
7936#endif
7937#ifdef CONFIG_SCHED_MC
7938 if (!err && mc_capable())
7939 err = sysfs_create_file(&cls->kset.kobj,
7940 &attr_sched_mc_power_savings.attr);
7941#endif
7942 return err;
7943}
7944#endif
7945
7946#ifndef CONFIG_CPUSETS
7947 /*
7948  * Without cpusets, react to CPU hotplug by rebuilding the single
7949  * sched domain partition; with cpusets, cpusets take over this job.
7950  */
7951static int update_sched_domains(struct notifier_block *nfb,
7952 unsigned long action, void *hcpu)
7953{
7954 switch (action) {
7955 case CPU_ONLINE:
7956 case CPU_ONLINE_FROZEN:
7957 case CPU_DEAD:
7958 case CPU_DEAD_FROZEN:
7959 partition_sched_domains(1, NULL, NULL);
7960 return NOTIFY_OK;
7961
7962 default:
7963 return NOTIFY_DONE;
7964 }
7965}
7966#endif
7967
7968static int update_runtime(struct notifier_block *nfb,
7969 unsigned long action, void *hcpu)
7970{
7971 int cpu = (int)(long)hcpu;
7972
7973 switch (action) {
7974 case CPU_DOWN_PREPARE:
7975 case CPU_DOWN_PREPARE_FROZEN:
7976 disable_runtime(cpu_rq(cpu));
7977 return NOTIFY_OK;
7978
7979 case CPU_DOWN_FAILED:
7980 case CPU_DOWN_FAILED_FROZEN:
7981 case CPU_ONLINE:
7982 case CPU_ONLINE_FROZEN:
7983 enable_runtime(cpu_rq(cpu));
7984 return NOTIFY_OK;
7985
7986 default:
7987 return NOTIFY_DONE;
7988 }
7989}
7990
7991void __init sched_init_smp(void)
7992{
7993 cpumask_t non_isolated_cpus;
7994
7995#if defined(CONFIG_NUMA)
7996 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7997 GFP_KERNEL);
7998 BUG_ON(sched_group_nodes_bycpu == NULL);
7999#endif
8000 get_online_cpus();
8001 mutex_lock(&sched_domains_mutex);
8002 arch_init_sched_domains(&cpu_online_map);
8003 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
8004 if (cpus_empty(non_isolated_cpus))
8005 cpu_set(smp_processor_id(), non_isolated_cpus);
8006 mutex_unlock(&sched_domains_mutex);
8007 put_online_cpus();
8008
8009#ifndef CONFIG_CPUSETS
8010
8011 hotcpu_notifier(update_sched_domains, 0);
8012#endif
8013
8014
8015 hotcpu_notifier(update_runtime, 0);
8016
8017 init_hrtick();
8018
8019
8020 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
8021 BUG();
8022 sched_init_granularity();
8023}
8024#else
8025void __init sched_init_smp(void)
8026{
8027 sched_init_granularity();
8028}
8029#endif
8030
8031int in_sched_functions(unsigned long addr)
8032{
8033 return in_lock_functions(addr) ||
8034 (addr >= (unsigned long)__sched_text_start
8035 && addr < (unsigned long)__sched_text_end);
8036}
8037
8038static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
8039{
8040 cfs_rq->tasks_timeline = RB_ROOT;
8041 INIT_LIST_HEAD(&cfs_rq->tasks);
8042#ifdef CONFIG_FAIR_GROUP_SCHED
8043 cfs_rq->rq = rq;
8044#endif
8045 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8046}
8047
8048static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8049{
8050 struct rt_prio_array *array;
8051 int i;
8052
8053 array = &rt_rq->active;
8054 for (i = 0; i < MAX_RT_PRIO; i++) {
8055 INIT_LIST_HEAD(array->queue + i);
8056 __clear_bit(i, array->bitmap);
8057 }
8058
8059 __set_bit(MAX_RT_PRIO, array->bitmap);
8060
8061#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8062 rt_rq->highest_prio = MAX_RT_PRIO;
8063#endif
8064#ifdef CONFIG_SMP
8065 rt_rq->rt_nr_migratory = 0;
8066 rt_rq->overloaded = 0;
8067#endif
8068
8069 rt_rq->rt_time = 0;
8070 rt_rq->rt_throttled = 0;
8071 rt_rq->rt_runtime = 0;
8072 spin_lock_init(&rt_rq->rt_runtime_lock);
8073
8074#ifdef CONFIG_RT_GROUP_SCHED
8075 rt_rq->rt_nr_boosted = 0;
8076 rt_rq->rq = rq;
8077#endif
8078}
8079
8080#ifdef CONFIG_FAIR_GROUP_SCHED
8081static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8082 struct sched_entity *se, int cpu, int add,
8083 struct sched_entity *parent)
8084{
8085 struct rq *rq = cpu_rq(cpu);
8086 tg->cfs_rq[cpu] = cfs_rq;
8087 init_cfs_rq(cfs_rq, rq);
8088 cfs_rq->tg = tg;
8089 if (add)
8090 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
8091
8092 tg->se[cpu] = se;
8093
8094 if (!se)
8095 return;
8096
8097 if (!parent)
8098 se->cfs_rq = &rq->cfs;
8099 else
8100 se->cfs_rq = parent->my_q;
8101
8102 se->my_q = cfs_rq;
8103 se->load.weight = tg->shares;
8104 se->load.inv_weight = 0;
8105 se->parent = parent;
8106}
8107#endif
8108
8109#ifdef CONFIG_RT_GROUP_SCHED
8110static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8111 struct sched_rt_entity *rt_se, int cpu, int add,
8112 struct sched_rt_entity *parent)
8113{
8114 struct rq *rq = cpu_rq(cpu);
8115
8116 tg->rt_rq[cpu] = rt_rq;
8117 init_rt_rq(rt_rq, rq);
8118 rt_rq->tg = tg;
8119 rt_rq->rt_se = rt_se;
8120 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8121 if (add)
8122 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
8123
8124 tg->rt_se[cpu] = rt_se;
8125 if (!rt_se)
8126 return;
8127
8128 if (!parent)
8129 rt_se->rt_rq = &rq->rt;
8130 else
8131 rt_se->rt_rq = parent->my_q;
8132
8133 rt_se->my_q = rt_rq;
8134 rt_se->parent = parent;
8135 INIT_LIST_HEAD(&rt_se->run_list);
8136}
8137#endif
8138
8139void __init sched_init(void)
8140{
8141 int i, j;
8142 unsigned long alloc_size = 0, ptr;
8143
8144#ifdef CONFIG_FAIR_GROUP_SCHED
8145 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8146#endif
8147#ifdef CONFIG_RT_GROUP_SCHED
8148 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8149#endif
8150#ifdef CONFIG_USER_SCHED
8151 alloc_size *= 2;
8152#endif
8153
8154 /*
8155  * Allocate the group-scheduling pointer arrays in one bootmem block:
8156  */
8157 if (alloc_size) {
8158 ptr = (unsigned long)alloc_bootmem(alloc_size);
8159
8160#ifdef CONFIG_FAIR_GROUP_SCHED
8161 init_task_group.se = (struct sched_entity **)ptr;
8162 ptr += nr_cpu_ids * sizeof(void **);
8163
8164 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
8165 ptr += nr_cpu_ids * sizeof(void **);
8166
8167#ifdef CONFIG_USER_SCHED
8168 root_task_group.se = (struct sched_entity **)ptr;
8169 ptr += nr_cpu_ids * sizeof(void **);
8170
8171 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8172 ptr += nr_cpu_ids * sizeof(void **);
8173#endif
8174#endif
8175#ifdef CONFIG_RT_GROUP_SCHED
8176 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
8177 ptr += nr_cpu_ids * sizeof(void **);
8178
8179 init_task_group.rt_rq = (struct rt_rq **)ptr;
8180 ptr += nr_cpu_ids * sizeof(void **);
8181
8182#ifdef CONFIG_USER_SCHED
8183 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8184 ptr += nr_cpu_ids * sizeof(void **);
8185
8186 root_task_group.rt_rq = (struct rt_rq **)ptr;
8187 ptr += nr_cpu_ids * sizeof(void **);
8188#endif
8189#endif
8190 }
8191
8192#ifdef CONFIG_SMP
8193 init_defrootdomain();
8194#endif
8195
8196 init_rt_bandwidth(&def_rt_bandwidth,
8197 global_rt_period(), global_rt_runtime());
8198
8199#ifdef CONFIG_RT_GROUP_SCHED
8200 init_rt_bandwidth(&init_task_group.rt_bandwidth,
8201 global_rt_period(), global_rt_runtime());
8202#ifdef CONFIG_USER_SCHED
8203 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8204 global_rt_period(), RUNTIME_INF);
8205#endif
8206#endif
8207
8208#ifdef CONFIG_GROUP_SCHED
8209 list_add(&init_task_group.list, &task_groups);
8210 INIT_LIST_HEAD(&init_task_group.children);
8211
8212#ifdef CONFIG_USER_SCHED
8213 INIT_LIST_HEAD(&root_task_group.children);
8214 init_task_group.parent = &root_task_group;
8215 list_add(&init_task_group.siblings, &root_task_group.children);
8216#endif
8217#endif
8218
8219 for_each_possible_cpu(i) {
8220 struct rq *rq;
8221
8222 rq = cpu_rq(i);
8223 spin_lock_init(&rq->lock);
8224 rq->nr_running = 0;
8225 init_cfs_rq(&rq->cfs, rq);
8226 init_rt_rq(&rq->rt, rq);
8227#ifdef CONFIG_FAIR_GROUP_SCHED
8228 init_task_group.shares = init_task_group_load;
8229 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8230#ifdef CONFIG_CGROUP_SCHED
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245 /*
8246  * With CONFIG_CGROUP_SCHED, init_task_group is the root group: it
8247  * owns each CPU's top-level cfs_rq directly and has no sched_entity
8248  * of its own; child task groups are created via the cgroup fs.
8249  */
8250 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
8251#elif defined CONFIG_USER_SCHED
8252 root_task_group.shares = NICE_0_LOAD;
8253 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
		/*
		 * For task groups formed from user ids, init_task_group
		 * represents the root user's tasks and is a sibling of every
		 * other user's group, so it only gets a fraction of the cpu.
		 * Its tasks therefore sit in a separate cfs_rq (init_cfs_rq),
		 * represented in rq->cfs by one entity per cpu.
		 */
8265 init_tg_cfs_entry(&init_task_group,
8266 &per_cpu(init_cfs_rq, i),
8267 &per_cpu(init_sched_entity, i), i, 1,
8268 root_task_group.se[i]);
8269
8270#endif
8271#endif
8272
8273 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8274#ifdef CONFIG_RT_GROUP_SCHED
8275 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8276#ifdef CONFIG_CGROUP_SCHED
8277 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8278#elif defined CONFIG_USER_SCHED
8279 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
8280 init_tg_rt_entry(&init_task_group,
8281 &per_cpu(init_rt_rq, i),
8282 &per_cpu(init_sched_rt_entity, i), i, 1,
8283 root_task_group.rt_se[i]);
8284#endif
8285#endif
8286
8287 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
8288 rq->cpu_load[j] = 0;
8289#ifdef CONFIG_SMP
8290 rq->sd = NULL;
8291 rq->rd = NULL;
8292 rq->active_balance = 0;
8293 rq->next_balance = jiffies;
8294 rq->push_cpu = 0;
8295 rq->cpu = i;
8296 rq->online = 0;
8297 rq->migration_thread = NULL;
8298 INIT_LIST_HEAD(&rq->migration_queue);
8299 rq_attach_root(rq, &def_root_domain);
8300#endif
8301 init_rq_hrtick(rq);
8302 atomic_set(&rq->nr_iowait, 0);
8303 }
8304
8305 set_load_weight(&init_task);
8306
8307#ifdef CONFIG_PREEMPT_NOTIFIERS
8308 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8309#endif
8310
8311#ifdef CONFIG_SMP
8312 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8313#endif
8314
8315#ifdef CONFIG_RT_MUTEXES
8316 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
8317#endif
8318
	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
8322 atomic_inc(&init_mm.mm_count);
8323 enter_lazy_tlb(&init_mm, current);
8324
	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
8331 init_idle(current, smp_processor_id());
8332
	/*
	 * During early bootup we pretend to be a normal task:
	 */
8335 current->sched_class = &fair_sched_class;
8336
8337 scheduler_running = 1;
8338}
8339
8340#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
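/*
 * Warn (at most once per second) when a function that may sleep is called
 * from an invalid context (atomic section or with irqs disabled).
 */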
8341void __might_sleep(char *file, int line)
8342{
8343#ifdef in_atomic
8344 static unsigned long prev_jiffy;
8345
8346 if ((!in_atomic() && !irqs_disabled()) ||
8347 system_state != SYSTEM_RUNNING || oops_in_progress)
8348 return;
8349 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8350 return;
8351 prev_jiffy = jiffies;
8352
8353 printk(KERN_ERR
8354 "BUG: sleeping function called from invalid context at %s:%d\n",
8355 file, line);
8356 printk(KERN_ERR
8357 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8358 in_atomic(), irqs_disabled(),
8359 current->pid, current->comm);
8360
8361 debug_show_held_locks(current);
8362 if (irqs_disabled())
8363 print_irqtrace_events(current);
8364 dump_stack();
8365#endif
8366}
8367EXPORT_SYMBOL(__might_sleep);
8368#endif
8369
8370#ifdef CONFIG_MAGIC_SYSRQ
8371static void normalize_task(struct rq *rq, struct task_struct *p)
8372{
8373 int on_rq;
8374
8375 update_rq_clock(rq);
8376 on_rq = p->se.on_rq;
8377 if (on_rq)
8378 deactivate_task(rq, p, 0);
8379 __setscheduler(rq, p, SCHED_NORMAL, 0);
8380 if (on_rq) {
8381 activate_task(rq, p, 0);
8382 resched_task(rq->curr);
8383 }
8384}
8385
8386void normalize_rt_tasks(void)
8387{
8388 struct task_struct *g, *p;
8389 unsigned long flags;
8390 struct rq *rq;
8391
8392 read_lock_irqsave(&tasklist_lock, flags);
8393 do_each_thread(g, p) {
		/*
		 * Only normalize user tasks:
		 */
8397 if (!p->mm)
8398 continue;
8399
8400 p->se.exec_start = 0;
8401#ifdef CONFIG_SCHEDSTATS
8402 p->se.wait_start = 0;
8403 p->se.sleep_start = 0;
8404 p->se.block_start = 0;
8405#endif
8406
8407 if (!rt_task(p)) {
			/*
			 * Renice negative nice level userspace
			 * tasks back to 0:
			 */
8412 if (TASK_NICE(p) < 0 && p->mm)
8413 set_user_nice(p, 0);
8414 continue;
8415 }
8416
8417 spin_lock(&p->pi_lock);
8418 rq = __task_rq_lock(p);
8419
8420 normalize_task(rq, p);
8421
8422 __task_rq_unlock(rq);
8423 spin_unlock(&p->pi_lock);
8424 } while_each_thread(g, p);
8425
8426 read_unlock_irqrestore(&tasklist_lock, flags);
8427}
8428
8429#endif
8430
8431#ifdef CONFIG_IA64

/*
 * These functions are only useful for the IA64 MCA handling.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place. Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
8448struct task_struct *curr_task(int cpu)
8449{
8450 return cpu_curr(cpu);
8451}
8452
/**
 * set_curr_task - set the current task for a given cpu.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a cpu in a non-blocking manner. The caller
 * must have all CPUs synchronized and interrupts disabled, must save the
 * original value of the current task (see curr_task() above) and must restore
 * that value before reenabling interrupts and returning.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
8468void set_curr_task(int cpu, struct task_struct *p)
8469{
8470 cpu_curr(cpu) = p;
8471}
8472
8473#endif
8474
8475#ifdef CONFIG_FAIR_GROUP_SCHED
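/* Free a task group's per-cpu cfs runqueues and scheduling entities. */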
8476static void free_fair_sched_group(struct task_group *tg)
8477{
8478 int i;
8479
8480 for_each_possible_cpu(i) {
8481 if (tg->cfs_rq)
8482 kfree(tg->cfs_rq[i]);
8483 if (tg->se)
8484 kfree(tg->se[i]);
8485 }
8486
8487 kfree(tg->cfs_rq);
8488 kfree(tg->se);
8489}
8490
8491static
8492int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8493{
8494 struct cfs_rq *cfs_rq;
8495 struct sched_entity *se, *parent_se;
8496 struct rq *rq;
8497 int i;
8498
8499 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8500 if (!tg->cfs_rq)
8501 goto err;
8502 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8503 if (!tg->se)
8504 goto err;
8505
8506 tg->shares = NICE_0_LOAD;
8507
8508 for_each_possible_cpu(i) {
8509 rq = cpu_rq(i);
8510
8511 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
8512 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8513 if (!cfs_rq)
8514 goto err;
8515
8516 se = kmalloc_node(sizeof(struct sched_entity),
8517 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8518 if (!se)
8519 goto err;
8520
8521 parent_se = parent ? parent->se[i] : NULL;
8522 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8523 }
8524
8525 return 1;
8526
8527 err:
8528 return 0;
8529}
8530
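/* Add the group's cfs_rq on @cpu to that runqueue's leaf list. */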
8531static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8532{
8533 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8534 &cpu_rq(cpu)->leaf_cfs_rq_list);
8535}
8536
8537static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8538{
8539 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8540}
8541#else
8542static inline void free_fair_sched_group(struct task_group *tg)
8543{
8544}
8545
8546static inline
8547int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8548{
8549 return 1;
8550}
8551
8552static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8553{
8554}
8555
8556static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8557{
8558}
8559#endif
8560
8561#ifdef CONFIG_RT_GROUP_SCHED
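/* Tear down the group's rt bandwidth timer and free its per-cpu rt state. */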
8562static void free_rt_sched_group(struct task_group *tg)
8563{
8564 int i;
8565
8566 destroy_rt_bandwidth(&tg->rt_bandwidth);
8567
8568 for_each_possible_cpu(i) {
8569 if (tg->rt_rq)
8570 kfree(tg->rt_rq[i]);
8571 if (tg->rt_se)
8572 kfree(tg->rt_se[i]);
8573 }
8574
8575 kfree(tg->rt_rq);
8576 kfree(tg->rt_se);
8577}
8578
8579static
8580int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8581{
8582 struct rt_rq *rt_rq;
8583 struct sched_rt_entity *rt_se, *parent_se;
8584 struct rq *rq;
8585 int i;
8586
8587 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8588 if (!tg->rt_rq)
8589 goto err;
8590 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8591 if (!tg->rt_se)
8592 goto err;
8593
8594 init_rt_bandwidth(&tg->rt_bandwidth,
8595 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8596
8597 for_each_possible_cpu(i) {
8598 rq = cpu_rq(i);
8599
8600 rt_rq = kmalloc_node(sizeof(struct rt_rq),
8601 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8602 if (!rt_rq)
8603 goto err;
8604
8605 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
8606 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8607 if (!rt_se)
8608 goto err;
8609
8610 parent_se = parent ? parent->rt_se[i] : NULL;
8611 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
8612 }
8613
8614 return 1;
8615
8616 err:
8617 return 0;
8618}
8619
8620static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8621{
8622 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8623 &cpu_rq(cpu)->leaf_rt_rq_list);
8624}
8625
8626static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8627{
8628 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8629}
8630#else
8631static inline void free_rt_sched_group(struct task_group *tg)
8632{
8633}
8634
8635static inline
8636int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8637{
8638 return 1;
8639}
8640
8641static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8642{
8643}
8644
8645static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8646{
8647}
8648#endif
8649
8650#ifdef CONFIG_GROUP_SCHED
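/* Release everything a task group owns: fair state, rt state and the group itself. */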
8651static void free_sched_group(struct task_group *tg)
8652{
8653 free_fair_sched_group(tg);
8654 free_rt_sched_group(tg);
8655 kfree(tg);
8656}
8657
8658
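/* allocate runqueue etc for a new task group */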
8659struct task_group *sched_create_group(struct task_group *parent)
8660{
8661 struct task_group *tg;
8662 unsigned long flags;
8663 int i;
8664
8665 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8666 if (!tg)
8667 return ERR_PTR(-ENOMEM);
8668
8669 if (!alloc_fair_sched_group(tg, parent))
8670 goto err;
8671
8672 if (!alloc_rt_sched_group(tg, parent))
8673 goto err;
8674
8675 spin_lock_irqsave(&task_group_lock, flags);
8676 for_each_possible_cpu(i) {
8677 register_fair_sched_group(tg, i);
8678 register_rt_sched_group(tg, i);
8679 }
8680 list_add_rcu(&tg->list, &task_groups);
8681
8682 WARN_ON(!parent);
8683
8684 tg->parent = parent;
8685 INIT_LIST_HEAD(&tg->children);
8686 list_add_rcu(&tg->siblings, &parent->children);
8687 spin_unlock_irqrestore(&task_group_lock, flags);
8688
8689 return tg;
8690
8691err:
8692 free_sched_group(tg);
8693 return ERR_PTR(-ENOMEM);
8694}
8695
8696
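/* rcu callback to free the structures associated with a task group */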
8697static void free_sched_group_rcu(struct rcu_head *rhp)
8698{
	/* now it should be safe to free those cfs_rqs */
8700 free_sched_group(container_of(rhp, struct task_group, rcu));
8701}
8702
8703
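/* destroy runqueue etc associated with a task group */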
8704void sched_destroy_group(struct task_group *tg)
8705{
8706 unsigned long flags;
8707 int i;
8708
8709 spin_lock_irqsave(&task_group_lock, flags);
8710 for_each_possible_cpu(i) {
8711 unregister_fair_sched_group(tg, i);
8712 unregister_rt_sched_group(tg, i);
8713 }
8714 list_del_rcu(&tg->list);
8715 list_del_rcu(&tg->siblings);
8716 spin_unlock_irqrestore(&task_group_lock, flags);
8717
	/* wait for possible concurrent references to the cfs_rqs to complete */
8719 call_rcu(&tg->rcu, free_sched_group_rcu);
8720}
8721
/*
 * Change a task's runqueue when it moves between groups.
 *
 * The caller should already have put the task in its new group; this
 * function just switches the task's per-cpu group runqueue links over
 * to the new group.
 */
8727void sched_move_task(struct task_struct *tsk)
8728{
8729 int on_rq, running;
8730 unsigned long flags;
8731 struct rq *rq;
8732
8733 rq = task_rq_lock(tsk, &flags);
8734
8735 update_rq_clock(rq);
8736
8737 running = task_current(rq, tsk);
8738 on_rq = tsk->se.on_rq;
8739
8740 if (on_rq)
8741 dequeue_task(rq, tsk, 0);
8742 if (unlikely(running))
8743 tsk->sched_class->put_prev_task(rq, tsk);
8744
8745 set_task_rq(tsk, task_cpu(tsk));
8746
8747#ifdef CONFIG_FAIR_GROUP_SCHED
8748 if (tsk->sched_class->moved_group)
8749 tsk->sched_class->moved_group(tsk);
8750#endif
8751
8752 if (unlikely(running))
8753 tsk->sched_class->set_curr_task(rq);
8754 if (on_rq)
8755 enqueue_task(rq, tsk, 0);
8756
8757 task_rq_unlock(rq, &flags);
8758}
8759#endif
8760
8761#ifdef CONFIG_FAIR_GROUP_SCHED
8762static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8763{
8764 struct cfs_rq *cfs_rq = se->cfs_rq;
8765 int on_rq;
8766
8767 on_rq = se->on_rq;
8768 if (on_rq)
8769 dequeue_entity(cfs_rq, se, 0);
8770
8771 se->load.weight = shares;
8772 se->load.inv_weight = 0;
8773
8774 if (on_rq)
8775 enqueue_entity(cfs_rq, se, 0);
8776}
8777
8778static void set_se_shares(struct sched_entity *se, unsigned long shares)
8779{
8780 struct cfs_rq *cfs_rq = se->cfs_rq;
8781 struct rq *rq = cfs_rq->rq;
8782 unsigned long flags;
8783
8784 spin_lock_irqsave(&rq->lock, flags);
8785 __set_se_shares(se, shares);
8786 spin_unlock_irqrestore(&rq->lock, flags);
8787}
8788
8789static DEFINE_MUTEX(shares_mutex);
8790
8791int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8792{
8793 int i;
8794 unsigned long flags;
8795
	/*
	 * We can't change the weight of the root cgroup.
	 */
8799 if (!tg->se[0])
8800 return -EINVAL;
8801
8802 if (shares < MIN_SHARES)
8803 shares = MIN_SHARES;
8804 else if (shares > MAX_SHARES)
8805 shares = MAX_SHARES;
8806
8807 mutex_lock(&shares_mutex);
8808 if (tg->shares == shares)
8809 goto done;
8810
8811 spin_lock_irqsave(&task_group_lock, flags);
8812 for_each_possible_cpu(i)
8813 unregister_fair_sched_group(tg, i);
8814 list_del_rcu(&tg->siblings);
8815 spin_unlock_irqrestore(&task_group_lock, flags);
8816
	/* wait for any ongoing reference to this group to finish */
8818 synchronize_sched();
8819
	/*
	 * Now we are free to modify the group's share on each cpu
	 * without racing with load balancing on that cpu.
	 */
8824 tg->shares = shares;
8825 for_each_possible_cpu(i) {
		/*
		 * force a rebalance
		 */
8829 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8830 set_se_shares(tg->se[i], shares);
8831 }
8832
	/*
	 * Enable load balance activity on this group, by inserting it back on
	 * each cpu's rq->leaf_cfs_rq_list.
	 */
8837 spin_lock_irqsave(&task_group_lock, flags);
8838 for_each_possible_cpu(i)
8839 register_fair_sched_group(tg, i);
8840 list_add_rcu(&tg->siblings, &tg->parent->children);
8841 spin_unlock_irqrestore(&task_group_lock, flags);
8842done:
8843 mutex_unlock(&shares_mutex);
8844 return 0;
8845}
8846
8847unsigned long sched_group_shares(struct task_group *tg)
8848{
8849 return tg->shares;
8850}
8851#endif
8852
8853#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 */
8857static DEFINE_MUTEX(rt_constraints_mutex);
8858
8859static unsigned long to_ratio(u64 period, u64 runtime)
8860{
8861 if (runtime == RUNTIME_INF)
8862 return 1ULL << 20;
8863
8864 return div64_u64(runtime << 20, period);
8865}
8866
/* Must be called with tasklist_lock held */
8868static inline int tg_has_rt_tasks(struct task_group *tg)
8869{
8870 struct task_struct *g, *p;
8871
8872 do_each_thread(g, p) {
8873 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8874 return 1;
8875 } while_each_thread(g, p);
8876
8877 return 0;
8878}
8879
8880struct rt_schedulable_data {
8881 struct task_group *tg;
8882 u64 rt_period;
8883 u64 rt_runtime;
8884};
8885
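/*
 * Verify that @tg's (proposed) rt bandwidth is schedulable: runtime must not
 * exceed the period, running RT tasks must not be starved, the group must fit
 * within the global limit, and its children's allocations must fit within it.
 */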
8886static int tg_schedulable(struct task_group *tg, void *data)
8887{
8888 struct rt_schedulable_data *d = data;
8889 struct task_group *child;
8890 unsigned long total, sum = 0;
8891 u64 period, runtime;
8892
8893 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8894 runtime = tg->rt_bandwidth.rt_runtime;
8895
8896 if (tg == d->tg) {
8897 period = d->rt_period;
8898 runtime = d->rt_runtime;
8899 }
8900
	/*
	 * Cannot have more runtime than the period.
	 */
8904 if (runtime > period && runtime != RUNTIME_INF)
8905 return -EINVAL;
8906
	/*
	 * Ensure we don't starve existing RT tasks.
	 */
8910 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8911 return -EBUSY;
8912
8913 total = to_ratio(period, runtime);
8914
	/*
	 * Nobody can have more than the global setting allows.
	 */
8918 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8919 return -EINVAL;
8920
	/*
	 * The sum of our children's runtime should not exceed our own.
	 */
8924 list_for_each_entry_rcu(child, &tg->children, siblings) {
8925 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8926 runtime = child->rt_bandwidth.rt_runtime;
8927
8928 if (child == d->tg) {
8929 period = d->rt_period;
8930 runtime = d->rt_runtime;
8931 }
8932
8933 sum += to_ratio(period, runtime);
8934 }
8935
8936 if (sum > total)
8937 return -EINVAL;
8938
8939 return 0;
8940}
8941
8942static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8943{
8944 struct rt_schedulable_data data = {
8945 .tg = tg,
8946 .rt_period = period,
8947 .rt_runtime = runtime,
8948 };
8949
8950 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8951}
8952
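/*
 * Validate and install a new (rt_period, rt_runtime) pair for @tg, then
 * propagate the runtime to each of the group's per-cpu rt runqueues.
 */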
8953static int tg_set_bandwidth(struct task_group *tg,
8954 u64 rt_period, u64 rt_runtime)
8955{
8956 int i, err = 0;
8957
8958 mutex_lock(&rt_constraints_mutex);
8959 read_lock(&tasklist_lock);
8960 err = __rt_schedulable(tg, rt_period, rt_runtime);
8961 if (err)
8962 goto unlock;
8963
8964 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8965 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8966 tg->rt_bandwidth.rt_runtime = rt_runtime;
8967
8968 for_each_possible_cpu(i) {
8969 struct rt_rq *rt_rq = tg->rt_rq[i];
8970
8971 spin_lock(&rt_rq->rt_runtime_lock);
8972 rt_rq->rt_runtime = rt_runtime;
8973 spin_unlock(&rt_rq->rt_runtime_lock);
8974 }
8975 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8976 unlock:
8977 read_unlock(&tasklist_lock);
8978 mutex_unlock(&rt_constraints_mutex);
8979
8980 return err;
8981}
8982
8983int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8984{
8985 u64 rt_runtime, rt_period;
8986
8987 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8988 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8989 if (rt_runtime_us < 0)
8990 rt_runtime = RUNTIME_INF;
8991
8992 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8993}
8994
8995long sched_group_rt_runtime(struct task_group *tg)
8996{
8997 u64 rt_runtime_us;
8998
8999 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
9000 return -1;
9001
9002 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
9003 do_div(rt_runtime_us, NSEC_PER_USEC);
9004 return rt_runtime_us;
9005}
9006
9007int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
9008{
9009 u64 rt_runtime, rt_period;
9010
9011 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
9012 rt_runtime = tg->rt_bandwidth.rt_runtime;
9013
9014 if (rt_period == 0)
9015 return -EINVAL;
9016
9017 return tg_set_bandwidth(tg, rt_period, rt_runtime);
9018}
9019
9020long sched_group_rt_period(struct task_group *tg)
9021{
9022 u64 rt_period_us;
9023
9024 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
9025 do_div(rt_period_us, NSEC_PER_USEC);
9026 return rt_period_us;
9027}
9028
9029static int sched_rt_global_constraints(void)
9030{
9031 u64 runtime, period;
9032 int ret = 0;
9033
9034 if (sysctl_sched_rt_period <= 0)
9035 return -EINVAL;
9036
9037 runtime = global_rt_runtime();
9038 period = global_rt_period();
9039
	/*
	 * Sanity check on the sysctl variables.
	 */
9043 if (runtime > period && runtime != RUNTIME_INF)
9044 return -EINVAL;
9045
9046 mutex_lock(&rt_constraints_mutex);
9047 read_lock(&tasklist_lock);
9048 ret = __rt_schedulable(NULL, 0, 0);
9049 read_unlock(&tasklist_lock);
9050 mutex_unlock(&rt_constraints_mutex);
9051
9052 return ret;
9053}
9054#else
9055static int sched_rt_global_constraints(void)
9056{
9057 unsigned long flags;
9058 int i;
9059
9060 if (sysctl_sched_rt_period <= 0)
9061 return -EINVAL;
9062
9063 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9064 for_each_possible_cpu(i) {
9065 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
9066
9067 spin_lock(&rt_rq->rt_runtime_lock);
9068 rt_rq->rt_runtime = global_rt_runtime();
9069 spin_unlock(&rt_rq->rt_runtime_lock);
9070 }
9071 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
9072
9073 return 0;
9074}
9075#endif
9076
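/*
 * sysctl handler for the global sched_rt_period_us/sched_rt_runtime_us knobs:
 * accept a write only if the resulting constraints remain schedulable,
 * otherwise restore the previous values.
 */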
9077int sched_rt_handler(struct ctl_table *table, int write,
9078 struct file *filp, void __user *buffer, size_t *lenp,
9079 loff_t *ppos)
9080{
9081 int ret;
9082 int old_period, old_runtime;
9083 static DEFINE_MUTEX(mutex);
9084
9085 mutex_lock(&mutex);
9086 old_period = sysctl_sched_rt_period;
9087 old_runtime = sysctl_sched_rt_runtime;
9088
9089 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
9090
9091 if (!ret && write) {
9092 ret = sched_rt_global_constraints();
9093 if (ret) {
9094 sysctl_sched_rt_period = old_period;
9095 sysctl_sched_rt_runtime = old_runtime;
9096 } else {
9097 def_rt_bandwidth.rt_runtime = global_rt_runtime();
9098 def_rt_bandwidth.rt_period =
9099 ns_to_ktime(global_rt_period());
9100 }
9101 }
9102 mutex_unlock(&mutex);
9103
9104 return ret;
9105}
9106
9107#ifdef CONFIG_CGROUP_SCHED

/* return corresponding task_group object of a cgroup */
9110static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
9111{
9112 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
9113 struct task_group, css);
9114}
9115
9116static struct cgroup_subsys_state *
9117cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
9118{
9119 struct task_group *tg, *parent;
9120
9121 if (!cgrp->parent) {
		/* This is early initialization for the top cgroup */
9123 return &init_task_group.css;
9124 }
9125
9126 parent = cgroup_tg(cgrp->parent);
9127 tg = sched_create_group(parent);
9128 if (IS_ERR(tg))
9129 return ERR_PTR(-ENOMEM);
9130
9131 return &tg->css;
9132}
9133
9134static void
9135cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9136{
9137 struct task_group *tg = cgroup_tg(cgrp);
9138
9139 sched_destroy_group(tg);
9140}
9141
9142static int
9143cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9144 struct task_struct *tsk)
9145{
9146#ifdef CONFIG_RT_GROUP_SCHED
	/* Don't accept realtime tasks when there is no way for them to run */
9148 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
9149 return -EINVAL;
9150#else
	/* We don't support RT-tasks being in separate groups */
9152 if (tsk->sched_class != &fair_sched_class)
9153 return -EINVAL;
9154#endif
9155
9156 return 0;
9157}
9158
9159static void
9160cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9161 struct cgroup *old_cont, struct task_struct *tsk)
9162{
9163 sched_move_task(tsk);
9164}
9165
9166#ifdef CONFIG_FAIR_GROUP_SCHED
9167static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9168 u64 shareval)
9169{
9170 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
9171}
9172
9173static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9174{
9175 struct task_group *tg = cgroup_tg(cgrp);
9176
9177 return (u64) tg->shares;
9178}
9179#endif
9180
9181#ifdef CONFIG_RT_GROUP_SCHED
9182static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9183 s64 val)
9184{
9185 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9186}
9187
9188static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
9189{
9190 return sched_group_rt_runtime(cgroup_tg(cgrp));
9191}
9192
9193static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
9194 u64 rt_period_us)
9195{
9196 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
9197}
9198
9199static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9200{
9201 return sched_group_rt_period(cgroup_tg(cgrp));
9202}
9203#endif
9204
9205static struct cftype cpu_files[] = {
9206#ifdef CONFIG_FAIR_GROUP_SCHED
9207 {
9208 .name = "shares",
9209 .read_u64 = cpu_shares_read_u64,
9210 .write_u64 = cpu_shares_write_u64,
9211 },
9212#endif
9213#ifdef CONFIG_RT_GROUP_SCHED
9214 {
9215 .name = "rt_runtime_us",
9216 .read_s64 = cpu_rt_runtime_read,
9217 .write_s64 = cpu_rt_runtime_write,
9218 },
9219 {
9220 .name = "rt_period_us",
9221 .read_u64 = cpu_rt_period_read_uint,
9222 .write_u64 = cpu_rt_period_write_uint,
9223 },
9224#endif
9225};
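
/*
 * For example, assuming the cgroup filesystem is mounted at /cgroup (a
 * hypothetical mount point), each cpu-controller group directory exposes
 * the files defined above, prefixed with the subsystem name:
 *
 *	echo 512     > /cgroup/mygroup/cpu.shares
 *	echo 950000  > /cgroup/mygroup/cpu.rt_runtime_us
 *	echo 1000000 > /cgroup/mygroup/cpu.rt_period_us
 */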
9226
9227static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
9228{
9229 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
9230}
9231
9232struct cgroup_subsys cpu_cgroup_subsys = {
9233 .name = "cpu",
9234 .create = cpu_cgroup_create,
9235 .destroy = cpu_cgroup_destroy,
9236 .can_attach = cpu_cgroup_can_attach,
9237 .attach = cpu_cgroup_attach,
9238 .populate = cpu_cgroup_populate,
9239 .subsys_id = cpu_cgroup_subsys_id,
9240 .early_init = 1,
9241};
9242
9243#endif
9244
9245#ifdef CONFIG_CGROUP_CPUACCT

/*
 * CPU accounting code for task groups.
 *
 * Each cgroup in the "cpuacct" hierarchy gets a cpuacct structure that
 * accumulates, per cpu, the time its tasks spend on the CPU.
 */

/* track cpu usage of a group of tasks */
9255struct cpuacct {
9256 struct cgroup_subsys_state css;
9257
9258 u64 *cpuusage;
9259};
9260
9261struct cgroup_subsys cpuacct_subsys;

/* return cpu accounting group corresponding to this container */
9264static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9265{
9266 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9267 struct cpuacct, css);
9268}
9269

/* return cpu accounting group to which this task belongs */
9271static inline struct cpuacct *task_ca(struct task_struct *tsk)
9272{
9273 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9274 struct cpuacct, css);
9275}
9276

/* create a new cpu accounting group */
9278static struct cgroup_subsys_state *cpuacct_create(
9279 struct cgroup_subsys *ss, struct cgroup *cgrp)
9280{
9281 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9282
9283 if (!ca)
9284 return ERR_PTR(-ENOMEM);
9285
9286 ca->cpuusage = alloc_percpu(u64);
9287 if (!ca->cpuusage) {
9288 kfree(ca);
9289 return ERR_PTR(-ENOMEM);
9290 }
9291
9292 return &ca->css;
9293}
9294

/* destroy an existing cpu accounting group */
9296static void
9297cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9298{
9299 struct cpuacct *ca = cgroup_ca(cgrp);
9300
9301 free_percpu(ca->cpuusage);
9302 kfree(ca);
9303}
9304

/* return total cpu usage (in nanoseconds) of a group */
9306static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9307{
9308 struct cpuacct *ca = cgroup_ca(cgrp);
9309 u64 totalcpuusage = 0;
9310 int i;
9311
9312 for_each_possible_cpu(i) {
9313 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9314
		/*
		 * Take rq->lock to make the 64-bit addition safe on
		 * 32-bit architectures.
		 */
9319 spin_lock_irq(&cpu_rq(i)->lock);
9320 totalcpuusage += *cpuusage;
9321 spin_unlock_irq(&cpu_rq(i)->lock);
9322 }
9323
9324 return totalcpuusage;
9325}
9326
9327static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9328 u64 reset)
9329{
9330 struct cpuacct *ca = cgroup_ca(cgrp);
9331 int err = 0;
9332 int i;
9333
9334 if (reset) {
9335 err = -EINVAL;
9336 goto out;
9337 }
9338
9339 for_each_possible_cpu(i) {
9340 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9341
9342 spin_lock_irq(&cpu_rq(i)->lock);
9343 *cpuusage = 0;
9344 spin_unlock_irq(&cpu_rq(i)->lock);
9345 }
9346out:
9347 return err;
9348}
9349
9350static struct cftype files[] = {
9351 {
9352 .name = "usage",
9353 .read_u64 = cpuusage_read,
9354 .write_u64 = cpuusage_write,
9355 },
9356};
9357
9358static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9359{
9360 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
9361}
9362
/*
 * charge this task's execution time to its accounting group.
 *
 * called with rq->lock held.
 */
9368static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9369{
9370 struct cpuacct *ca;
9371
9372 if (!cpuacct_subsys.active)
9373 return;
9374
9375 ca = task_ca(tsk);
9376 if (ca) {
9377 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
9378
9379 *cpuusage += cputime;
9380 }
9381}
9382
9383struct cgroup_subsys cpuacct_subsys = {
9384 .name = "cpuacct",
9385 .create = cpuacct_create,
9386 .destroy = cpuacct_destroy,
9387 .populate = cpuacct_populate,
9388 .subsys_id = cpuacct_subsys_id,
9389};
9390#endif