by Mike Rieker
Introduction
This article considers a multiprocessor system.
Suppose we have a list of threads waiting in a queue.
We need to figure out how to make sure that each waiting thread gets started by only one processor.
If you are not writing a multiprocessor OS, this article is not for you.
Getting along without them
So let's take the example started above. We have a
list of threads that are ready to be executed, and we want to be sure
that only one cpu will start executing any given thread. Here's the idea:
* lock all other cpu's out of the list of threads waiting for a cpu
* unlink the first one that's waiting, if any
* if we didn't find a thread, repeat the whole thing
* else, jump to the thread
Now how do we do that 'lock all other cpu's...'?
If we just:
static int threads_waiting_to_run_lock = 0;
static Thread *threads_waiting_to_run = NULL;
static Thread **threads_waiting_to_run_end = &threads_waiting_to_run;

Thread *find_a_thread_to_run (void)
{
  Thread *thread_to_start;

  do {
    while (threads_waiting_to_run_lock) {}   // repeat while some other cpu is using the queue
    threads_waiting_to_run_lock = 1;         // tell other cpu's the queue is being used
    thread_to_start = threads_waiting_to_run;
    if (thread_to_start != NULL) {
      threads_waiting_to_run = thread_to_start -> next_thread;
      if (threads_waiting_to_run == NULL) threads_waiting_to_run_end = &threads_waiting_to_run;
    }
    threads_waiting_to_run_lock = 0;         // tell other cpu's the queue is no longer busy
  } while (thread_to_start == NULL);         // repeat if I didn't get anything to do
  return (thread_to_start);
}
That won't work, because if two cpu's simultaneously hit the while
statement, they will both see threads_waiting_to_run_lock as being zero, and then they both will
set it to one. And they both will dequeue the same top thread, which is
no good.
No simple C statements will solve this problem.
What makes it work
So, for Intel processors, there is the 'lock'
instruction prefix. It says that, while the following instruction
executes, this cpu shall be the only one that accesses that memory location.
So we can make a routine like this:
int test_and_set (int new_value, int *lock_pointer);
        .globl  test_and_set
test_and_set:
        movl    4(%esp),%eax    # get new_value into %eax
        movl    8(%esp),%edx    # get lock_pointer into %edx
        lock                    # next instruction is locked
        xchgl   %eax,(%edx)     # swap %eax with what is stored in (%edx)
                                # ... and don't let any other cpu touch that
                                # ... memory location while you're swapping
        ret                     # return the old value that's in %eax
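If your compiler provides GCC-style atomic builtins (an assumption about your toolchain, not something the original routine requires), the same test_and_set can also be written directly in C; on x86 the builtin compiles down to a locked xchg:

// a sketch of an alternative test_and_set, assuming a GCC-style compiler
// that provides the __sync_lock_test_and_set builtin
int test_and_set (int new_value, int *lock_pointer)
{
  return __sync_lock_test_and_set (lock_pointer, new_value);  // atomically swap, return the old value
}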
Now we can correctly make our routine:
Thread *find_a_thread_to_run (void)
{
  Thread *thread_to_start;

  do {
    while (test_and_set (1, &threads_waiting_to_run_lock)) {}  // repeat while some other cpu is using the queue
                                                                // ... then tell other cpu's the queue is being used
    thread_to_start = threads_waiting_to_run;
    if (thread_to_start != NULL) {
      threads_waiting_to_run = thread_to_start -> next_thread;
      if (threads_waiting_to_run == NULL) threads_waiting_to_run_end = &threads_waiting_to_run;
    }
    threads_waiting_to_run_lock = 0;          // tell other cpu's the queue is no longer busy
  } while (thread_to_start == NULL);          // repeat if I didn't get anything to do
  return (thread_to_start);
}
So the 'while' loop can be said to be 'spinning' on that test_and_set call while some other cpu is using the lock.
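Many kernels wrap that spin in a pair of little helper routines. Here is a minimal sketch of such wrappers, built on the test_and_set routine above and assuming the lock is a plain int like threads_waiting_to_run_lock (the acquire_spinlock/release_spinlock names used later in this article refer to the author's own routines; this particular implementation is just an illustration):

// a minimal sketch of spinlock wrappers built on test_and_set,
// assuming the lock is a plain int (0 = unlocked, 1 = locked)
void acquire_spinlock (int *lock_pointer)
{
  while (test_and_set (1, lock_pointer)) {}   // spin until we swap in a 1 and the old value was 0
}

void release_spinlock (int *lock_pointer)
{
  *lock_pointer = 0;                          // let some other cpu's test_and_set succeed
}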
Dealing with interrupts
Now for the complications...
You may ask yourself, isn't this thing going to sit there forever
in that do..while loop if there is nothing in the queue? What puts
things in the queue?
Suppose some device generates a hardware interrupt while we're in the loop.
The interrupt routine checks the disk or keyboard status, does its
thing, and realizes it is time to wake up a thread that is waiting for
the I/O. This is where it puts the thread in the
'threads_waiting_to_run' queue.
Now it can't just:
void wake_thread (Thread *thread_to_wake)
{
  thread_to_wake -> next_thread = NULL;
  *threads_waiting_to_run_end = thread_to_wake;
  threads_waiting_to_run_end = &(thread_to_wake -> next_thread);
}
... because the interrupt may have happened between the second and third lines of:
thread_to_start = threads_waiting_to_run;
if (thread_to_start != NULL) {
  threads_waiting_to_run = thread_to_start -> next_thread;
  if (threads_waiting_to_run == NULL) threads_waiting_to_run_end = &threads_waiting_to_run;
}
So you in essence have the following happening:
thread_to_start = threads_waiting_to_run;
if (thread_to_start != NULL) {
  --> interrupt
      thread_to_wake -> next_thread = NULL;
      *threads_waiting_to_run_end = thread_to_wake;
      threads_waiting_to_run_end = &(thread_to_wake -> next_thread);
  <-- return from interrupt
  threads_waiting_to_run = thread_to_start -> next_thread;
  if (threads_waiting_to_run == NULL) threads_waiting_to_run_end = &threads_waiting_to_run;
}
... and we end up possibly losing our 'thread_to_wake'.
To solve this, we just inhibit hardware interrupt delivery as well as lock our spinlock:
Thread *find_a_thread_to_run (void)
{
  Thread *thread_to_start;

  do {
    inhib_hw_int_delivery ();                                   // make sure interrupt routine doesn't interfere
    while (test_and_set (1, &threads_waiting_to_run_lock)) {}   // repeat while some other cpu is using the queue
                                                                // ... then tell other cpu's the queue is being used
    thread_to_start = threads_waiting_to_run;
    if (thread_to_start != NULL) {
      threads_waiting_to_run = thread_to_start -> next_thread;
      if (threads_waiting_to_run == NULL) threads_waiting_to_run_end = &threads_waiting_to_run;
    }
    threads_waiting_to_run_lock = 0;                            // tell other cpu's the queue is no longer busy
    enable_hw_int_delivery ();                                  // allow interrupts to happen now
  } while (thread_to_start == NULL);                            // repeat if I didn't get anything to do
  return (thread_to_start);
}
Interrupts on a multiprocessor
Now we're almost done. If we are on a
multiprocessor, one of the cpu's might be executing the do..while that
is waiting for something to work on, and another cpu might be doing the
interrupt processing. So we end up with our bad situation again, where
the enqueue and dequeue statements can execute simultaneously. Easy to fix: we
just put our spinlock around the statements in our interrupt routine as
well:
void wake_thread (Thread *thread_to_wake)
{
  int hwi;

  thread_to_wake -> next_thread = NULL;
  hwi = inhib_hw_int_delivery ();                              // make sure interrupt delivery is inhibited
  while (test_and_set (1, &threads_waiting_to_run_lock)) {}    // repeat while some other cpu is using the queue
                                                               // ... then tell other cpu's the queue is being used
  *threads_waiting_to_run_end = thread_to_wake;
  threads_waiting_to_run_end = &(thread_to_wake -> next_thread);
  threads_waiting_to_run_lock = 0;                             // tell other cpu's the queue is no longer busy
  if (hwi) enable_hw_int_delivery ();                          // restore interrupt delivery state
}
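The inhib_hw_int_delivery and enable_hw_int_delivery routines aren't shown in this article. As an assumption about what they might look like on a 32-bit x86 with GCC-style inline assembly, here is a minimal sketch; the return value tells the caller whether delivery was enabled beforehand, which is what the 'hwi' test above relies on:

// a minimal sketch, assuming 32-bit x86 and GCC-style inline assembly;
// returns nonzero if interrupt delivery was enabled before the call
int inhib_hw_int_delivery (void)
{
  int eflags;

  asm volatile ("pushfl; popl %0; cli" : "=r" (eflags) : : "memory");
  return (eflags & 0x200);      // IF is bit 9 of EFLAGS
}

void enable_hw_int_delivery (void)
{
  asm volatile ("sti" : : : "memory");
}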
Rules to live by
It seems very complicated from all that. Well, if you follow a couple of simple rules, it isn't!
1) When modifying a variable that can possibly be modified by more
than one cpu at a time, you need to protect it with a spinlock.
You must *always* hold that spinlock when accessing the variable.
2) If the variable can also be modified by an interrupt routine,
you must disable (at least that) interrupt while the spinlock is being
held.
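As a small illustration of both rules together, consider a hypothetical counter that is bumped both by normal kernel code and by a timer interrupt routine (the names here are made up for the example, and the test_and_set routine from above is assumed):

// a minimal sketch of the two rules; tick_count is touched by normal
// kernel code and by a timer interrupt routine
static int tick_count = 0;
static int tick_count_lock = 0;

void add_ticks (int n)                              // called from normal kernel code
{
  inhib_hw_int_delivery ();                         // rule 2: keep the interrupt routine out
  while (test_and_set (1, &tick_count_lock)) {}     // rule 1: keep other cpu's out
  tick_count += n;
  tick_count_lock = 0;
  enable_hw_int_delivery ();
}

void timer_interrupt_routine (void)                 // interrupt delivery is already inhibited here
{
  while (test_and_set (1, &tick_count_lock)) {}     // rule 1 still applies on a multiprocessor
  tick_count ++;
  tick_count_lock = 0;
}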
Uniprocessors
So now, you uniprocessor people wonder why all this is necessary. For a uniprocessor, you have:
Thread *find_a_thread_to_run (void)
{
  Thread *thread_to_start;

  do {
    inhib_hw_int_delivery ();           // make sure interrupt routine doesn't interfere
    thread_to_start = threads_waiting_to_run;
    if (thread_to_start != NULL) threads_waiting_to_run = thread_to_start -> next_thread;
    enable_hw_int_delivery ();          // allow interrupts to happen now
  } while (thread_to_start == NULL);    // repeat if I didn't get anything to do
  return (thread_to_start);
}
void wake_thread (Thread *thread_to_wake)
{
  int hwi;

  thread_to_wake -> next_thread = NULL;
  hwi = inhib_hw_int_delivery ();       // make sure interrupt delivery is inhibited
  *threads_waiting_to_run_end = thread_to_wake;
  threads_waiting_to_run_end = &(thread_to_wake -> next_thread);
  if (hwi) enable_hw_int_delivery ();   // restore interrupt delivery state
}
So you can see that it takes all of 2 extra lines per routine to make it multiprocessor ready!
Part 2, which follows, talks about some more practical stuff.
Simultaneous spinlocks
Now after you get into this stuff, you may find
that you have to have more than one spinlock at a time. This can lead
to problems. Consider this:
Routine A:
  lock spinlock X
  maybe do some work
  lock spinlock Y
  do some more work
  unlock spinlock Y
  unlock spinlock X

And Routine B:
  lock spinlock Y
  maybe do some work
  lock spinlock X
  do some more work
  unlock spinlock X
  unlock spinlock Y
So CPU #1 comes along and starts routine A, while CPU #2 starts
routine B. Well, they'll never finish, because CPU #1 will be waiting
for CPU #2 to release spinlock Y and CPU #2 will be waiting for CPU #1
to release spinlock X. So we have another simple rule:
* When locking more than one spinlock, they must always be locked in the same order.
So for our example, both routines need to be changed so that they
either both lock X then Y or they both lock Y then X. You may be
thinking, 'I might not be able to do that in ALL cases!' Easy to fix:
replace the two spinlocks with one, say Z.
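As a sketch of the fix (using the hypothetical acquire_spinlock/release_spinlock wrappers from earlier and plain int locks), both routines take X first and Y second, so neither can end up holding one lock while waiting forever for the other:

// a minimal sketch: both routines lock X then Y, never Y then X
static int spinlock_X = 0;
static int spinlock_Y = 0;

void routine_A (void)
{
  acquire_spinlock (&spinlock_X);
  // ... maybe do some work ...
  acquire_spinlock (&spinlock_Y);
  // ... do some more work ...
  release_spinlock (&spinlock_Y);
  release_spinlock (&spinlock_X);
}

void routine_B (void)
{
  acquire_spinlock (&spinlock_X);   // was Y first - now X first, same order as routine_A
  acquire_spinlock (&spinlock_Y);
  // ... do some more work ...
  release_spinlock (&spinlock_Y);
  release_spinlock (&spinlock_X);
}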
Now I am terrible at checking to make sure I do everything in
order. Computers are good at that sort of thing. So rather than just
using an 'int' for my spinlocks, I use a struct as follows:
typedef struct {
  char spinlock_flag;             // the usual 0=unlocked, 1=locked
  unsigned char spinlock_level;   // 'locking' order
} Spinlock;
Then I have a spinlock_wait routine that checks to make sure my new
spinlock's level is greater than the level of the last spinlock that was locked by
this cpu. If I try to do it out of order, the routine panics/BSODs on
the spot.
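Here is a minimal sketch of what such a spinlock_wait could look like. The 'last level locked' variable (really per-cpu), the panic routine, and the use of a GCC-style __sync_lock_test_and_set builtin (so the char flag can be swapped atomically) are all assumptions made for the illustration:

// a minimal sketch of a level-checked spinlock_wait; last_level_locked would
// really be a per-cpu variable, and panic() is a hypothetical fatal-error routine
static unsigned char last_level_locked = 0;

void spinlock_wait (Spinlock *spinlock)
{
  if (spinlock -> spinlock_level <= last_level_locked) {
    panic ("spinlock_wait: spinlocks locked out of order");
  }
  while (__sync_lock_test_and_set (&(spinlock -> spinlock_flag), 1)) {}  // the usual spin
  last_level_locked = spinlock -> spinlock_level;
}

The matching release routine would clear spinlock_flag and restore last_level_locked to whatever it was before the lock was taken.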
Interrupts
Another question you may wonder about: must I always inhibit hardware interrupt delivery while I hold a spinlock?
No. Only if the data being worked with can also be accessed by an
interrupt routine. And then, you only have to disable those interrupts
that have access to it.
For example, I don't allow my interrupt routines to do malloc's. So
the malloc routine can run with interrupts enabled, even when it has
the spinlock. Same with pagetables, disk cache pages, and other stuff
like that.
You must, however, block any pre-emptive scheduling while you have
any spinlock. Or at least, you will make things *very* complicated if
you don't.
So I ended up with several 'classes' of spinlocks:
* low priority - these run with all hardware interrupt delivery enabled
* interrupt level - there is one per IRQ level. IRQ 0 is the highest, IRQ 7 is the lowest. When one of these is set on a cpu, it inhibits that IRQ and any lower-priority interrupts
* high priority - these run with all hardware interrupt delivery inhibited
So if a cpu has one of the low priority spinlocks, it is running
with hardware interrupt delivery enabled (but the pre-emptive scheduler
is disabled).
There is one spinlock per interrupt level. For example, the
keyboard driver uses IRQ 1. So whenever the keyboard interrupt routine
is active, I have the cpu take the IRQ 1 spinlock. Whenever the main
keyboard driver routine is accessing data that the interrupt routine
would access, I take the IRQ 1 spinlock. So it in essence acts as an
'inhibit interrupt delivery' mechanism that works across cpu's in an
SMP system.
The high-priority spinlocks block all interrupt delivery on the
cpu. These are used for routines (like wake_thread) that need to be
callable from any interrupt level.
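One way to picture the classes is as ranges of the spinlock_level field from the Spinlock struct above; the particular numbers below are assumptions made for illustration, not the author's actual values:

// a sketch of level numbers for the three classes; since spinlock_wait only
// allows taking a higher-numbered lock, low-priority locks must be taken first
#define SPINLOCK_LEVEL_LOW_PRIO   0x10            // taken with interrupt delivery still enabled
#define SPINLOCK_LEVEL_IRQ(irq)   (0x80 - (irq))  // one per IRQ level; IRQ 0 ends up highest
#define SPINLOCK_LEVEL_HIGH_PRIO  0xF0            // taken with all interrupt delivery inhibited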
Performance
This is a big thing about spinlocks. It's easy to make your performance go to pot with poor use of spinlocks.
Consider that you can *theoretically* use just *one* spinlock,
period. Whenever you enter kernel mode, inhibit all interrupt delivery
and set your one-and-only spinlock. When you leave kernel mode, release
the spinlock then enable interrupt delivery. Great!
Well, you can imagine then, that what you end up with is having
just one cpu doing anything useful at a time in kernel mode, while the
other(s) just grind away spinning. Really bad!
So what you do is try to come up with as many spinlocks as you can.
Above I mentioned that you must combine X and Y if you have a case
where you must set X then Y and another case where you must set Y then
X. So now you have two limits to work with:
* Too few spinlocks and you have poor performance
* Too many spinlocks and you violate the ordering
Here's where the creativity comes in, working with those two limits
to maximize performance. Strive to eliminate as much spinning as
possible.
How does one measure spinlock performance? Make an array of counters like this:
static int spin_counts[256];
Then increment spin_counts[spinlock_level] each time the
'test_and_set' fails. After a while of running stuff, find which
spinlock_level has big counts (compared to the others). Try to break
that spinlock down into smaller ones if you can figure out how to.
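A minimal sketch of how the counting could be wired into the level-checked lock from earlier (the spin_counts array is the one declared above; the __sync_lock_test_and_set builtin is again a toolchain assumption):

// a sketch of counting contention: bump the counter for this lock's level
// every time the atomic swap fails because some other cpu holds the lock
void spinlock_wait_counted (Spinlock *spinlock)
{
  while (__sync_lock_test_and_set (&(spinlock -> spinlock_flag), 1)) {
    spin_counts[spinlock -> spinlock_level] ++;
  }
}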
Summary
Spinlocks may seem sort of nebulous to begin with. But
after you work with them, applying them becomes very methodical. I hope
this little tutorial has helped clear up some issues and given you
confidence.
Next
In some cases, there are simple and effective alternatives to spinlocks.
Increment/Decrement
Consider the case:
void dec_thread_refcount (Thread *thread)
{
  int new_refcount;

  acquire_spinlock (&(thread -> spinlock));   // lock access to thread -> refcount
  new_refcount = -- (thread -> refcount);     // decrement it, save new value
  release_spinlock (&(thread -> spinlock));   // unlock
  if (new_refcount == 0) kfree (thread);      // if no one is using it anymore, free it
}
That seems like a lot of work, acquiring and releasing the spinlock just
to decrement the refcount. Well, remember the 'lock' instruction prefix
we used to make the test_and_set routine? We can be a little fancier
about it and make a 'dec_and_testz' routine:
int locked_dec_and_testz (int *refcount); // decrement *refcount, return whether or not it went zero
        .globl  locked_dec_and_testz
locked_dec_and_testz:
        xorl    %eax,%eax       # clear all of %eax
        movl    4(%esp),%edx    # get pointer to refcount
        lock                    # keep other cpu's out of next instruction
        decl    (%edx)          # decrement refcount, keeping other cpu's out of refcount
        setz    %al             # set %eax=1 if result is zero, set %eax=0 otherwise
        ret                     # return %eax as the result
So now we can write:
void dec_thread_refcount (Thread *thread)
{
  if (locked_dec_and_testz (&(thread -> refcount))) {
    kfree (thread);
  }
}
Now this has a little gotcha. 'refcount' must now be thought of as
being a variable not protected by a spinlock, but by 'lock instruction
prefix' access only! So any modifications to refcount must be done with
the lock prefix. So we can no longer:
acquire_spinlock (&(thread -> spinlock));
(thread -> refcount) ++;
release_spinlock (&(thread -> spinlock));
... because the locked_dec_and_testz routine might be in progress
on another cpu, and our 'spinlock' won't do anything to stop it! So we
have to supply a 'locked_inc' routine:
void locked_inc (int *refcount);
        .globl  locked_inc
locked_inc:
        movl    4(%esp),%edx    # get pointer to refcount
        lock                    # keep other cpu's out of next instruction
        incl    (%edx)          # increment refcount
        ret
But we still come out ahead, because there is no possibility of
doing any spinning in any of these routines. (The actual spinning is
done at the CPU bus level, which is very very fast).
Atomic arithmetic
We can generalize this increment/decrement approach to
any single arithmetic operation on a single variable. It is possible to
write routines that do add, subtract, or, and, xor, etc., and have the
routine return the previous value.
For example, this routine or's 'new_bits' into *value, and returns the previous contents of value:
int atomic_or_rtnold (int new_bits, int *value);
        .globl  atomic_or_rtnold
atomic_or_rtnold:
        movl    8(%esp),%edx    # point to value
        movl    (%edx),%eax     # sample the current contents of value
atomic_or_loop:
        movl    4(%esp),%ecx    # get the new_bits
        orl     %eax,%ecx       # or them together
        lock                    # bus lock the next instruction
        cmpxchgl %ecx,(%edx)    # if 'value' hasn't changed, store our new value there
        jne     atomic_or_loop  # repeat if 'value' has changed
        ret                     # return with old (sampled) contents of value
Now you notice this does have a little 'spin' in it - it has a loop
back. But the loop is so short that the chance of conflicting with
another modification of *value is very slim, so it is highly
unlikely that it will ever loop.
If we don't want the old value, we can do this:
void atomic_or (int new_bits, int *value);
        .globl  atomic_or
atomic_or:
        movl    4(%esp),%eax    # get the new_bits
        movl    8(%esp),%edx    # point to value
        lock                    # bus lock the next instruction
        orl     %eax,(%edx)     # update value
        ret
... and not have any loop in there at all.
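If the compiler provides GCC-style builtins (again an assumption about your toolchain), both flavors can also be expressed in C; __sync_fetch_and_or does the locked or and hands back the previous contents:

// a sketch of the two routines above using a GCC-style builtin
int atomic_or_rtnold (int new_bits, int *value)
{
  return __sync_fetch_and_or (value, new_bits);   // or the bits in, return the old contents
}

void atomic_or (int new_bits, int *value)
{
  __sync_fetch_and_or (value, new_bits);          // or the bits in, discard the old contents
}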
Atomic linked stack
You can get 'nasty' and implement a LIFO linked stack with atomic operations. Suppose we have:
typedef struct Block Block;
struct Block {
  Block *next_block;
  ...
};

static Block *free_blocks = NULL;

void free_a_block (Block *block_to_free)
{
  Block *old_top_free_block;

  do {
    old_top_free_block = free_blocks;                   // sample the current top of the stack
    block_to_free -> next_block = old_top_free_block;   // link the freed block on top of it
  } while (!atomic_setif_ptr (&free_blocks, block_to_free, old_top_free_block));
}
atomic_setif_ptr says:

  if (free_blocks != old_top_free_block) return (0);
  else {
    free_blocks = block_to_free;
    return (1);
  }
        .globl  atomic_setif_ptr
atomic_setif_ptr:
        movl    4(%esp),%edx    # get pointer to free_blocks
        movl    8(%esp),%ecx    # get block_to_free
        movl    12(%esp),%eax   # get old_top_free_block
        lock                    # bus lock the next instruction
        cmpxchgl %ecx,(%edx)    # if free_blocks == old_top_free_block, then free_blocks = block_to_free
        jne     atomic_setif_failed
        movl    $1,%eax         # success, return 1
        ret
atomic_setif_failed:
        xorl    %eax,%eax       # failure, return 0
        ret
Now we can use that same routine to write the alloc routine:
Block *alloc_a_block (void)
{
  Block *sample_block;

  do {
    sample_block = free_blocks;         // sample the current top of the stack
    if (sample_block == NULL) break;    // nothing there to allocate
  } while (!atomic_setif_ptr (&free_blocks, sample_block -> next_block, sample_block));
  return (sample_block);
}
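For what it's worth, atomic_setif_ptr maps directly onto a compare-and-swap builtin on GCC-style compilers (an assumption about the toolchain; the assembly version above is the original):

// a sketch: compare-and-swap *free_blocks_ptr from old_top_free_block to block_to_free,
// returning 1 on success and 0 if some other cpu changed it first
int atomic_setif_ptr (Block **free_blocks_ptr, Block *block_to_free, Block *old_top_free_block)
{
  return __sync_bool_compare_and_swap (free_blocks_ptr, old_top_free_block, block_to_free);
}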
But again, the gotcha is that 'free_blocks' can only be modified
with atomic operations. So if you must also scan the list, you will
have to use a spinlock.
Summary
Simple spinlocked sequences can be replaced with
atomic operations and can result in performance improvements. Now you
have more fun stuff to play with!